# api/app/features/conversation/input_normalize.py

"""
聊天输入归一：供访谈 Agent / 编排层对 ASR 与键盘输入做可控预处理（规则 / 可选 LLM）。

不改变 segment 落库原文；仅作为模型侧派生净稿。
与 memoir 共用同一套确定性规则，避免聊天与回忆录对同一句理解割裂。
"""

from __future__ import annotations

import json
import re
from typing import Any

from app.core.config import settings
from app.core.langchain_llm import invoke_json_object
from app.core.logging import get_logger
from app.features.memoir.memoir_images.json_payload import extract_json_payload

# Module-level logger for this file.
logger = get_logger(__name__)

# Common spoken/ASR homophone slip: "没" mis-transcribed as "美" when it is
# directly followed by "看上" plus a pronoun. The lookahead restricts the match
# to that collocation so words such as "美容" or "选美" are not touched.
_MEI_KANSHANG_RE = re.compile(r"美(?=看上[我你他她它])")


def apply_conversation_input_rules(text: str) -> str:
    """Apply the deterministic rewrite rules to *text*.

    The rules are intentionally conservative and only cover high-frequency
    mis-heard / mis-typed patterns. The same rule set is shared with the
    memoir feature.

    Args:
        text: Raw user input; any falsy value is treated as an empty string.

    Returns:
        The input with every rule match replaced, or ``""`` for falsy input.
    """
    # Falsy input (None, "") short-circuits to an empty string.
    if not text:
        return ""
    return _MEI_KANSHANG_RE.sub("没", text)


def _llm_normalize_chat_input(text: str, llm: Any) -> str | None:
    """仅修正明显错字与同音字，不增事实；失败返回 None。"""
    if not llm or not (text or "").strip():
        return None
    max_in = int(settings.chat_input_normalize_llm_max_input_chars)
    t = (text or "").strip()
    if len(t) > max_in:
        logger.debug(
            "event=chat_input_normalize_llm_skip reason=input_too_long len={} max={}",
            len(t),
            max_in,
        )
        return None
    prompt = f"""你是口述转写纠错助手。只修正明显的同音错别字、别字与标点，使句子通顺可读。
禁止增加事实、不补充细节、不摘要、不改写句式风格；不得新增人名、地名、数字、事件。
若原文已通顺或无法确定错误，则照抄输入。

【用户口述】
{t}

**JSON 输出**：只输出一个合法 JSON 对象。
{{"normalized_text": "纠错后的完整文本（与输入等意，仅修错字与标点）"}}

只输出 JSON，不要其它文字。"""
    try:
        raw = invoke_json_object(
            llm,
            prompt,
            max_tokens=int(settings.chat_input_normalize_llm_max_tokens),
            agent="chat_input_normalize.llm",
        )
        data = json.loads(extract_json_payload(raw))
        if not isinstance(data, dict):
            return None
        out = (data.get("normalized_text") or "").strip()
        if not out:
            return None
        return out
    except Exception as e:
        logger.warning("chat_input_normalize LLM 失败，回退规则结果: {}", e)
        return None


def normalize_chat_input_for_agent(text: str, *, llm: Any | None = None) -> str:
    """Single chat-side entry point for input normalisation.

    Shared by the orchestration layer and InterviewAgent. Behaviour depends on
    ``settings.chat_input_normalize_enabled`` and
    ``settings.chat_input_normalize_mode`` (defaulting to ``"rules"`` when the
    mode is unset):

    - globally disabled, or mode ``off``: the input is returned unchanged;
    - mode ``llm``: rules first, then the LLM pass; when no ``llm`` is given
      or the LLM pass yields nothing, the rule result is kept;
    - any other mode (including ``rules``): rules only.

    Args:
        text: Raw user input; a falsy value is treated as ``""``.
        llm: Optional model handle used only in ``llm`` mode.

    Returns:
        The normalised text for the model side.
    """
    original = text or ""
    if not settings.chat_input_normalize_enabled:
        return original

    mode = (settings.chat_input_normalize_mode or "rules").strip().lower()
    if mode == "off":
        return original

    ruled = apply_conversation_input_rules(original)
    if mode == "llm":
        refined = _llm_normalize_chat_input(ruled, llm)
        if refined is not None:
            return refined
    # Rules-only modes, and the fallback when the LLM pass produced nothing.
    return ruled