2026-03-31 23:55:26 +08:00
|
|
|
|
"""
|
|
|
|
|
|
口述归一:在进入叙事与忠实度校验前,对同一段文本做可控预处理(规则 / 可选 LLM)。
|
|
|
|
|
|
|
|
|
|
|
|
不改变 segment 落库原文;仅作为 memoir story 生成路径的派生输入。
|
|
|
|
|
|
|
|
|
|
|
|
规则层与聊天侧共用 `apply_conversation_input_rules`(见 conversation.input_normalize)。
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
|
|
from typing import Any
|
|
|
|
|
|
|
|
|
|
|
|
from app.core.config import settings
|
2026-04-01 11:49:33 +08:00
|
|
|
|
from app.core.text_normalize import apply_oral_rules, llm_normalize_text
|
2026-03-31 23:55:26 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _llm_normalize_oral(text: str, llm: Any) -> str | None:
    """Run the LLM refinement pass of oral normalization on ``text``.

    Intended contract (carried out by ``llm_normalize_text``): only fix
    obvious typos and homophone mistakes without adding facts, and return
    ``None`` when the LLM pass fails.

    Args:
        text: Text to refine.
        llm: LLM handle forwarded unchanged to ``llm_normalize_text``.

    Returns:
        Whatever ``llm_normalize_text`` returns: the refined text, or
        ``None`` on failure.
    """
    # Both limits are read from settings and coerced to int on every call.
    input_char_limit = int(settings.memoir_oral_normalize_llm_max_input_chars)
    output_token_limit = int(settings.memoir_oral_normalize_llm_max_tokens)
    return llm_normalize_text(
        text,
        llm,
        max_input_chars=input_char_limit,
        max_tokens=output_token_limit,
        agent_name="oral_normalize.llm",
    )
|
2026-03-31 23:55:26 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def normalize_oral_for_memoir(text: str, *, llm: Any | None = None) -> str:
    """Return the normalized text used by the memoir story pipeline.

    This is the pipeline's single exit point for normalization: narrative
    generation and faithfulness checking are meant to use the same return
    value.

    Behavior by configuration:

    - Globally disabled, or mode ``"off"``: the input text (``""`` if falsy).
    - Mode ``"llm"``: rules first, then the LLM pass; if the LLM pass returns
      ``None``, the rules result is kept.
    - Any other mode (``"rules"`` is the default when the setting is empty):
      rules only.

    Args:
        text: Raw oral text; a falsy value is treated as an empty string.
        llm: Optional LLM handle, forwarded to the LLM pass in ``"llm"`` mode.

    Returns:
        The normalized text, or the unmodified input when normalization is off.
    """
    raw = text or ""

    if not settings.memoir_oral_normalize_enabled:
        return raw

    # An empty or unset mode falls back to "rules"; compare case-insensitively.
    configured_mode = settings.memoir_oral_normalize_mode or "rules"
    mode = configured_mode.strip().lower()
    if mode == "off":
        return raw

    rule_result = apply_oral_rules(raw)

    if mode == "llm":
        llm_result = _llm_normalize_oral(rule_result, llm)
        # A None result signals LLM failure; keep the rule-only output then.
        return rule_result if llm_result is None else llm_result

    return rule_result
|