Files
life-echo/api/app/core/text_normalize.py
Kevin 53d9e003af feat(api): 叙事 prompt、职业上下文、读路径章节、WS 解耦与错误脱敏
- 回忆录:事实边界补充允许清单;传记文体示例与 JSON 叙事要求对齐
- default 职业提示 occupation_context;cadre/military 退休语境
- GET 章节读路径零写入,prepare_chapter_read_view + markdown_for_response
- 文本归一抽到 core/text_normalize;移除弃用 reply 策略与 recompose_chapters_for_story
- ConversationService:WS 连接/用户段落/结束对话;对外错误固定文案
- 测试:HTTP 脱敏契约、章节读视图、occupation 与 background_voice
2026-04-01 11:55:52 +08:00

73 lines
2.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""口述/聊天输入的确定性规则与可选 LLM 纠错(供 conversation 与 memoir 共用)。"""
from __future__ import annotations
import json
import re
from typing import Any
from app.core.langchain_llm import invoke_json_object
from app.core.logging import get_logger
from app.features.memoir.memoir_images.json_payload import extract_json_payload
# Module-level logger, created through the project's logging helper and keyed
# by this module's dotted name.
logger = get_logger(__name__)
# Matches a single "美" only when it is immediately followed by "看上" plus one
# of the pronouns 我/你/他/她/它. The lookahead is zero-width, so the text after
# "美" is never consumed.
_MEI_KANSHANG_RE = re.compile(r"美(?=看上[我你他她它])")


def apply_oral_rules(text: str) -> str:
    """Apply the deterministic clean-up rules to dictated or typed text.

    The rules are deliberately conservative and only target high-frequency
    mis-heard / mis-typed patterns. At present there is a single rule: every
    "美" that directly precedes "看上" + pronoun is removed.

    Args:
        text: Raw user input. Any falsy value (``None``, ``""``) is treated as
            empty input.

    Returns:
        The cleaned text, or ``""`` when the input is falsy.
    """
    if not text:
        return ""
    # NOTE(review): the match is deleted rather than replaced with another
    # character -- confirm this is the intended correction.
    return _MEI_KANSHANG_RE.sub("", text)
def llm_normalize_text(
    text: str,
    llm: Any,
    *,
    max_input_chars: int,
    max_tokens: int,
    agent_name: str,
) -> str | None:
    """Ask an LLM to fix obvious typos and homophone errors in ``text``.

    The prompt instructs the model to correct only clear wrong characters and
    punctuation without adding facts, and to answer with a JSON object of the
    form ``{"normalized_text": "..."}``.

    Args:
        text: User utterance to correct; surrounding whitespace is stripped
            before it is measured and sent.
        llm: Model handle forwarded to ``invoke_json_object``. A falsy value
            disables the correction.
        max_input_chars: Upper bound on the stripped input length. Longer
            input is skipped (logged at debug level) instead of truncated.
        max_tokens: Forwarded to ``invoke_json_object`` as ``max_tokens``.
        agent_name: Forwarded to ``invoke_json_object`` as ``agent`` and
            included in the warning log on failure.

    Returns:
        The stripped ``normalized_text`` value from the model's JSON reply, or
        ``None`` when correction is skipped or fails: no ``llm``, blank
        input, input too long, reply not a JSON object, missing/empty
        ``normalized_text``, or any exception during the call or parsing.
    """
    # The model check comes first so that a missing LLM short-circuits before
    # the text is inspected.
    if not llm:
        return None
    stripped = (text or "").strip()
    if not stripped:
        return None

    input_len = len(stripped)
    if input_len > max_input_chars:
        logger.debug(
            "event=llm_text_normalize_skip reason=input_too_long len={} max={}",
            input_len,
            max_input_chars,
        )
        return None

    # The prompt text is runtime behavior; its lines intentionally start at
    # column 0 so that no indentation leaks into the string.
    prompt = f"""你是口述转写纠错助手。只修正明显的同音错别字、别字与标点,使句子通顺可读。
禁止增加事实、不补充细节、不摘要、不改写句式风格;不得新增人名、地名、数字、事件。
若原文已通顺或无法确定错误,则照抄输入。
【用户口述】
{stripped}
**JSON 输出**:只输出一个合法 JSON 对象。
{{"normalized_text": "纠错后的完整文本(与输入等意,仅修错字与标点)"}}
只输出 JSON不要其它文字。"""

    # Best-effort: any failure while calling the model or decoding its reply
    # is logged and reported to the caller as None.
    try:
        reply = invoke_json_object(
            llm,
            prompt,
            max_tokens=max_tokens,
            agent=agent_name,
        )
        payload = json.loads(extract_json_payload(reply))
        if isinstance(payload, dict):
            cleaned = (payload.get("normalized_text") or "").strip()
            # An empty result is treated the same as "no correction".
            return cleaned or None
        return None
    except Exception as exc:
        logger.warning("llm_normalize_text 失败 {}: {}", agent_name, exc)
        return None