diff --git a/api/app/agents/chat/background_voice.py b/api/app/agents/chat/background_voice.py index ffba063..01bc672 100644 --- a/api/app/agents/chat/background_voice.py +++ b/api/app/agents/chat/background_voice.py @@ -94,14 +94,16 @@ def get_background_voice_chat_block(voice: str | None) -> str: "## 背景语气:军队语境(仅语气,不编造事实)\n" "称呼得体、句子简洁利落、条理清楚;避免网络梗与油滑套话。\n" "先简短接住对方,再**最多一个**具体问题;不写命令式、不做思想政治表态。\n" - "涉及纪律、集体、任务等措辞,**仅当用户口述已出现相关事实时**自然呼应,禁止堆砌军事化辞藻或虚构经历。" + "涉及纪律、集体、任务等措辞,**仅当用户口述已出现相关事实时**自然呼应,禁止堆砌军事化辞藻或虚构经历。\n" + "用户已退役/转业,以回忆军旅岁月为基调,不要预设其仍在服役。" ) # cadre return ( "## 背景语气:干部/机关语境(仅语气,不编造事实)\n" "稳重、有分寸,敬语适度;句子可略完整,但仍控制总字数,避免官样文章与排比空话。\n" "先回应对方内容,再**最多一个**具体问题;不写公文套话、不做政治评价。\n" - "涉及职务与组织时,**不得编造**用户未提及的职级、单位与荣誉。" + "涉及职务与组织时,**不得编造**用户未提及的职级、单位与荣誉。\n" + "用户已退休,以回顾和怀念工作岁月为基调,不要预设其仍在岗。" ) @@ -114,10 +116,12 @@ def get_background_voice_narrative_block(voice: str | None) -> str: return ( "## 背景文体(军队,须遵守上文事实边界)\n" "叙事紧凑、层次清楚;若口述已出现纪律、集体、任务等语境,可适度用书面语呼应,**禁止**堆砌口号式军事辞藻或虚构军旅细节。\n" - "不新增军衔、单位番号、表彰等口述未出现的信息。" + "不新增军衔、单位番号、表彰等口述未出现的信息。\n" + "用户已退役/转业,以回忆军旅岁月为基调,不要预设其仍在服役。" ) return ( "## 背景文体(干部/机关,须遵守上文事实边界)\n" "段落层次清晰,用语庄重自然,避免口语碎词与段子感;**不得编造**职务、荣誉、单位名称与组织细节。\n" - "文采服务于真实内容,不写成公文或汇报腔。" + "文采服务于真实内容,不写成公文或汇报腔。\n" + "用户已退休,以回顾和怀念工作岁月为基调,不要预设其仍在岗。" ) diff --git a/api/app/agents/chat/interview_agent.py b/api/app/agents/chat/interview_agent.py index eab281e..0ecbb54 100644 --- a/api/app/agents/chat/interview_agent.py +++ b/api/app/agents/chat/interview_agent.py @@ -98,6 +98,7 @@ class InterviewAgent: memory_evidence_text: str = "", background_voice: str = "default", normalized_user_message: Optional[str] = None, + occupation: str = "", ) -> AgentChatTurn: """生成状态感知的访谈回复,不持久化(由 Orchestrator 负责)""" if not self.llm: @@ -145,6 +146,7 @@ class InterviewAgent: memory_evidence_text=memory_evidence_text, reply_length_mode=reply_plan.mode.value, background_voice=background_voice, + occupation=occupation, ) history_string = format_history_string(history_messages) full_prompt = f"{system_prompt}\n\n{history_string}\n\nHuman: {text_for_model}\n\nAssistant:" @@ -199,6 +201,7 @@ class InterviewAgent: memoir_state: MemoirStateSchema, user_profile_context: str = "", background_voice: str = "default", + occupation: str = "", ) -> List[str]: """生成空对话开场白,不持久化(由 Orchestrator 负责)""" if not self.llm: @@ -213,6 +216,7 @@ class InterviewAgent: user_profile_context=user_profile_context, persona=persona, background_voice=background_voice, + occupation=occupation, ) full_prompt = f"{prompt}\n\nAssistant:" log_agent_payload(logger, "InterviewAgent.opening.prompt", full_prompt) diff --git a/api/app/agents/chat/interview_reply_length.py b/api/app/agents/chat/interview_reply_length.py index 27ba283..ed196d8 100644 --- a/api/app/agents/chat/interview_reply_length.py +++ b/api/app/agents/chat/interview_reply_length.py @@ -309,49 +309,3 @@ def bump_reply_plan_for_background_voice( likely_chit_chat=plan.likely_chit_chat, information_rich=plan.information_rich, ) - - -# 向后兼容:旧名与旧签名(仅测试或外部引用) -def compute_reply_length_strategy( - user_message_len: int, - *, - likely_new_detail: bool, - likely_chit_chat: bool, - settings: "Settings", -) -> ReplyPlan: - """已弃用:请用 compute_reply_plan(user_message, ...)。保留供过渡期。""" - # 无法还原 information_rich,按旧逻辑近似 - n = max(0, int(user_message_len)) - max_segments = int(settings.chat_interview_max_segments) - if n <= _LEN_BRIEF_MAX: - mode = ReplyLengthMode.brief - elif n < _LEN_LONG_MIN: - mode = ReplyLengthMode.standard - else: - if likely_chit_chat: - mode = ReplyLengthMode.standard - elif likely_new_detail: - mode = ReplyLengthMode.expanded - else: - mode = ReplyLengthMode.standard - return _plan_from_mode( - mode, - max_segments=max_segments, - settings=settings, - background_voice=None, - likely_new=likely_new_detail, - likely_chit=likely_chit_chat, - info_rich=False, - ) - - -def bump_reply_length_strategy_for_background_voice( - plan: ReplyPlan, - *, - background_voice: str | None, - settings: "Settings", -) -> ReplyPlan: - """旧名兼容。""" - return bump_reply_plan_for_background_voice( - plan, background_voice=background_voice, settings=settings - ) diff --git a/api/app/agents/chat/occupation_context.py b/api/app/agents/chat/occupation_context.py new file mode 100644 index 0000000..4321dea --- /dev/null +++ b/api/app/agents/chat/occupation_context.py @@ -0,0 +1,35 @@ +"""default 路径下根据用户职业自由文本注入轻量提示(与 cadre/military 专属块正交)。""" + +from __future__ import annotations + +from app.agents.chat.background_voice import normalize_background_voice + + +def get_occupation_chat_hint(occupation: str | None, background_voice: str) -> str: + """default 路径的通用职业上下文;cadre/military 已有专属块,返回空串。""" + if normalize_background_voice(background_voice) != "default": + return "" + occ = (occupation or "").strip() + if not occ: + return "" + return ( + f"## 用户职业背景\n" + f"用户从事过「{occ}」相关工作。聊天时自然贴合这一背景," + f"在用语和追问方向上适度靠近用户的职业经历与知识面,但不要刻意。" + ) + + +def get_occupation_narrative_hint(occupation: str | None, background_voice: str) -> str: + """default 路径的叙事职业上下文。""" + if normalize_background_voice(background_voice) != "default": + return "" + occ = (occupation or "").strip() + if not occ: + return "" + return ( + f"## 用户职业背景(仅供文体微调,须遵守事实边界)\n" + f"用户从事过「{occ}」相关工作。叙事时可适度使用该领域常见的书面用语," + f"让正文贴近讲述者的身份感。\n" + f"**禁止**因职业背景而补充口述未出现的岗位职责、行业流程、单位层级或专业术语细节;" + f"只有当用户口述里已出现相关表达时,才可顺势书面化。" + ) diff --git a/api/app/agents/chat/orchestrator.py b/api/app/agents/chat/orchestrator.py index 4b452ba..21c03cb 100644 --- a/api/app/agents/chat/orchestrator.py +++ b/api/app/agents/chat/orchestrator.py @@ -203,6 +203,7 @@ class ChatOrchestrator: user_profile_context = "" background_voice = "default" + occupation = "" if user: user_profile_context = format_user_profile_context( birth_year=user.birth_year, @@ -211,6 +212,7 @@ class ChatOrchestrator: occupation=user.occupation, ) background_voice = infer_background_voice(user.occupation) + occupation = user.occupation or "" memory_evidence_text = await _fetch_interview_memory_evidence( db, user_id, normalized_user_message @@ -225,6 +227,7 @@ class ChatOrchestrator: memory_evidence_text=memory_evidence_text, background_voice=background_voice, normalized_user_message=normalized_user_message, + occupation=occupation, ) if agent_summary_enabled(): logger.info( @@ -297,6 +300,7 @@ class ChatOrchestrator: memory_evidence_text: str = "", background_voice: str = "default", normalized_user_message: str | None = None, + occupation: str = "", ) -> AgentChatTurn: """委托 InterviewAgent 生成访谈回复(持久化由调用方负责)。""" return await self.interview_agent.generate_response_with_state( @@ -308,6 +312,7 @@ class ChatOrchestrator: memory_evidence_text=memory_evidence_text, background_voice=background_voice, normalized_user_message=normalized_user_message, + occupation=occupation, ) def detect_user_stage(self, user_message: str) -> str: @@ -320,6 +325,7 @@ class ChatOrchestrator: memoir_state: MemoirStateSchema, user_profile_context: str = "", background_voice: str = "default", + occupation: str = "", ) -> List[str]: """ 委托 InterviewAgent 生成访谈开场白(持久化由调用方 ConversationHistoryStore 负责)。 @@ -329,4 +335,5 @@ class ChatOrchestrator: memoir_state=memoir_state, user_profile_context=user_profile_context, background_voice=background_voice, + occupation=occupation, ) diff --git a/api/app/agents/chat/prompts_conversation.py b/api/app/agents/chat/prompts_conversation.py index 8eb1795..e916c81 100644 --- a/api/app/agents/chat/prompts_conversation.py +++ b/api/app/agents/chat/prompts_conversation.py @@ -9,6 +9,7 @@ from app.agents.chat.background_voice import ( get_background_voice_chat_block, normalize_background_voice, ) +from app.agents.chat.occupation_context import get_occupation_chat_hint from app.agents.chat.interview_reply_length import ( heuristic_likely_chit_chat, heuristic_likely_emotional, @@ -175,6 +176,7 @@ def get_opening_prompt( user_profile_context: str = "", persona: str = "default", background_voice: str = "default", + occupation: str = "", ) -> str: """空对话时 AI 先开口的提示词""" stage_name_map = { @@ -262,6 +264,8 @@ def get_opening_prompt( persona_extra = f"\n## 访谈性格\n{opening_persona}\n" if opening_persona else "" voice_block = get_background_voice_chat_block(background_voice) voice_section = f"\n{voice_block}\n" if voice_block else "" + occ_hint = get_occupation_chat_hint(occupation, background_voice) + occ_section = f"\n{occ_hint}\n" if occ_hint else "" bv = normalize_background_voice(background_voice) if bv == "default": opening_head = ( @@ -276,7 +280,7 @@ def get_opening_prompt( return f"""{opening_head} {profile_section} {topics_heading} -{persona_extra}{voice_section} +{persona_extra}{voice_section}{occ_section} ## 任务 1. 简短问候。 {task_question} @@ -380,6 +384,7 @@ def get_guided_conversation_prompt( memory_evidence_text: str = "", reply_length_mode: str = "standard", background_voice: str = "default", + occupation: str = "", ) -> str: """生成状态感知的对话提示词(档位由 Agent 计算的 ReplyPlan 传入,不在此重复推导)。""" persona_key = normalize_interview_persona(persona) @@ -539,13 +544,15 @@ def get_guided_conversation_prompt( voice_block = get_background_voice_chat_block(background_voice) voice_section = f"\n{voice_block}\n" if voice_block else "" + occ_hint = get_occupation_chat_hint(occupation, background_voice) + occ_section = f"\n{occ_hint}\n" if occ_hint else "" intro_line = _guided_voice_intro_line(background_voice) prompt = f"""{intro_line} {topic_desc} {reply_length_section} {profile_section} -{voice_section} +{voice_section}{occ_section} ## 本阶段已聊 {filled_slots_str} diff --git a/api/app/agents/memoir/narrative_agent.py b/api/app/agents/memoir/narrative_agent.py index 2cf17cf..e3e1bbc 100644 --- a/api/app/agents/memoir/narrative_agent.py +++ b/api/app/agents/memoir/narrative_agent.py @@ -68,6 +68,7 @@ class NarrativeAgent: birth_year: Optional[int] = None, llm: Any = None, background_voice: str = "default", + occupation: str = "", ) -> str: """将新对话改写为叙述。若无 LLM 则直接拼接。 @@ -88,6 +89,7 @@ class NarrativeAgent: user_profile=user_profile, birth_year=birth_year, background_voice=background_voice, + occupation=occupation, ) max_tokens = 8192 agent_name = "NarrativeAgent.generate_narrative_merge" @@ -100,6 +102,7 @@ class NarrativeAgent: user_profile=user_profile, birth_year=birth_year, background_voice=background_voice, + occupation=occupation, ) max_tokens = 4096 agent_name = "NarrativeAgent.generate_narrative" diff --git a/api/app/agents/memoir/prompts.py b/api/app/agents/memoir/prompts.py index dfbaf60..9546451 100644 --- a/api/app/agents/memoir/prompts.py +++ b/api/app/agents/memoir/prompts.py @@ -7,6 +7,7 @@ import re from typing import Optional from app.agents.chat.background_voice import get_background_voice_narrative_block +from app.agents.chat.occupation_context import get_occupation_narrative_hint from app.features.memory.evidence_format import ( dedupe_evidence_chunk_rows, format_evidence_chunks_for_prompt, @@ -142,7 +143,14 @@ def _memoir_fidelity_core_rules() -> str: 1. **正文只能展开「本段用户口述」区块中的内容**。若输入中有「相关记忆摘录」等参考区,其中信息**不得**写成本人本轮亲口经历的细节;最多用一两句作主题衔接,且不得引入摘录里才有的具体人名、地点、时间、对话、数字。 2. **禁止编造**:不得新增用户未提及的具体人物姓名、对话原文、地点、时间、事件经过、因果、数字;不得推断性心理描写或「典型年代场景」填充。**口述未明确结果、结局或对方最终决定时**,不得用常识补全为确定断言(例如未清楚表达落选、未通过、被拒绝等,则不得写「未能被选中」「最终没有录用」等);只写已明确的过程与事实,不确定处宁可略写或使用中性表述。 3. **禁止为凑字数扩写**:材料短则输出短;段落数量与长度随材料而定。 -4. 允许:去除口语赘词与寒暄、调整语序、合并重复指代、把口语改为书面语;**不得**用虚构细节「让文章更好看」。""" +4. 允许:去除口语赘词与寒暄、调整语序、合并重复指代、把口语改为书面语;**不得**用虚构细节「让文章更好看」。 + +## 以下操作是鼓励的(不算编造) +- 口语转书面语:删语气词、用成语/四字词替换口语表达、调整语序 +- 过渡句与衔接句:如「那段日子」「回想起来」等,只要不引入新的实体 +- 基于口述已有情感的书面化渲染(如口述说「难受」,可改为「心里不好受」)——前提是不新增具体场景、数字、动作 +- 合并同义重复表述,让叙述更紧凑 +- 纠正明显的语音识别错字""" def _memoir_fidelity_user_profile_rules() -> str: @@ -185,7 +193,13 @@ def _memoir_editor_narrative_style_block() -> str: - 保留生动的细节,将口语表达改写为有画面感的书面叙述 - 去除口语中的填充词和无意义重复 - 保持时间顺序和逻辑清晰 -- **文采服务于真实**:可以有文学性的表达与恰当的情感渲染,但不得虚构新的事实来增色 +- **在事实边界内,鼓励使用有温度的传记笔法**,让读者感受到讲述者当时的心情;可有文学性的表达与恰当的情感渲染,但不得虚构新的事实来增色 + +### 示例(仅供参考允许的改写程度;只改语气、不加新事实) +- 原文:「那时候穷啊,一家人挤一间房。」 + → 改写:「那时家里拮据,一家人挤在一间屋里过日子。」 +- 原文:「后来他走了,我挺难受的。」 + → 改写:「他走后的那段日子,心里一直不是滋味。」 ### 输出格式约束 - 使用第一人称 @@ -193,12 +207,17 @@ def _memoir_editor_narrative_style_block() -> str: - 如有「衔接上下文」,仅保持语气与时间线连贯,不重复已有段落全文""" -def get_narrative_editor_system_prompt(background_voice: str = "default") -> str: +def get_narrative_editor_system_prompt( + background_voice: str = "default", occupation: str = "" +) -> str: """故事/章节叙事:传记作家式书面语 + 事实边界(chapter 直接展示 story 时使用)。""" + occ_hint = get_occupation_narrative_hint(occupation, background_voice) tail = get_background_voice_narrative_block(background_voice) base = f"""{get_memoir_fidelity_facts_only_prompt()} {_memoir_editor_narrative_style_block()}""" + if occ_hint: + base = f"{base}\n\n{occ_hint}" if not tail: return base return f"{base}\n\n{tail}" @@ -396,6 +415,7 @@ def get_narrative_prompt( birth_year: Optional[int] = None, archived_summaries: str = "", background_voice: str = "default", + occupation: str = "", ) -> str: """将新对话改写为叙述(只输出新内容的改写,不重复已有内容)""" context_tail = "" @@ -418,7 +438,7 @@ def get_narrative_prompt( age_hint = _build_age_hint(stage, birth_year) time_section = f"\n时间参考:{age_hint}" if age_hint else "" - return f"""{get_narrative_editor_system_prompt(background_voice=background_voice)} + return f"""{get_narrative_editor_system_prompt(background_voice=background_voice, occupation=occupation)} 阶段:{stage} 可用信息(slots,仅可复述其中已出现事实):{slots}{profile_section}{time_section} @@ -449,6 +469,7 @@ def get_narrative_json_prompt( user_profile: str = "", birth_year: Optional[int] = None, background_voice: str = "default", + occupation: str = "", ) -> str: """将新对话改写为叙述,输出 JSON 格式(paragraphs: [{content, image_description}])""" context_tail = "" @@ -465,7 +486,7 @@ def get_narrative_json_prompt( age_hint = _build_age_hint(stage, birth_year) time_section = f"\n时间参考:{age_hint}" if age_hint else "" - return f"""{get_narrative_editor_system_prompt(background_voice=background_voice)} + return f"""{get_narrative_editor_system_prompt(background_voice=background_voice, occupation=occupation)} 请将「本段用户口述」改写为第一人称书面叙述,并输出 **纯 JSON**,不要包含任何其他文字或 markdown 代码块。 **JSON 输出**:接口已启用 `response_format=json_object`(与 DeepSeek JSON 模式一致),只输出一个合法 JSON 对象。 @@ -481,7 +502,7 @@ def get_narrative_json_prompt( 1. **只展开「本段用户口述」**;若有参考摘录区,不得把摘录中的具体事实写成本轮亲历经历(见系统说明)。 2. 过滤语气词、寒暄、与 AI 的交互;不重复已有故事全文;本批只写同一主题/事件链。 3. 段落数量与每段长度**随材料而定**,禁止为凑字数编造。 -4. 使用第一人称、**优雅书面语**(可适当过渡与铺陈,须基于口述事实);不要直接引用原话;不要用 `#`、`##`、表格。 +4. 使用第一人称、**优雅书面语**,改写须符合系统说明中的「传记作家文体」与「改写原则」(含示例):在事实边界内可做书面化、过渡与情感渲染,**须基于口述事实**;不要直接引用原话;不要用 `#`、`##`、表格。 5. **不推断结局**:若用户未明确说结果(是否录取、是否被选中等),不要凭常识补全为确定结论;只复述已说清楚的内容。 ## 输出格式(严格 JSON) @@ -527,6 +548,7 @@ def get_narrative_merge_json_prompt( user_profile: str = "", birth_year: Optional[int] = None, background_voice: str = "default", + occupation: str = "", ) -> str: """ 已有故事追加:将「已有全文(或节选)」与「本段口述」合并为**一篇**第一人称叙述, @@ -542,7 +564,7 @@ def get_narrative_merge_json_prompt( age_hint = _build_age_hint(stage, birth_year) time_section = f"\n时间参考:{age_hint}" if age_hint else "" - return f"""{get_narrative_editor_system_prompt(background_voice=background_voice)} + return f"""{get_narrative_editor_system_prompt(background_voice=background_voice, occupation=occupation)} 你正在**扩写并重组**一则已有回忆录故事:必须把「已有故事」中的事实全部保留在输出中(可合并重复表述、调整语序),并融入「本段用户口述」中的新事实;按**事件发生的时间顺序**排列段落(早→晚);禁止丢弃未矛盾的旧内容。 @@ -559,7 +581,7 @@ def get_narrative_merge_json_prompt( 1. 输出为**完整故事正文**(不是仅写本段):`paragraphs` 须包含重组后的**全文**。 2. **禁止编造**:不得新增用户未在「已有」或「本段」中出现的人名、地点、时间、对话、数字。 3. 若本段与旧文完全重复或无新信息,可仅输出与旧文等价重组后的正文(不得无故缩短到明显少于旧文)。 -4. 使用第一人称、**优雅书面语**(与系统说明中的传记作家文体一致);不要用 `#`、`##`、表格。 +4. 使用第一人称、**优雅书面语**,改写须符合系统说明中的「传记作家文体」与「改写原则」(含示例):在事实边界内可做书面化、过渡与情感渲染,**须基于口述与旧文已有事实**;不要用 `#`、`##`、表格。 5. **不推断结局**:本段口述未明确结果时,不要用常识补全落选/未通过等确定说法,除非旧文中已有同一事实。 ## 输出格式(严格 JSON) diff --git a/api/app/core/text_normalize.py b/api/app/core/text_normalize.py new file mode 100644 index 0000000..5ba6a03 --- /dev/null +++ b/api/app/core/text_normalize.py @@ -0,0 +1,72 @@ +"""口述/聊天输入的确定性规则与可选 LLM 纠错(供 conversation 与 memoir 共用)。""" + +from __future__ import annotations + +import json +import re +from typing import Any + +from app.core.langchain_llm import invoke_json_object +from app.core.logging import get_logger +from app.features.memoir.memoir_images.json_payload import extract_json_payload + +logger = get_logger(__name__) + +_MEI_KANSHANG_RE = re.compile(r"美(?=看上[我你他她它])") + + +def apply_oral_rules(text: str) -> str: + """确定性规则;保守替换,仅覆盖高频误听误打模式。""" + s = text or "" + if not s: + return s + return _MEI_KANSHANG_RE.sub("没", s) + + +def llm_normalize_text( + text: str, + llm: Any, + *, + max_input_chars: int, + max_tokens: int, + agent_name: str, +) -> str | None: + """仅修正明显错字与同音字,不增事实;失败返回 None。""" + if not llm or not (text or "").strip(): + return None + t = (text or "").strip() + if len(t) > max_input_chars: + logger.debug( + "event=llm_text_normalize_skip reason=input_too_long len={} max={}", + len(t), + max_input_chars, + ) + return None + prompt = f"""你是口述转写纠错助手。只修正明显的同音错别字、别字与标点,使句子通顺可读。 +禁止增加事实、不补充细节、不摘要、不改写句式风格;不得新增人名、地名、数字、事件。 +若原文已通顺或无法确定错误,则照抄输入。 + +【用户口述】 +{t} + +**JSON 输出**:只输出一个合法 JSON 对象。 +{{"normalized_text": "纠错后的完整文本(与输入等意,仅修错字与标点)"}} + +只输出 JSON,不要其它文字。""" + try: + raw = invoke_json_object( + llm, + prompt, + max_tokens=max_tokens, + agent=agent_name, + ) + data = json.loads(extract_json_payload(raw)) + if not isinstance(data, dict): + return None + out = (data.get("normalized_text") or "").strip() + if not out: + return None + return out + except Exception as e: + logger.warning("llm_normalize_text 失败 {}: {}", agent_name, e) + return None diff --git a/api/app/features/auth/router.py b/api/app/features/auth/router.py index 116faf2..32c2f75 100644 --- a/api/app/features/auth/router.py +++ b/api/app/features/auth/router.py @@ -313,11 +313,10 @@ async def upload_avatar( except HTTPException: raise except Exception as e: - error_msg = f"处理图片失败: {str(e)}" - logger.exception("头像上传失败: {}", error_msg) + logger.exception("头像上传失败: {}", e) raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=error_msg, + detail="处理图片失败,请重试", ) from e diff --git a/api/app/features/conversation/input_normalize.py b/api/app/features/conversation/input_normalize.py index 907640c..009567f 100644 --- a/api/app/features/conversation/input_normalize.py +++ b/api/app/features/conversation/input_normalize.py @@ -7,70 +7,26 @@ from __future__ import annotations -import json -import re from typing import Any from app.core.config import settings -from app.core.langchain_llm import invoke_json_object +from app.core.text_normalize import apply_oral_rules, llm_normalize_text from app.core.logging import get_logger -from app.features.memoir.memoir_images.json_payload import extract_json_payload logger = get_logger(__name__) -# 口语/ASR 常见同音:「没」误为「美」且与「看上」搭配(避免误伤「美容」「选美」等) -_MEI_KANSHANG_RE = re.compile(r"美(?=看上[我你他她它])") - - -def apply_conversation_input_rules(text: str) -> str: - """确定性规则;保守替换,仅覆盖高频误听误打模式。与 memoir 共用。""" - s = text or "" - if not s: - return s - return _MEI_KANSHANG_RE.sub("没", s) +apply_conversation_input_rules = apply_oral_rules def _llm_normalize_chat_input(text: str, llm: Any) -> str | None: """仅修正明显错字与同音字,不增事实;失败返回 None。""" - if not llm or not (text or "").strip(): - return None - max_in = int(settings.chat_input_normalize_llm_max_input_chars) - t = (text or "").strip() - if len(t) > max_in: - logger.debug( - "event=chat_input_normalize_llm_skip reason=input_too_long len={} max={}", - len(t), - max_in, - ) - return None - prompt = f"""你是口述转写纠错助手。只修正明显的同音错别字、别字与标点,使句子通顺可读。 -禁止增加事实、不补充细节、不摘要、不改写句式风格;不得新增人名、地名、数字、事件。 -若原文已通顺或无法确定错误,则照抄输入。 - -【用户口述】 -{t} - -**JSON 输出**:只输出一个合法 JSON 对象。 -{{"normalized_text": "纠错后的完整文本(与输入等意,仅修错字与标点)"}} - -只输出 JSON,不要其它文字。""" - try: - raw = invoke_json_object( - llm, - prompt, - max_tokens=int(settings.chat_input_normalize_llm_max_tokens), - agent="chat_input_normalize.llm", - ) - data = json.loads(extract_json_payload(raw)) - if not isinstance(data, dict): - return None - out = (data.get("normalized_text") or "").strip() - if not out: - return None - return out - except Exception as e: - logger.warning("chat_input_normalize LLM 失败,回退规则结果: {}", e) - return None + return llm_normalize_text( + text, + llm, + max_input_chars=int(settings.chat_input_normalize_llm_max_input_chars), + max_tokens=int(settings.chat_input_normalize_llm_max_tokens), + agent_name="chat_input_normalize.llm", + ) def normalize_chat_input_for_agent(text: str, *, llm: Any | None = None) -> str: diff --git a/api/app/features/conversation/service.py b/api/app/features/conversation/service.py index 2f6e939..6ca3816 100644 --- a/api/app/features/conversation/service.py +++ b/api/app/features/conversation/service.py @@ -15,7 +15,7 @@ from app.core.logging import get_logger from app.core.redis import redis_service from app.core.storage_purge import delete_object_storage_keys_best_effort from app.features.conversation import repo -from app.features.conversation.models import Conversation +from app.features.conversation.models import Conversation, Segment from app.features.conversation.session_history import ( conversation_messages_to_redis_history, ) @@ -108,11 +108,61 @@ class ConversationService: self._quota = quota_service self._object_storage = object_storage + async def ensure_ws_connection( + self, conversation_id: str, user_id: str + ) -> tuple[Conversation | None, str]: + """ + WebSocket:加载或创建对话。返回 (conversation, err)。 + err 为空表示成功;否则为 forbidden | deleted。 + """ + conv = await self._db.get(Conversation, conversation_id) + if not conv: + conv = Conversation( + id=conversation_id, + user_id=user_id, + started_at=datetime.now(timezone.utc), + status="active", + ) + self._db.add(conv) + await self._db.commit() + await self._db.refresh(conv) + return conv, "" + if conv.user_id != user_id: + return None, "forbidden" + if conv.deleted_at is not None: + return None, "deleted" + return conv, "" + + async def create_user_segment( + self, + conversation: Conversation, + user_id: str, + text: str, + *, + audio_url: str | None = None, + audio_duration_seconds: int | None = None, + ) -> Segment: + if conversation.user_id != user_id: + raise ValueError("conversation ownership mismatch") + segment = Segment( + id=str(uuid.uuid4()), + conversation_id=conversation.id, + user_input_text=text, + audio_url=audio_url, + audio_duration_seconds=audio_duration_seconds, + processed=False, + ) + self._db.add(segment) + conversation.last_message_at = datetime.now(timezone.utc) + await self._db.commit() + await self._db.refresh(segment) + return segment + async def _clear_history(self, conversation_id: str) -> None: try: await redis_service.clear_conversation_history(conversation_id) - except Exception: - pass + except Exception as e: + logger.debug("清空会话历史失败: {}", e) async def ensure_redis_history_from_db(self, conversation_id: str) -> list[dict]: """ diff --git a/api/app/features/conversation/ws/router.py b/api/app/features/conversation/ws/router.py index 4a3ef18..c7ff776 100644 --- a/api/app/features/conversation/ws/router.py +++ b/api/app/features/conversation/ws/router.py @@ -18,13 +18,11 @@ from app.core.dependencies import get_asr_provider from app.core.logging import get_logger from app.core.security import verify_token from app.features.conversation.history_store import ConversationHistoryStore -from app.features.conversation.models import Conversation, Segment from app.features.conversation.service import ConversationService from app.features.conversation.ws.connection_manager import manager from app.features.conversation.ws.message_types import MessageType from app.features.conversation.ws.pipeline import ( _delayed_listening_feedback, - _mark_conversation_active, _voice_session_id_from_client_segment_id, background_runner, bump_tts_cancel_epoch, @@ -106,49 +104,41 @@ async def websocket_endpoint( }, ) - conversation = await db.get(Conversation, conversation_id) - if not conversation: - conversation = Conversation( - id=conversation_id, - user_id=user_id, - started_at=datetime.now(timezone.utc), - status="active", + conversation, ws_conn_err = await conversation_service.ensure_ws_connection( + conversation_id, user_id + ) + if ws_conn_err == "forbidden": + try: + await manager.send_message( + conversation_id, + { + "type": MessageType.ERROR, + "data": {"message": "无权访问此对话"}, + "timestamp": datetime.now(timezone.utc).isoformat(), + }, + ) + except Exception: + pass + await websocket.close( + code=status.WS_1008_POLICY_VIOLATION, reason="无权访问此对话" ) - db.add(conversation) - await db.commit() - else: - if conversation.user_id != user_id: - try: - await manager.send_message( - conversation_id, - { - "type": MessageType.ERROR, - "data": {"message": "无权访问此对话"}, - "timestamp": datetime.now(timezone.utc).isoformat(), - }, - ) - except Exception: - pass - await websocket.close( - code=status.WS_1008_POLICY_VIOLATION, reason="无权访问此对话" + return + if ws_conn_err == "deleted": + try: + await manager.send_message( + conversation_id, + { + "type": MessageType.ERROR, + "data": {"message": "对话已删除"}, + "timestamp": datetime.now(timezone.utc).isoformat(), + }, ) - return - if conversation.deleted_at is not None: - try: - await manager.send_message( - conversation_id, - { - "type": MessageType.ERROR, - "data": {"message": "对话已删除"}, - "timestamp": datetime.now(timezone.utc).isoformat(), - }, - ) - except Exception: - pass - await websocket.close( - code=status.WS_1008_POLICY_VIOLATION, reason="对话已删除" - ) - return + except Exception: + pass + await websocket.close( + code=status.WS_1008_POLICY_VIOLATION, reason="对话已删除" + ) + return history = await conversation_service.ensure_redis_history_from_db( conversation_id @@ -205,6 +195,7 @@ async def websocket_endpoint( background_voice=infer_background_voice( user.occupation ), + occupation=user.occupation or "", ) ) ai_msg_id = await ConversationHistoryStore( @@ -291,18 +282,12 @@ async def websocket_endpoint( ) continue - segment = Segment( - id=str(uuid.uuid4()), - conversation_id=conversation_id, - user_input_text=text_message, - processed=False, + segment = await conversation_service.create_user_segment( + conversation, + user_id, + text_message, ) - db.add(segment) - user_message_timestamp = _mark_conversation_active( - conversation - ) - await db.commit() - await db.refresh(segment) + user_message_timestamp = conversation.last_message_at await background_runner.queue_message( conversation.user_id, segment.id, @@ -554,20 +539,16 @@ async def websocket_endpoint( ads = int(audio_duration) except (TypeError, ValueError): ads = 0 - segment = Segment( - id=str(uuid.uuid4()), - conversation_id=conversation_id, - user_input_text=asr_text, - audio_url=f"audio:{audio_duration}s", - audio_duration_seconds=ads if ads > 0 else None, - processed=False, + segment = ( + await conversation_service.create_user_segment( + conversation, + user_id, + asr_text, + audio_url=f"audio:{audio_duration}s", + audio_duration_seconds=ads if ads > 0 else None, + ) ) - db.add(segment) - user_message_timestamp = _mark_conversation_active( - conversation - ) - await db.commit() - await db.refresh(segment) + user_message_timestamp = conversation.last_message_at await background_runner.queue_message( conversation.user_id, segment.id, @@ -606,7 +587,7 @@ async def websocket_endpoint( { "type": MessageType.ERROR, "data": { - "message": f"处理音频消息失败: {str(e)}" + "message": "语音处理失败,请重试或使用文字输入" }, "timestamp": datetime.now( timezone.utc @@ -646,7 +627,7 @@ async def websocket_endpoint( conversation_id, { "type": MessageType.ERROR, - "data": {"message": f"转写失败: {str(e)}"}, + "data": {"message": "语音转写失败,请重试"}, "timestamp": datetime.now(timezone.utc).isoformat(), }, ) @@ -655,9 +636,7 @@ async def websocket_endpoint( bump_tts_cancel_epoch(conversation_id) elif msg_type == MessageType.END_CONVERSATION: - conversation.status = "ended" - conversation.ended_at = datetime.now(timezone.utc) - await db.commit() + await conversation_service.end(conversation_id, user_id) await process_conversation_segments( conversation_id, db, quota_service @@ -715,7 +694,7 @@ async def websocket_endpoint( conversation_id, { "type": MessageType.ERROR, - "data": {"message": str(e)}, + "data": {"message": "处理失败,请重试"}, "timestamp": datetime.now( timezone.utc ).isoformat(), @@ -739,7 +718,7 @@ async def websocket_endpoint( conversation_id, { "type": MessageType.ERROR, - "data": {"message": str(e)}, + "data": {"message": "处理失败,请重试"}, "timestamp": datetime.now(timezone.utc).isoformat(), }, ) diff --git a/api/app/features/memoir/helpers.py b/api/app/features/memoir/helpers.py index f94c93b..f048d30 100644 --- a/api/app/features/memoir/helpers.py +++ b/api/app/features/memoir/helpers.py @@ -95,6 +95,8 @@ def chapter_cover_to_dict( if not chapter_eligible_for_cover_by_inline_body_image_count(ch): return None m = primary_chapter_memoir_image(ch) + if m and is_image_permanently_unavailable(m): + m = None if m: return memoir_image_to_dict(m) asset_url_map = asset_url_map or {} @@ -120,8 +122,10 @@ def chapter_cover_to_dict( return None -def _chapter_markdown(ch: Chapter) -> str: - """正文真源:canonical_markdown。""" +def _chapter_markdown(ch: Chapter, *, override: str | None = None) -> str: + """正文:优先读路径临时物化串,否则 canonical_markdown。""" + if override is not None and str(override).strip(): + return str(override).strip() md = getattr(ch, "canonical_markdown", None) if md and str(md).strip(): return str(md).strip() @@ -129,12 +133,15 @@ def _chapter_markdown(ch: Chapter) -> str: def chapter_to_list_dict( - ch: Chapter, asset_url_map: dict[str, str] | None = None + ch: Chapter, + asset_url_map: dict[str, str] | None = None, + *, + markdown_for_response: str | None = None, ) -> dict: """列表视图:与详情字段对齐的最小子集。""" cover = chapter_cover_to_dict(ch, asset_url_map=asset_url_map) cover_normalized = first_normalized_image_for_api(cover) - canonical_raw = _chapter_markdown(ch) + canonical_raw = _chapter_markdown(ch, override=markdown_for_response) wcount = len(canonical_raw.strip()) if canonical_raw else 0 return { "id": ch.id, @@ -153,7 +160,12 @@ def chapter_to_list_dict( } -def chapter_to_dict(ch: Chapter, asset_url_map: dict[str, str] | None = None) -> dict: +def chapter_to_dict( + ch: Chapter, + asset_url_map: dict[str, str] | None = None, + *, + markdown_for_response: str | None = None, +) -> dict: """详情视图:stories-first 契约。asset_url_map 用于解析 asset:// 与 cover_asset_id。""" asset_url_map = asset_url_map or {} resolve = lambda aid: asset_url_map.get(aid) # noqa: E731 @@ -161,7 +173,7 @@ def chapter_to_dict(ch: Chapter, asset_url_map: dict[str, str] | None = None) -> cover = chapter_cover_to_dict(ch, asset_url_map=asset_url_map) cover_normalized = first_normalized_image_for_api(cover) # 正文真源:优先 canonical_markdown - canonical_md = _chapter_markdown(ch) + canonical_md = _chapter_markdown(ch, override=markdown_for_response) canonical_md = resolve_asset_refs_in_markdown(canonical_md, resolve) reading_segments = resolve_reading_segments_for_chapter_detail( ch, asset_url_map=asset_url_map diff --git a/api/app/features/memoir/oral_normalize.py b/api/app/features/memoir/oral_normalize.py index 9fdc300..5af3676 100644 --- a/api/app/features/memoir/oral_normalize.py +++ b/api/app/features/memoir/oral_normalize.py @@ -8,64 +8,21 @@ from __future__ import annotations -import json from typing import Any from app.core.config import settings -from app.core.langchain_llm import invoke_json_object -from app.core.logging import get_logger -from app.features.conversation.input_normalize import apply_conversation_input_rules -from app.features.memoir.memoir_images.json_payload import extract_json_payload - -logger = get_logger(__name__) - - -def apply_oral_normalization_rules(text: str) -> str: - """确定性规则;与 `apply_conversation_input_rules` 等价(memoir 历史名保留)。""" - return apply_conversation_input_rules(text) +from app.core.text_normalize import apply_oral_rules, llm_normalize_text def _llm_normalize_oral(text: str, llm: Any) -> str | None: """仅修正明显错字与同音字,不增事实;失败返回 None。""" - if not llm or not (text or "").strip(): - return None - max_in = int(settings.memoir_oral_normalize_llm_max_input_chars) - t = (text or "").strip() - if len(t) > max_in: - logger.debug( - "event=oral_normalize_llm_skip reason=input_too_long len={} max={}", - len(t), - max_in, - ) - return None - prompt = f"""你是口述转写纠错助手。只修正明显的同音错别字、别字与标点,使句子通顺可读。 -禁止增加事实、不补充细节、不摘要、不改写句式风格;不得新增人名、地名、数字、事件。 -若原文已通顺或无法确定错误,则照抄输入。 - -【用户口述】 -{t} - -**JSON 输出**:只输出一个合法 JSON 对象。 -{{"normalized_text": "纠错后的完整文本(与输入等意,仅修错字与标点)"}} - -只输出 JSON,不要其它文字。""" - try: - raw = invoke_json_object( - llm, - prompt, - max_tokens=int(settings.memoir_oral_normalize_llm_max_tokens), - agent="oral_normalize.llm", - ) - data = json.loads(extract_json_payload(raw)) - if not isinstance(data, dict): - return None - out = (data.get("normalized_text") or "").strip() - if not out: - return None - return out - except Exception as e: - logger.warning("oral_normalize LLM 失败,回退规则结果: {}", e) - return None + return llm_normalize_text( + text, + llm, + max_input_chars=int(settings.memoir_oral_normalize_llm_max_input_chars), + max_tokens=int(settings.memoir_oral_normalize_llm_max_tokens), + agent_name="oral_normalize.llm", + ) def normalize_oral_for_memoir(text: str, *, llm: Any | None = None) -> str: @@ -82,7 +39,7 @@ def normalize_oral_for_memoir(text: str, *, llm: Any | None = None) -> str: if mode == "off": return text or "" - base = apply_oral_normalization_rules(text or "") + base = apply_oral_rules(text or "") if mode != "llm": return base diff --git a/api/app/features/memoir/reading_segment_materialize.py b/api/app/features/memoir/reading_segment_materialize.py index 9a05d30..ed9e40c 100644 --- a/api/app/features/memoir/reading_segment_materialize.py +++ b/api/app/features/memoir/reading_segment_materialize.py @@ -65,8 +65,12 @@ def chapter_body_meets_minimum_for_display(canonical_markdown: str) -> bool: ) -def chapter_meets_minimum_display(ch: Any) -> bool: - """基于章节当前 canonical_markdown(物化后)判断是否可对读者展示。""" +def chapter_meets_minimum_display( + ch: Any, *, canonical_markdown_override: str | None = None +) -> bool: + """基于章节 canonical(或读路径临时物化串)判断是否可对读者展示。""" + if canonical_markdown_override is not None: + return chapter_body_meets_minimum_for_display(canonical_markdown_override) md = getattr(ch, "canonical_markdown", None) or "" return chapter_body_meets_minimum_for_display(str(md)) diff --git a/api/app/features/memoir/service.py b/api/app/features/memoir/service.py index cdbf108..e417d2d 100644 --- a/api/app/features/memoir/service.py +++ b/api/app/features/memoir/service.py @@ -21,11 +21,7 @@ from app.features.memoir.chapter_markdown_compose import ( materialize_chapter_markdown_from_loaded_chapter, ) from app.features.memoir.cover_eligibility import primary_chapter_memoir_image -from app.features.memoir.helpers import ( - chapter_to_dict, - chapter_to_list_dict, - is_image_permanently_unavailable, -) +from app.features.memoir.helpers import chapter_to_dict, chapter_to_list_dict from app.features.memoir.memoir_images.settings import MemoirImageSettings from app.features.memoir.models import Book, Chapter, ChapterStoryLink from app.features.memoir.reading_segment_materialize import ( @@ -37,6 +33,29 @@ from app.ports.storage import ObjectStorage logger = get_logger(__name__) +def prepare_chapter_read_view(chapter: Chapter) -> tuple[Chapter, str | None]: + """ + 读路径:不写入 DB。返回 (chapter, markdown_for_response)。 + None 表示序列化使用 ORM 上的 canonical_markdown。 + """ + has_story_links = bool(getattr(chapter, "story_links", None)) + has_snapshot = chapter.reading_segments_json is not None + dirty = chapter.markdown_compose_dirty is True + + if not dirty: + return chapter, None + + if has_snapshot: + return chapter, None + + if has_story_links: + md = materialize_chapter_markdown_from_loaded_chapter(chapter) + m = (md or "").strip() + return chapter, m if m else None + + return chapter, None + + async def get_or_create_book(user_id: str, db: AsyncSession): """Get the user's current book or return None.""" return await repo.get_current_book(user_id, db) @@ -67,29 +86,6 @@ class MemoirService: bundle = await self._memory.retrieve(user_id, query, top_k=top_k) return bundle.model_dump() - async def _cleanup_unavailable_images(self, ch: Chapter) -> None: - cleaned = False - for rec in getattr(ch, "images", None) or []: - if rec and is_image_permanently_unavailable(rec): - logger.info("清理不可用配图: chapter={}, image={}", ch.id, rec.id) - await self._db.delete(rec) - cleaned = True - if cleaned: - await self._db.commit() - await self._db.refresh(ch) - - async def _ensure_chapter_materialized(self, chapter: Chapter) -> Chapter: - has_story_links = bool(getattr(chapter, "story_links", None)) - has_snapshot = chapter.reading_segments_json is not None - if not has_story_links or (has_snapshot and not chapter.markdown_compose_dirty): - return chapter - - markdown = materialize_chapter_markdown_from_loaded_chapter(chapter) - await repo.append_chapter_compose_version_async(self._db, chapter, markdown) - await self._db.commit() - refreshed = await repo.get_chapter_by_id(chapter.id, self._db) - return refreshed or chapter - async def get_current_book(self, user_id: str) -> dict: book = await repo.get_current_book(user_id, self._db) if not book: @@ -152,8 +148,10 @@ class MemoirService: chapters_raw = list(result.unique().scalars().all()) chapters: List[Chapter] = [] for ch in chapters_raw: - ch2 = await self._ensure_chapter_materialized(ch) - if chapter_meets_minimum_display(ch2): + ch2, md_override = prepare_chapter_read_view(ch) + if chapter_meets_minimum_display( + ch2, canonical_markdown_override=md_override + ): chapters.append(ch2) asset_ids = collect_asset_ids_for_chapters(chapters) asset_map = await signed_urls_for_asset_ids(self._db, asset_ids) @@ -180,11 +178,18 @@ class MemoirService: asset_map = await signed_urls_for_asset_ids(self._db, asset_ids) all_chapters: List[dict] = [] for ch in chapters: - ch = await self._ensure_chapter_materialized(ch) - if not chapter_meets_minimum_display(ch): + ch, md_override = prepare_chapter_read_view(ch) + if not chapter_meets_minimum_display( + ch, canonical_markdown_override=md_override + ): continue - await self._cleanup_unavailable_images(ch) - all_chapters.append(chapter_to_list_dict(ch, asset_url_map=asset_map)) + all_chapters.append( + chapter_to_list_dict( + ch, + asset_url_map=asset_map, + markdown_for_response=md_override, + ) + ) return all_chapters async def get_chapter(self, chapter_id: str, user_id: str) -> dict: @@ -195,14 +200,19 @@ class MemoirService: raise HTTPException(status_code=403, detail="无权访问此章节") if not chapter.is_active: raise HTTPException(status_code=404, detail="Chapter not found") - chapter = await self._ensure_chapter_materialized(chapter) - await self._cleanup_unavailable_images(chapter) - if not chapter_meets_minimum_display(chapter): + chapter, md_override = prepare_chapter_read_view(chapter) + if not chapter_meets_minimum_display( + chapter, canonical_markdown_override=md_override + ): raise HTTPException(status_code=404, detail="Chapter not found") asset_map = await signed_urls_for_asset_ids( self._db, collect_asset_ids_for_chapter(chapter) ) - return chapter_to_dict(chapter, asset_url_map=asset_map) + return chapter_to_dict( + chapter, + asset_url_map=asset_map, + markdown_for_response=md_override, + ) async def disable_chapter(self, chapter_id: str, user_id: str) -> dict: chapter = await repo.get_chapter_by_id(chapter_id, self._db) diff --git a/api/app/features/memoir/story_pipeline_sync.py b/api/app/features/memoir/story_pipeline_sync.py index 93ee30e..a3518b4 100644 --- a/api/app/features/memoir/story_pipeline_sync.py +++ b/api/app/features/memoir/story_pipeline_sync.py @@ -33,7 +33,7 @@ from app.features.memoir.memoir_images.settings import MemoirImageSettings from app.features.memoir.models import Chapter from app.features.memoir.narrative_to_markdown import narrative_to_markdown from app.features.memoir.oral_normalize import ( - apply_oral_normalization_rules, + apply_oral_rules, normalize_oral_for_memoir, ) from app.features.memoir.repo import ( @@ -63,7 +63,7 @@ def _route_segment_texts(category_segments: list) -> list[tuple[str, str]]: and (settings.memoir_oral_normalize_mode or "rules").strip().lower() != "off" ): - t = apply_oral_normalization_rules(raw) + t = apply_oral_rules(raw) else: t = raw out.append((str(seg.id), t)) @@ -312,6 +312,7 @@ def _run_batch_plan_writes( llm: Any, narrative_agent: NarrativeAgent, background_voice: str = "default", + occupation: str = "", ) -> set[str]: dispatch_ids: set[str] = set() max_chars = int(settings.story_append_max_canonical_chars) @@ -362,6 +363,7 @@ def _run_batch_plan_writes( birth_year=user_birth_year, llm=llm, background_voice=background_voice, + occupation=occupation, ) json_invalid = False s0 = (raw_gen or "").strip() @@ -461,6 +463,7 @@ def run_story_pipeline_for_category_batch( user_birth_year: int | None, llm: Any, background_voice: str = "default", + occupation: str = "", ) -> tuple[Chapter | None, bool, set[str]]: """ 返回 (chapter, needs_cover_enqueue, story_ids_to_dispatch_after_commit)。 @@ -589,6 +592,7 @@ def run_story_pipeline_for_category_batch( llm=llm, narrative_agent=narrative_agent, background_voice=background_voice, + occupation=occupation, ) else: route = route_agent.decide( @@ -636,6 +640,7 @@ def run_story_pipeline_for_category_batch( birth_year=user_birth_year, llm=llm, background_voice=background_voice, + occupation=occupation, ) json_invalid = False s0 = (raw_gen or "").strip() diff --git a/api/app/features/payment/router.py b/api/app/features/payment/router.py index fc254eb..bc1a668 100644 --- a/api/app/features/payment/router.py +++ b/api/app/features/payment/router.py @@ -66,7 +66,7 @@ async def wechat_notify( ) except Exception as e: logger.exception("微信支付回调处理失败: {}", e) - return {"code": "FAIL", "message": str(e)} + return {"code": "FAIL", "message": "处理失败"} @router.post("/notify/alipay", include_in_schema=False) diff --git a/api/app/features/story/post_commit.py b/api/app/features/story/post_commit.py index 1eb053b..60ca944 100644 --- a/api/app/features/story/post_commit.py +++ b/api/app/features/story/post_commit.py @@ -100,8 +100,8 @@ def enqueue_story_post_commit_effects( result.errors.append(f"generate_story_image:{sid}:{exc}") try: r.delete(key) - except Exception: - pass + except Exception as e: + logger.debug("Redis key 清理失败: {}", e) if need_recompose and chapter_ids: from app.tasks.chapter_compose_tasks import ( diff --git a/api/app/tasks/chapter_compose_tasks.py b/api/app/tasks/chapter_compose_tasks.py index bd40e8f..6e46681 100644 --- a/api/app/tasks/chapter_compose_tasks.py +++ b/api/app/tasks/chapter_compose_tasks.py @@ -3,7 +3,6 @@ from datetime import datetime, timezone from celery import shared_task -from sqlalchemy import select from app.core.chapter_pipeline_lock import ( acquire_chapter_pipeline_lock, @@ -14,8 +13,7 @@ from app.core.db import get_sync_db from app.core.logging import get_logger from app.core.memory_compaction_schedule import schedule_memory_compaction_run from app.features.memoir import repo as memoir_repo -from app.features.memoir.models import Chapter, ChapterStoryLink -from app.features.story.models import Story +from app.features.memoir.models import Chapter logger = get_logger(__name__) @@ -83,56 +81,3 @@ def recompose_chapter(self, chapter_id: str) -> dict: "composed" if composed else "empty", ) return {"status": "composed" if composed else "empty", "chapter_id": chapter_id} - - -@shared_task(bind=True, max_retries=3, default_retry_delay=30) -def recompose_chapters_for_story(self, story_id: str) -> dict: - """ - 按 story 找出 dirty 章节并物化。 - - .. deprecated:: - 请改用 `recompose_chapter`(按章聚合)+ `enqueue_story_post_commit_effects`; - 保留兼容,待调用方全部迁移后删除。 - """ - user_id: str | None = None - try: - with get_sync_db() as session: - story = session.get(Story, story_id) - user_id = str(story.user_id) if story else None - stmt = ( - select(Chapter.id) - .join( - ChapterStoryLink, - ChapterStoryLink.chapter_id == Chapter.id, - ) - .where( - ChapterStoryLink.story_id == story_id, - Chapter.markdown_compose_dirty.is_(True), - ) - ) - ids = list(session.scalars(stmt).all()) - for cid in ids: - memoir_repo.compose_chapter_from_story_links_sync(session, cid) - session.commit() - if user_id: - schedule_memory_compaction_run( - user_id, - { - "trigger_source": "chapter_recompose", - "trigger_time": datetime.now(timezone.utc).isoformat(), - "pipeline_run_id": str(self.request.id), - "story_ids": [story_id], - "recomposed_chapter_ids": ids, - }, - ) - logger.info( - "recompose_chapters_for_story: story={} recomposed_chapters={}", - story_id, - ids, - ) - return {"story_id": story_id, "recomposed_chapter_ids": ids} - except Exception as exc: - logger.warning( - "recompose_chapters_for_story failed story={} err={}", story_id, exc - ) - raise self.retry(exc=exc) from exc diff --git a/api/app/tasks/memoir_tasks.py b/api/app/tasks/memoir_tasks.py index 06fd9b8..0b0b3ac 100644 --- a/api/app/tasks/memoir_tasks.py +++ b/api/app/tasks/memoir_tasks.py @@ -314,6 +314,7 @@ def process_memoir_segments(self, user_id: str, segment_ids: List[str]): user_profile = "" user_birth_year = None background_voice = "default" + user_occupation = "" if user_obj: user_birth_year = user_obj.birth_year user_profile = format_user_profile_context( @@ -323,6 +324,7 @@ def process_memoir_segments(self, user_id: str, segment_ids: List[str]): occupation=user_obj.occupation, ) background_voice = infer_background_voice(user_obj.occupation) + user_occupation = user_obj.occupation or "" story_dispatch_ids: Set[str] = set() @@ -382,6 +384,7 @@ def process_memoir_segments(self, user_id: str, segment_ids: List[str]): user_birth_year=user_birth_year, llm=llm, background_voice=background_voice, + occupation=user_occupation, ) story_dispatch_ids |= disp db.flush() @@ -512,6 +515,7 @@ def generate_chapter_content(self, user_id: str, stage: str, new_content: str): user_profile = "" user_birth_year = None background_voice = "default" + user_occupation = "" if user_obj: user_birth_year = user_obj.birth_year user_profile = format_user_profile_context( @@ -521,6 +525,7 @@ def generate_chapter_content(self, user_id: str, stage: str, new_content: str): occupation=user_obj.occupation, ) background_voice = infer_background_voice(user_obj.occupation) + user_occupation = user_obj.occupation or "" class _Seg: def __init__(self, text: str): @@ -538,6 +543,7 @@ def generate_chapter_content(self, user_id: str, stage: str, new_content: str): user_birth_year=user_birth_year, llm=llm, background_voice=background_voice, + occupation=user_occupation, ) db.commit() db.refresh(chapter) diff --git a/api/tests/test_background_voice.py b/api/tests/test_background_voice.py index 5764e84..8062362 100644 --- a/api/tests/test_background_voice.py +++ b/api/tests/test_background_voice.py @@ -1,6 +1,8 @@ """职业文本推断 background_voice(干部/军队)。""" from app.agents.chat.background_voice import ( + get_background_voice_chat_block, + get_background_voice_narrative_block, infer_background_voice, normalize_background_voice, ) @@ -38,3 +40,12 @@ def test_narrative_editor_system_prompt_appends_voice() -> None: mil = get_narrative_editor_system_prompt("military") assert len(mil) > len(base) assert "背景文体(军队" in mil + + +def test_cadre_military_blocks_include_retirement_context() -> None: + chat_c = get_background_voice_chat_block("cadre") + chat_m = get_background_voice_chat_block("military") + narr_c = get_background_voice_narrative_block("cadre") + narr_m = get_background_voice_narrative_block("military") + assert "退休" in chat_c and "退休" in narr_c + assert "退役" in chat_m and "退役" in narr_m diff --git a/api/tests/test_chapter_read_view.py b/api/tests/test_chapter_read_view.py new file mode 100644 index 0000000..d9f43af --- /dev/null +++ b/api/tests/test_chapter_read_view.py @@ -0,0 +1,62 @@ +"""章节读路径 prepare_chapter_read_view:不写库、临时物化串用于列表/详情。""" + +from unittest.mock import MagicMock, patch + +from app.features.memoir.reading_segment_materialize import ( + chapter_meets_minimum_display, +) +from app.features.memoir.service import prepare_chapter_read_view + + +def test_prepare_not_dirty_no_override() -> None: + ch = MagicMock() + ch.markdown_compose_dirty = False + ch.reading_segments_json = None + ch.story_links = [MagicMock()] + c, md = prepare_chapter_read_view(ch) + assert c is ch + assert md is None + + +def test_prepare_dirty_with_snapshot_no_override() -> None: + ch = MagicMock() + ch.markdown_compose_dirty = True + ch.reading_segments_json = [{"story_id": "s1", "body_markdown": "x" * 400}] + ch.story_links = [MagicMock()] + c, md = prepare_chapter_read_view(ch) + assert md is None + + +def test_prepare_dirty_no_snapshot_no_links_no_override() -> None: + ch = MagicMock() + ch.markdown_compose_dirty = True + ch.reading_segments_json = None + ch.story_links = [] + c, md = prepare_chapter_read_view(ch) + assert md is None + + +def test_prepare_dirty_no_snapshot_with_links_uses_materialize() -> None: + ch = MagicMock() + ch.markdown_compose_dirty = True + ch.reading_segments_json = None + ch.story_links = [MagicMock()] + fake_md = "正文" * 200 + with patch( + "app.features.memoir.service.materialize_chapter_markdown_from_loaded_chapter", + return_value=fake_md, + ) as m: + c, md = prepare_chapter_read_view(ch) + m.assert_called_once_with(ch) + assert md == fake_md.strip() + assert c is ch + + +def test_chapter_meets_minimum_uses_override() -> None: + ch = MagicMock() + ch.canonical_markdown = "" + long_md = "字" * 400 + assert ( + chapter_meets_minimum_display(ch, canonical_markdown_override=long_md) is True + ) + assert chapter_meets_minimum_display(ch, canonical_markdown_override="短") is False diff --git a/api/tests/test_http_contract_errors.py b/api/tests/test_http_contract_errors.py new file mode 100644 index 0000000..7f145a2 --- /dev/null +++ b/api/tests/test_http_contract_errors.py @@ -0,0 +1,75 @@ +"""HTTP 层对外错误文案脱敏契约(响应体不含内部异常串)。""" + +from io import BytesIO +from unittest.mock import MagicMock + +import pytest +from httpx import ASGITransport, AsyncClient + +from app.core.dependencies import get_current_user +from app.features.auth.deps import get_auth_service +from app.features.auth.router import router as auth_router +from app.features.payment.deps import get_payment_order_service +from app.features.payment.router import router as payment_router + + +@pytest.mark.asyncio +async def test_wechat_notify_returns_fixed_message_on_service_error() -> None: + from fastapi import FastAPI + + class BoomOrderService: + async def handle_wechat_notify(self, *, headers: dict, body: str): + raise RuntimeError("wechat_sdk_secret_123") + + app = FastAPI() + app.include_router(payment_router) + app.dependency_overrides[get_payment_order_service] = lambda: BoomOrderService() + + transport = ASGITransport(app=app) + async with AsyncClient(transport=transport, base_url="http://test") as client: + r = await client.post("/api/payment/notify/wechat", content="raw") + assert r.status_code == 200 + data = r.json() + assert data.get("code") == "FAIL" + assert data.get("message") == "处理失败" + assert "wechat_sdk" not in r.text + assert "secret" not in r.text.lower() + + +def _minimal_jpeg_bytes() -> bytes: + """1x1 JPEG 最小合法文件。""" + return ( + b"\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00" + b"\xff\xdb\x00C\x00\x08\x06\x06\x07\x06\x05\x08\x07\x07\x07\t\t\x08\n\x0c\x14\r\x0c\x0b\x0b\x0c\x19\x12\x13\x0f\x14\x1d\x1a\x1f\x1e\x1d\x1a\x1c\x1c $.' \",#\x1c\x1c(7),01444\x1f'9=82<.342\xff\xc0\x00\x11\x08\x00\x01\x00\x01\x01\x01\x11\x00\xff\xc4\x00\x14\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08\xff\xc4\x00\x14\x10\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xda\x00\x0c\x03\x01\x00\x02\x11\x03\x11\x00\x3f\x00\xaa\xff\xd9" + ) + + +@pytest.mark.asyncio +async def test_avatar_upload_500_detail_sanitized( + monkeypatch: pytest.MonkeyPatch, +) -> None: + from fastapi import FastAPI + + fake_user = MagicMock() + fake_user.id = "user-contract-test" + + class BoomAuth: + async def update_avatar_url(self, user_id: str, avatar_url: str): + raise RuntimeError("db_connection_secret_xyz") + + app = FastAPI() + app.include_router(auth_router) + app.dependency_overrides[get_current_user] = lambda: fake_user + app.dependency_overrides[get_auth_service] = lambda: BoomAuth() + + transport = ASGITransport(app=app) + files = {"file": ("a.jpg", BytesIO(_minimal_jpeg_bytes()), "image/jpeg")} + async with AsyncClient(transport=transport, base_url="http://test") as client: + r = await client.post("/api/auth/me/avatar", files=files) + + assert r.status_code == 500 + body = r.json() + detail = body.get("detail", "") + assert detail == "处理图片失败,请重试" + assert "secret" not in str(detail).lower() + assert "db_connection" not in r.text diff --git a/api/tests/test_interview_reply_length.py b/api/tests/test_interview_reply_length.py index 75a1057..de60f0a 100644 --- a/api/tests/test_interview_reply_length.py +++ b/api/tests/test_interview_reply_length.py @@ -7,8 +7,7 @@ import pytest from app.agents.chat.interview_reply_length import ( ReplyLengthMode, - bump_reply_length_strategy_for_background_voice, - compute_reply_length_strategy, + bump_reply_plan_for_background_voice, compute_reply_plan, ) from app.agents.state_schema import MemoirStateSchema @@ -29,10 +28,9 @@ def _fake_settings(**overrides: object) -> SimpleNamespace: def test_strategy_brief_when_very_short() -> None: - s = compute_reply_length_strategy( - 5, - likely_new_detail=False, - likely_chit_chat=False, + s = compute_reply_plan( + "x" * 5, + background_voice=None, settings=_fake_settings(), ) assert s.mode == ReplyLengthMode.brief @@ -41,10 +39,9 @@ def test_strategy_brief_when_very_short() -> None: def test_strategy_standard_mid_length() -> None: - s = compute_reply_length_strategy( - 50, - likely_new_detail=True, - likely_chit_chat=False, + s = compute_reply_plan( + "x" * 50, + background_voice=None, settings=_fake_settings(), ) assert s.mode == ReplyLengthMode.standard @@ -53,10 +50,11 @@ def test_strategy_standard_mid_length() -> None: def test_strategy_long_chit_stays_standard() -> None: - s = compute_reply_length_strategy( - 120, - likely_new_detail=False, - likely_chit_chat=True, + msg = "今天天气真好哈哈" * 11 + assert len(msg) >= 80 + s = compute_reply_plan( + msg, + background_voice=None, settings=_fake_settings(), ) assert s.mode == ReplyLengthMode.standard @@ -64,10 +62,12 @@ def test_strategy_long_chit_stays_standard() -> None: def test_strategy_long_with_new_detail_expanded() -> None: - s = compute_reply_length_strategy( - 120, - likely_new_detail=True, - likely_chit_chat=False, + base = "第一次认识他" + msg = (base + "x" * 200)[:120] + assert len(msg) == 120 + s = compute_reply_plan( + msg, + background_voice=None, settings=_fake_settings(), ) assert s.mode == ReplyLengthMode.expanded @@ -76,24 +76,27 @@ def test_strategy_long_with_new_detail_expanded() -> None: def test_strategy_boundary_len_20_brief_len_21_standard() -> None: - a = compute_reply_length_strategy( - 20, likely_new_detail=False, likely_chit_chat=False, settings=_fake_settings() + a = compute_reply_plan( + "x" * 20, + background_voice=None, + settings=_fake_settings(), ) - b = compute_reply_length_strategy( - 21, likely_new_detail=False, likely_chit_chat=False, settings=_fake_settings() + b = compute_reply_plan( + "x" * 21, + background_voice=None, + settings=_fake_settings(), ) assert a.mode == ReplyLengthMode.brief assert b.mode == ReplyLengthMode.standard def test_bump_standard_only_for_cadre_military() -> None: - s0 = compute_reply_length_strategy( - 50, - likely_new_detail=False, - likely_chit_chat=False, + s0 = compute_reply_plan( + "x" * 50, + background_voice=None, settings=_fake_settings(), ) - bumped = bump_reply_length_strategy_for_background_voice( + bumped = bump_reply_plan_for_background_voice( s0, background_voice="cadre", settings=_fake_settings( @@ -104,16 +107,15 @@ def test_bump_standard_only_for_cadre_military() -> None: assert bumped.max_tokens == s0.max_tokens + 40 assert bumped.max_chars_per_segment == s0.max_chars_per_segment + 40 - brief = compute_reply_length_strategy( - 5, - likely_new_detail=False, - likely_chit_chat=False, + brief = compute_reply_plan( + "x" * 5, + background_voice=None, settings=_fake_settings( chat_interview_cadre_military_standard_extra_tokens=40, chat_interview_cadre_military_standard_extra_chars=40, ), ) - same = bump_reply_length_strategy_for_background_voice( + same = bump_reply_plan_for_background_voice( brief, background_voice="military", settings=_fake_settings( @@ -149,13 +151,14 @@ def test_plan_long_chit_stays_standard_not_expanded() -> None: def test_strategy_boundary_len_79_standard_len_80_long_branch() -> None: - a = compute_reply_length_strategy( - 79, likely_new_detail=False, likely_chit_chat=False, settings=_fake_settings() + a = compute_reply_plan( + "x" * 79, + background_voice=None, + settings=_fake_settings(), ) - b = compute_reply_length_strategy( - 80, - likely_new_detail=False, - likely_chit_chat=False, + b = compute_reply_plan( + "x" * 80, + background_voice=None, settings=_fake_settings(), ) assert a.mode == ReplyLengthMode.standard diff --git a/api/tests/test_occupation_context.py b/api/tests/test_occupation_context.py new file mode 100644 index 0000000..6c3a1ba --- /dev/null +++ b/api/tests/test_occupation_context.py @@ -0,0 +1,21 @@ +"""default 路径职业提示(与 cadre/military 专属块正交)。""" + +from app.agents.chat.occupation_context import ( + get_occupation_chat_hint, + get_occupation_narrative_hint, +) + + +def test_chat_hint_only_default_with_occupation() -> None: + t = get_occupation_chat_hint("教师", "default") + assert "教师" in t + assert get_occupation_chat_hint("教师", "cadre") == "" + assert get_occupation_chat_hint("教师", "military") == "" + assert get_occupation_chat_hint(None, "default") == "" + assert get_occupation_chat_hint(" ", "default") == "" + + +def test_narrative_hint_only_default_with_occupation() -> None: + t = get_occupation_narrative_hint("工程师", "default") + assert "工程师" in t and "禁止" in t + assert get_occupation_narrative_hint("工程师", "military") == "" diff --git a/api/tests/test_oral_normalize.py b/api/tests/test_oral_normalize.py index 73ddcea..8ddcbf1 100644 --- a/api/tests/test_oral_normalize.py +++ b/api/tests/test_oral_normalize.py @@ -2,24 +2,22 @@ from unittest.mock import patch -from app.features.memoir.oral_normalize import ( - apply_oral_normalization_rules, - normalize_oral_for_memoir, -) +from app.core.text_normalize import apply_oral_rules +from app.features.memoir.oral_normalize import normalize_oral_for_memoir def test_apply_rules_mei_kanshang_wo() -> None: - assert "没看上我" in apply_oral_normalization_rules("我去试镜了 美看上我 张伟") + assert "没看上我" in apply_oral_rules("我去试镜了 美看上我 张伟") def test_apply_rules_mei_kanshang_ni() -> None: - assert apply_oral_normalization_rules("美看上你") == "没看上你" + assert apply_oral_rules("美看上你") == "没看上你" def test_apply_rules_no_false_positive_rong() -> None: """「美容」等不应被误替换。""" s = "我去了解美容项目" - assert apply_oral_normalization_rules(s) == s + assert apply_oral_rules(s) == s def test_normalize_respects_global_off() -> None: