diff --git a/api/app/agents/chat/agent_turn.py b/api/app/agents/chat/agent_turn.py index dfa4769..2c44706 100644 --- a/api/app/agents/chat/agent_turn.py +++ b/api/app/agents/chat/agent_turn.py @@ -13,3 +13,4 @@ class AgentChatTurn: messages: List[str] skip_tts: bool = False memory_retrieval_trace: dict[str, Any] | None = None + interview_state_meta: dict[str, Any] | None = None diff --git a/api/app/agents/chat/interview_agent.py b/api/app/agents/chat/interview_agent.py index 3e3e3b8..066c1ca 100644 --- a/api/app/agents/chat/interview_agent.py +++ b/api/app/agents/chat/interview_agent.py @@ -10,6 +10,11 @@ from langchain_core.messages import HumanMessage, SystemMessage from app.agents.chat.agent_turn import AgentChatTurn from app.agents.chat.helpers import format_history_string, get_history_with_window +from app.agents.chat.interview_state_hints import ( + apply_duplicate_question_guard, + extract_recent_questions, + update_recent_questions, +) from app.agents.chat.personas import normalize_interview_persona from app.agents.chat.prompt_context import ChatPromptContext from app.agents.chat.prompts_conversation import ( @@ -103,7 +108,7 @@ class InterviewAgent: text_for_model = self._resolve_text_for_model( user_message, normalized_user_message ) - empty_slots = memoir_state.empty_slots_for_current_stage() + empty_slots = memoir_state.prompt_empty_slots_for_current_stage() filled_slots = { key: value.snippet for key, value in memoir_state.slots.get( @@ -120,6 +125,7 @@ class InterviewAgent: max_pairs=settings.chat_history_max_pairs, max_chars=settings.chat_history_max_chars, ) + recent_questions = extract_recent_questions(hw.window) conversation_turn_total = hw.turn_total all_stages_coverage = memoir_state.all_stages_coverage() persona = normalize_interview_persona(settings.chat_interview_persona) @@ -140,6 +146,9 @@ class InterviewAgent: occupation=occupation, profile_birth_year=profile_birth_year, profile_era_place=profile_era_place, + 
known_facts=memoir_state.known_facts, + persona_threads=memoir_state.persona_threads, + recent_questions=recent_questions or memoir_state.recent_questions, ) system_prompt = ctx.guided_system_prompt() messages: List[Any] = [SystemMessage(content=system_prompt)] @@ -204,6 +213,15 @@ class InterviewAgent: if not out: out = [response_text.strip()[:max_chars]] out = nonempty_segments_or_fallback(out, fallback=_FALLBACK_REPLY) + out, deduped = apply_duplicate_question_guard( + out, + state=memoir_state, + recent_questions=recent_questions or memoir_state.recent_questions, + ) + updated_recent_questions = update_recent_questions( + recent_questions or memoir_state.recent_questions, + out, + ) log_agent_summary( logger, "InterviewAgent.generate_response segments={} conversation_id={} " @@ -212,7 +230,14 @@ class InterviewAgent: conversation_id, max_tokens, ) - return AgentChatTurn(messages=out, skip_tts=False) + return AgentChatTurn( + messages=out, + skip_tts=False, + interview_state_meta={ + "recent_questions": updated_recent_questions, + "duplicate_question_guard_triggered": deduped, + }, + ) except Exception as e: logger.error("生成回应失败: {}", e, exc_info=True) return AgentChatTurn(messages=[_FALLBACK_REPLY], skip_tts=True) @@ -231,7 +256,7 @@ class InterviewAgent: if not self.llm: return ["你好呀~ 又见面了,今天有没有哪段回忆或近况想聊聊?"] try: - empty_slots = memoir_state.empty_slots_for_current_stage() + empty_slots = memoir_state.prompt_empty_slots_for_current_stage() empty_slots_readable = [SLOT_NAME_MAP.get(s, s) for s in empty_slots] persona = normalize_interview_persona(settings.chat_interview_persona) prompt = get_opening_prompt( diff --git a/api/app/agents/chat/interview_state_hints.py b/api/app/agents/chat/interview_state_hints.py new file mode 100644 index 0000000..5ba98f2 --- /dev/null +++ b/api/app/agents/chat/interview_state_hints.py @@ -0,0 +1,327 @@ +"""Interview quality helpers: known facts, persona threads, and anti-repeat guard.""" + +from __future__ import annotations + 
+import re +from collections.abc import Iterable + +from langchain_core.messages import AIMessage, BaseMessage, HumanMessage + +from app.agents.stage_constants import STAGE_DISPLAY_ZH, STAGE_SLOT_KEYS +from app.agents.state_schema import KnownFact, MemoirStateSchema, PersonaThread + +_QUESTION_SPLIT_RE = re.compile(r"[??]+") +_SENTENCE_SPLIT_RE = re.compile(r"(?<=[。!?!?])") +_PUNCT_RE = re.compile(r"[\s,。!?;:、“”‘’()()《》【】\[\],.!?:;\"'`~·…-]+") # square brackets escaped with a single backslash so the character class is not closed early + +_TRAIT_HINTS: tuple[tuple[str, tuple[str, ...]], ...] = ( + ("执着坚持", ("坚持", "执着", "咬牙", "熬过", "顶住", "训练", "反复")), + ("规划目标感", ("计划", "规划", "目标", "打算", "一步步", "安排", "准备")), + ("求真较真", ("弄明白", "搞清楚", "想通", "为什么", "较真", "求证")), + ("行动力", ("决定", "创业", "开始做", "尝试", "报名", "跑去", "去做")), + ("家庭责任感", ("家里", "父母", "妈妈", "爸爸", "妻子", "丈夫", "孩子", "照顾", "支持")), + ("即时反馈驱动", ("反馈", "看到结果", "成就感", "立刻", "马上见效")), + ("自由天性", ("自由", "无拘无束", "满世界跑", "疯玩", "野", "不管")), + ("动手创造", ("自己动手", "搭", "做", "造", "修", "拆", "烤", "生火", "种")), + ("重感情念旧", ("想起来", "怀念", "舍不得", "还记得", "那时候", "小时候")), + ("好胜争先", ("比赛", "赢", "比", "第一", "不服输", "较劲")), +) + +_SCENE_CUE_WORDS: tuple[tuple[str, str], ...] 
= ( + ("田野", "田野的泥土和青草气息"), + ("河里", "河水的凉意"), + ("海边", "海风和咸咸的空气"), + ("溜冰", "冰面上咔嚓咔嚓的声响"), + ("游泳", "一头扎进水里的畅快"), + ("烤红薯", "红薯外焦里糯、掰开冒热气的香味"), + ("烤", "火堆噼啪响、烟气里混着食物焦香"), + ("打水漂", "石片在水面跳跃、一圈一圈涟漪散开"), + ("捉", "追着跑、手心攥紧怕跑掉的紧张"), + ("雪", "雪花落在脸上化成水珠的凉"), + ("风", "风灌进领子里的感觉"), + ("下雨", "雨点打在屋顶上的声音"), + ("自行车", "骑车下坡风呼呼吹过耳朵"), + ("火车", "绿皮车厢里混着泡面和橘子皮的味道"), + ("学校", "教室里粉笔灰飘在阳光里的样子"), + ("考试", "翻卷子时纸张沙沙响"), + ("工厂", "机器轰鸣、油污和铁锈的气味"), + ("做饭", "锅铲碰锅底的声响、油花溅起来的滋滋声"), +) + + +def extract_scene_cues(user_message: str) -> list[str]: + msg = (user_message or "").strip() + if not msg: + return [] + cues: list[str] = [] + for keyword, description in _SCENE_CUE_WORDS: + if keyword in msg: + cues.append(description) + return cues[:3] + +_SLOT_REPEAT_PATTERNS: dict[str, tuple[str, ...]] = { + "place": ("哪里长大", "家乡", "老家", "在哪长大", "什么地方长大"), + "people": ("谁对你影响", "家里都有谁", "小时候和谁", "身边有什么人"), + "daily_life": ("平时怎么过", "日常都做什么", "小时候都玩什么"), + "emotion": ("那时候什么感觉", "当时什么感受", "小时候开心吗"), + "turning_event": ("印象最深的事", "难忘的事", "转折"), + "school": ("什么学校", "在哪上学", "读的什么学校", "上什么学校"), + "city": ("在哪个城市", "去了哪里读书", "在哪读书"), + "motivation": ("为什么想学", "为什么选这个", "动力是什么"), + "challenge": ("遇到什么困难", "最大的难处", "辛苦吗"), + "change": ("后来有什么变化", "这件事怎么改变你", "之后有什么不同"), + "job": ("做什么工作", "具体做什么", "工作内容是什么"), + "environment": ("工作环境", "在哪工作", "什么单位"), + "decision": ("为什么做这个决定", "怎么决定的"), + "pressure": ("压力大吗", "最难的时候", "最大的压力"), + "growth": ("学到了什么", "成长在哪里", "后来有什么提升"), + "relationship": ("和家人关系怎么样", "和伴侣关系怎么样"), + "conflict": ("有什么矛盾", "怎么吵起来的", "冲突"), + "support": ("谁支持你", "谁帮过你", "怎么支持你的"), + "responsibility": ("承担什么责任", "家里靠谁", "你主要负责什么"), + "value": ("你最看重什么", "信念是什么", "原则是什么"), + "regret": ("最大的遗憾", "后悔过吗"), + "pride": ("最骄傲的事", "最自豪的事"), + "lesson": ("学到了什么道理", "最大的感悟", "给你什么启发"), +} + + +def _normalize_text(text: str) -> str: + return _PUNCT_RE.sub("", (text or "").strip().lower()) + + +def _dedupe_keep_last(items: Iterable[str], *, limit: int) -> list[str]: + out: list[str] = [] + seen: 
set[str] = set() + for raw in reversed([str(x).strip() for x in items if str(x).strip()]): + key = _normalize_text(raw) + if not key or key in seen: + continue + seen.add(key) + out.append(raw) + if len(out) >= limit: + break + out.reverse() + return out + + +def _merge_known_facts( + existing: Iterable[KnownFact], + additions: Iterable[KnownFact], + *, + limit: int = 24, +) -> list[KnownFact]: + merged: dict[tuple[str, str, str], KnownFact] = {} + for item in list(existing) + list(additions): + key = ( + (item.stage or "").strip(), + (item.slot_name or "").strip(), + _normalize_text(f"{item.label}:{item.value}"), + ) + if not key[2]: + continue + merged[key] = item + values = list(merged.values())[-limit:] + return values + + +def _merge_persona_threads( + existing: Iterable[PersonaThread], + additions: Iterable[PersonaThread], + *, + limit: int = 12, +) -> list[PersonaThread]: + merged: dict[tuple[str, str], PersonaThread] = {} + for item in list(existing) + list(additions): + key = (_normalize_text(item.trait), _normalize_text(item.evidence)) + if not key[0]: + continue + merged[key] = item + values = list(merged.values())[-limit:] + return values + + +def _trim_sentence(text: str, *, limit: int = 80) -> str: + s = re.sub(r"\s+", " ", (text or "").strip()) + if len(s) <= limit: + return s + return s[: limit - 1].rstrip() + "…" + + +def build_runtime_interview_state( + state: MemoirStateSchema, + *, + user_message: str, + active_stage: str, + birth_year: int | None = None, + birth_place: str = "", + grew_up_place: str = "", + occupation: str = "", +) -> MemoirStateSchema: + """Merge current-turn hints into a prompt-only state view.""" + additions: list[KnownFact] = [] + if birth_year: + additions.append( + KnownFact( + label="出生年份", + value=f"{birth_year}年", + source="profile", + ) + ) + if birth_place: + additions.append( + KnownFact( + label="出生地", + value=birth_place.strip(), + source="profile", + stage="childhood", + slot_name="place", + ) + ) + if 
grew_up_place: + additions.append( + KnownFact( + label="成长地", + value=grew_up_place.strip(), + source="profile", + stage="childhood", + slot_name="place", + ) + ) + if occupation: + additions.append( + KnownFact( + label="职业背景", + value=occupation.strip(), + source="profile", + stage="career", + slot_name="job", + ) + ) + + msg = _trim_sentence(user_message, limit=120) + if msg: + additions.append( + KnownFact( + label="本轮新信息", + value=msg, + source="current_turn", + stage=active_stage, + ) + ) + + persona_additions: list[PersonaThread] = [] + haystack = " ".join( + [msg] + + [fact.value for fact in state.known_facts[-8:]] + + list(state.filled_slots_for_stage(active_stage).values())[:4] + ) + for trait, markers in _TRAIT_HINTS: + for marker in markers: + if marker and marker in haystack: + persona_additions.append( + PersonaThread( + trait=trait, + evidence=_trim_sentence(marker if marker in msg else haystack, limit=70), + source="heuristic", + stage=active_stage, + ) + ) + break + + return state.model_copy( + update={ + "known_facts": _merge_known_facts(state.known_facts, additions), + "persona_threads": _merge_persona_threads( + state.persona_threads, persona_additions + ), + } + ) + + +def extract_recent_questions(messages: Iterable[BaseMessage], *, limit: int = 4) -> list[str]: + questions: list[str] = [] + for msg in messages: + if not isinstance(msg, AIMessage): + continue + text = str(getattr(msg, "content", "") or "").strip() + if not text: + continue + for part in _QUESTION_SPLIT_RE.split(text)[:-1]: # the last piece follows the final question mark, so it is never a question + part = part.strip() + if not part: + continue + if any(w in text for w in ("?", "?")): + questions.append(_trim_sentence(part + "?", limit=50)) + return _dedupe_keep_last(questions, limit=limit) + + +def update_recent_questions( + existing: Iterable[str], + generated_segments: Iterable[str], + *, + limit: int = 4, +) -> list[str]: + fresh: list[str] = list(existing) + for seg in generated_segments: + text = str(seg or "").strip() + if not text or ("?" 
not in text and "?" not in text): + continue + parts = [p.strip() for p in _QUESTION_SPLIT_RE.split(text) if p.strip()] + if not parts: + continue + fresh.append(_trim_sentence(parts[-1] + "?", limit=50)) + return _dedupe_keep_last(fresh, limit=limit) + + +def apply_duplicate_question_guard( + segments: Iterable[str], + *, + state: MemoirStateSchema, + recent_questions: Iterable[str], +) -> tuple[list[str], bool]: + """Downgrade obvious repeated-fact questions into acknowledgment-only text.""" + recent_norms = {_normalize_text(q) for q in recent_questions if _normalize_text(q)} + known_patterns: list[str] = [] + for fact in state.known_facts: + slot_patterns = _SLOT_REPEAT_PATTERNS.get(fact.slot_name or "", ()) + known_patterns.extend(slot_patterns) + if fact.label == "本轮新信息": + known_patterns.append(fact.value) + cleaned: list[str] = [] + touched = False + for seg in segments: + text = str(seg or "").strip() + if not text: + continue + text_norm = _normalize_text(text) + repeated = False + if ("?" in text or "?" in text) and text_norm: + if any(q and (q in text_norm or text_norm in q) for q in recent_norms): + repeated = True + if not repeated: + for pattern in known_patterns: + pat_norm = _normalize_text(pattern) + if pat_norm and pat_norm in text_norm: + repeated = True + break + if repeated: + sentences = [s.strip() for s in _SENTENCE_SPLIT_RE.split(text) if s.strip()] + kept = [s for s in sentences if "?" not in s and "?" 
not in s] + replacement = kept[0] if kept else "这一段我记住了。" + if not replacement.endswith(("。", "!", "…")): + replacement += "。" + cleaned.append(replacement) + touched = True + else: + cleaned.append(text) + if not cleaned: + cleaned = ["这一段我记住了。"] + return cleaned, touched + + +def stage_slot_hint_lines(stage: str) -> list[str]: + keys = STAGE_SLOT_KEYS.get(stage, ()) + stage_zh = STAGE_DISPLAY_ZH.get(stage, stage) + return [f"{stage_zh}:{key}" for key in keys] + diff --git a/api/app/agents/chat/orchestrator.py b/api/app/agents/chat/orchestrator.py index 7f14ad8..371f2a5 100644 --- a/api/app/agents/chat/orchestrator.py +++ b/api/app/agents/chat/orchestrator.py @@ -12,6 +12,10 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.agents.chat.agent_turn import AgentChatTurn from app.agents.chat.helpers import get_history_with_window from app.agents.chat.interview_agent import InterviewAgent +from app.agents.chat.interview_state_hints import ( + build_runtime_interview_state, + extract_scene_cues, +) from app.agents.chat.profile_agent import ProfileAgent from app.agents.chat.stage_detection import ( detect_primary_life_stage, @@ -23,7 +27,11 @@ from app.core.config import settings from app.core.dependencies import get_llm_provider from app.core.logging import get_logger from app.features.conversation.input_normalize import normalize_chat_input_for_agent -from app.features.memoir.state_service import get_or_create_state, switch_stage +from app.features.memoir.state_service import ( + get_or_create_state, + save_interview_state_meta, + switch_stage, +) def _llm_for_chat_input_normalize(): @@ -275,6 +283,13 @@ class ChatOrchestrator: memory_evidence_text, mem_trace = await _fetch_interview_memory_evidence( db, user_id, normalized_user_message ) + scene_cues = extract_scene_cues(normalized_user_message) + if scene_cues: + cue_block = "\n".join(f"- {c}" for c in scene_cues) + scene_hint = ( + f"\n\n[场景氛围提示——可借用这些感官细节自然接话,不要原样抄]\n{cue_block}" + ) + 
memory_evidence_text = (memory_evidence_text or "") + scene_hint profile_birth_year = user.birth_year if user else None profile_era_place = "" @@ -282,11 +297,20 @@ class ChatOrchestrator: profile_era_place = ( (user.birth_place or user.grew_up_place or "").strip() ) + prompt_state = build_runtime_interview_state( + state, + user_message=normalized_user_message, + active_stage=detected or state.current_stage, + birth_year=profile_birth_year, + birth_place=(user.birth_place or "").strip() if user else "", + grew_up_place=(user.grew_up_place or "").strip() if user else "", + occupation=occupation, + ) turn = await self.interview_agent.generate_response_with_state( conversation_id=conversation_id, user_message=user_message, - memoir_state=state, + memoir_state=prompt_state, user_profile_context=user_profile_context, detected_user_stage=detected, memory_evidence_text=memory_evidence_text, @@ -296,6 +320,20 @@ class ChatOrchestrator: profile_birth_year=profile_birth_year, profile_era_place=profile_era_place, ) + recent_questions = prompt_state.recent_questions + if turn.interview_state_meta and isinstance(turn.interview_state_meta, dict): + raw_recent = turn.interview_state_meta.get("recent_questions") + if isinstance(raw_recent, list): + recent_questions = [ + str(x).strip() for x in raw_recent if str(x).strip() + ] + await save_interview_state_meta( + user_id, + known_facts=prompt_state.known_facts, + persona_threads=prompt_state.persona_threads, + recent_questions=recent_questions, + db=db, + ) if agent_summary_enabled(): logger.info( "ChatOrchestrator.process_user_message route=interview " @@ -311,6 +349,7 @@ class ChatOrchestrator: messages=turn.messages, skip_tts=turn.skip_tts, memory_retrieval_trace=mem_trace, + interview_state_meta=turn.interview_state_meta, ) return turn diff --git a/api/app/agents/chat/output_rules.py b/api/app/agents/chat/output_rules.py index f9c0877..0ed0f5e 100644 --- a/api/app/agents/chat/output_rules.py +++ 
b/api/app/agents/chat/output_rules.py @@ -1,4 +1,4 @@ -"""共用用户可见回复禁令(访谈 / 资料收集)。""" +"""共用用户可见回复禁令与文风(访谈 / 资料收集 / 所有面向用户的 Agent)。""" def chat_output_rules() -> str: @@ -14,4 +14,15 @@ def chat_output_rules() -> str: ) -__all__ = ["chat_output_rules"] +def chat_voice_style() -> str: + """所有面向用户的 Agent 共用的文风指引。""" + return ( + "语气像好朋友微信聊天:自然、温暖、偶尔俏皮。" + "接话时允许带一点画面感或感官细节(一两句即可,不要堆砌)。" + "用对方刚说的**那个具体细节**回应,不要写成泛泛的总结。" + "不要用总结腔('听起来你的童年很快乐'),要用对话腔('那种……的感觉,现在想起来都觉得……')。" + "追问优先顺着对方刚说的具体细节往里走一层,不要跳到泛泛的新问题。" + ) + + +__all__ = ["chat_output_rules", "chat_voice_style"] diff --git a/api/app/agents/chat/personas.py b/api/app/agents/chat/personas.py index b54407a..3e7a913 100644 --- a/api/app/agents/chat/personas.py +++ b/api/app/agents/chat/personas.py @@ -21,18 +21,24 @@ def normalize_interview_persona(raw: str | None) -> str: def get_interview_persona_tone_hint(persona: str) -> str: - """一句访谈性格提示,融入主 system prompt;default 返回空串。""" + """访谈性格提示,融入主 system prompt。""" key = normalize_interview_persona(persona) if key == "default": - return "" + return ( + "语气像好朋友微信聊天:自然、温暖、偶尔俏皮;" + "接话时允许带一点画面感或感官细节(一两句即可,不要堆砌);" + "追问优先顺着对方刚说的**具体细节**往里走一层,不要跳到泛泛的新问题。" + ) if key == "warm_listener": return ( "偏倾听与承接,语气柔和、少打断;不审问感,一次最多一个具体问题。" "对方愿意展开时,可温和多问一层意义或影响。" + "接话时允许带一点画面感或感官细节(一两句即可),让对方觉得你真的在跟着想象。" ) return ( "爱把人往一个具体细节里带;事实清楚后可追问对自我认知或后来选择的影响;" "短句像微信,一次最多一个具体问题,不重复上文已清楚的事。" + "允许用一两句场景感的短描写承接对方画面,不要只用干巴巴的确认句。" ) diff --git a/api/app/agents/chat/prompt_context.py b/api/app/agents/chat/prompt_context.py index 481cd7c..35b63b5 100644 --- a/api/app/agents/chat/prompt_context.py +++ b/api/app/agents/chat/prompt_context.py @@ -5,6 +5,8 @@ from __future__ import annotations from dataclasses import dataclass from typing import Dict, List, Optional +from app.agents.state_schema import KnownFact, PersonaThread + @dataclass class ChatPromptContext: @@ -22,6 +24,9 @@ class ChatPromptContext: occupation: str = "" profile_birth_year: int | None = None profile_era_place: str = "" + 
known_facts: List[KnownFact] | None = None + persona_threads: List[PersonaThread] | None = None + recent_questions: List[str] | None = None def guided_system_prompt(self) -> str: """用户原话仅以对话历史 + HumanMessage 注入模型。""" @@ -40,4 +45,7 @@ class ChatPromptContext: occupation=self.occupation, profile_birth_year=self.profile_birth_year, profile_era_place=self.profile_era_place, + known_facts=self.known_facts or [], + persona_threads=self.persona_threads or [], + recent_questions=self.recent_questions or [], ) diff --git a/api/app/agents/chat/prompts_conversation.py b/api/app/agents/chat/prompts_conversation.py index 6b31e7c..41dfcb7 100644 --- a/api/app/agents/chat/prompts_conversation.py +++ b/api/app/agents/chat/prompts_conversation.py @@ -1,5 +1,5 @@ """ -对话 Agent 提示词模板(精简:事实块 + 行为指引,由模型自行判断追问/长度/闲聊)。 +对话 Agent 提示词模板(场景化承接 + 细节深挖 + 人物串联)。 """ from typing import Dict, List, Optional @@ -14,6 +14,7 @@ from app.agents.chat.personas import ( get_interview_persona_tone_hint, normalize_interview_persona, ) +from app.agents.state_schema import KnownFact, PersonaThread from app.agents.stage_constants import CHAT_STAGES, STAGE_DISPLAY_ZH, STAGE_ERA_HINTS from app.core.config import settings @@ -50,7 +51,6 @@ def _compact_era_hint( birth_year: int | None = None, era_place: str = "", ) -> str: - """单行时代联想,可选附在进度后。出生年与地点由调用方从用户资料结构化传入。""" if not birth_year: return "" @@ -113,8 +113,9 @@ def get_opening_prompt( f"## 当前建议话题({stage_name})\n可以从中选一个来问:{topics_str}" ) task_question = ( - "2. 接着问一个**具体、好回答**的问题,引导用户开始分享;" + "2. 
接着问一个**具体、好回答、有画面感**的问题,引导用户开始分享;" "优先落在上述还未聊透的方向上。不要问太宽泛的「有什么想聊的」。" + "好问题举例:「说到童年,你脑海里最先蹦出来的是哪个画面?」" ) else: topics_heading = ( @@ -141,7 +142,7 @@ def get_opening_prompt( else: opening_style_rules = ( "## 风格\n" - "- 像微信短聊:口语、自然;可轻快但不要排比和长段文学描写。\n" + "- 像微信短聊:口语、自然、温暖;可轻快,允许带一点画面感,但不要排比和长段文学描写。\n" ) profile_lines: List[str] = [] @@ -164,7 +165,7 @@ def get_opening_prompt( opening_head = ( "你是「岁月知己」。用户刚进对话,**还没说话**,请你先开口。" - "**短、像微信**,一两句问候 + 一个具体问题即可,不要排比、不要文学描写。\n\n" + "像老朋友打招呼,两三句问候 + 一个有画面感的具体问题即可,不要排比、不要长段文学描写。\n\n" ) if bv_open != "default": opening_head = ( @@ -191,7 +192,7 @@ def get_opening_prompt( ## 任务 1. 简短问候。 {task_question} -3. 自然、温暖,但**字数要少**。 +3. 自然、温暖。 {era_opening_line} ## 格式 - 可用 [SPLIT] 分成最多 2 条;或一条里「问候 + 问题」。 @@ -214,6 +215,9 @@ def get_guided_conversation_prompt( occupation: str = "", profile_birth_year: Optional[int] = None, profile_era_place: str = "", + known_facts: list[KnownFact] | None = None, + persona_threads: list[PersonaThread] | None = None, + recent_questions: list[str] | None = None, ) -> str: """生成状态感知的对话提示词;用户原话仅以 HumanMessage 传入,不写入本 system 文本。""" persona_key = normalize_interview_persona(persona) @@ -241,8 +245,8 @@ def get_guided_conversation_prompt( for key, value in filled_slots.items(): readable_key = SLOT_NAME_MAP.get(key, key) filled_info.append( - f"{readable_key}: {value[:50]}..." - if len(value) > 50 + f"{readable_key}: {value[:80]}..." 
+ if len(value) > 80 else f"{readable_key}: {value}" ) filled_slots_str = "\n".join(filled_info) if filled_info else "刚开始聊" @@ -293,6 +297,41 @@ def get_guided_conversation_prompt( if user_info_parts: user_info_section = "## 用户信息\n" + "\n".join(user_info_parts) + "\n\n" + known_fact_lines: list[str] = [] + for fact in (known_facts or [])[-10:]: + line = fact.prompt_line().strip() + if line: + known_fact_lines.append(f"- {line}") + known_fact_section = "" + if known_fact_lines: + known_fact_section = ( + "## 已确认事实(这些已知,不要再回头确认)\n" + + "\n".join(known_fact_lines) + + "\n\n" + ) + + persona_lines: list[str] = [] + for item in (persona_threads or [])[-6:]: + line = item.prompt_line().strip() + if line: + persona_lines.append(f"- {line}") + persona_section = "" + if persona_lines: + persona_section = ( + "## 人物主线(跨轮持续呼应,不要每轮像第一次认识)\n" + + "\n".join(persona_lines) + + "\n\n" + ) + + recent_question_lines = [str(x).strip() for x in (recent_questions or [])[-4:] if str(x).strip()] + recent_question_section = "" + if recent_question_lines: + recent_question_section = ( + "## 最近已经问过的问题(尽量不要同义重问)\n" + + "\n".join(f"- {x}" for x in recent_question_lines) + + "\n\n" + ) + memory_section = "" mem_trim = (memory_evidence_text or "").strip() if mem_trim: @@ -307,28 +346,47 @@ def get_guided_conversation_prompt( progress_block = f"## 进度\n{progress_str}\n" if progress_str else "" era_block = f"## 时代与氛围参考\n{era_line}\n" if era_line else "" - return f"""你是「岁月知己」,像老朋友陪用户聊人生。短句为主,像微信聊天。{tone_line} + output_rules = chat_output_rules() + + return f"""你是「岁月知己」,像最懂我的老朋友。{tone_line} {topic_desc} -{user_info_section}## 当前对话状态 +{user_info_section}{known_fact_section}{persona_section}{recent_question_section}## 当前对话状态 已聊: {filled_slots_str} 还可聊的方向:{empty_slots_str} -{progress_block}{era_block}{memory_section}## 你要做的 -- **先接住对方**——一句真诚回应,不要写成总结或讲评。 -- **共情与轻量自我表露**:在接住的基础上,可用**一两句极短**的第一人称情绪承接(不展开成故事),**不得**编造具体时间、地点、人物与事件等你不知道的细节。 -- 
**意义向深挖(看准时机)**:当对方已讲出较具体的情节、人或选择时,可温和多问一层——当时怎么看这件事、后来有没有反过来影响性格或抉择;与「还可聊的方向」并存时,优先用这类意义问题**补缺口**,而非机械换话题。**情绪仍浓时**只承接、不深问。 -- 你自己判断该追问还是只承接:有新线头就顺着问一个具体的事;情绪浓就好好接住、不必急着追问;明显闲聊就陪聊;用户只说「嗯」「对」则结合上文承接或换个角度。 -- 可泛泛接话以承接氛围或感受,但不可编造具体人名、时间、事件等你不知道的细节。 -- 不要重复上一轮问过的事;用户跳到别的人生阶段,跟着聊,别硬拉回。 -- 追问与承接服务于人生故事素材,但不要让对方觉得在走审问式流程;**最多**抛一个具体问题,也可以不追问。 -- 可用 [SPLIT] 分成**最多 2 条**消息。 +{progress_block}{era_block}{memory_section}## 回复策略(按顺序执行,每步都要做到) -## 不要做的 -{chat_output_rules()} +### 第一步:先接住——让对方觉得你真的听进去了 +- 用对方刚说的**那个具体细节**回应,不要写成泛泛的"听起来很好"。 +- 好的接法:借用对方话里的意象往下走一步,例如对方说"烤红薯",你可以说"那种外面焦焦的、掰开冒热气的感觉"。 +- 允许一两句带画面感或感官细节的短描写(声音、气味、温度、触感),但不要编造对方没说的具体事实。 +- 不要用总结腔("听起来你的童年很快乐")或采访腔("我注意到"),要用**对话腔**("那种…的感觉,现在想起来都觉得…")。 + +### 第二步:再深挖——顺着这个细节往里走,不要跳到新话题 +- 追问要从对方**刚说的那个画面里**长出来,而不是跳到一个泛泛的新问题。 +- **好的追问**举例:"你们烤红薯的时候是在田埂边生火吗?""那时候带头的是谁?""后来再也没那样烤过吗?" +- **差的追问**举例:"你们还玩什么?""你印象最深的是什么?""那时候开心吗?"——这些太泛,任何人都能回答。 +- 如果对方情绪正浓(激动、感慨、哽咽),只接住,不提问。 +- 不要一次问两个问题;**最多一个**,也可以不问,只承接。 + +### 第三步:串联——把这轮和之前的记忆连起来 +- 若「已确认事实」或上文里已经有答案,不要再确认,直接用。 +- 若「人物主线」有线索,尝试自然接上(例如:"你之前说训练的时候也是这股劲儿")。 +- 不要每轮都像第一次见面。 + +## 绝对不要做的 +- 不要重复上一轮或「最近已经问过的问题」里的事。 +- 不要把用户没说的具体人名、时间、地点当事实说出来。 +- 不要用 Markdown、括号旁白、策略说明。 +- 不要连发多个问题。 +- 不要用"我注意到""我想了解""你觉得呢"这类采访模板。 +- {output_rules} +- 用户跳到别的人生阶段,跟着聊,别硬拉回。 +- 可用 [SPLIT] 分成**最多 2 条**消息。 直接输出(仅自然口语,无 Markdown,无任何括号前缀或旁白):""" diff --git a/api/app/agents/chat/prompts_profile.py b/api/app/agents/chat/prompts_profile.py index d81e23f..b175938 100644 --- a/api/app/agents/chat/prompts_profile.py +++ b/api/app/agents/chat/prompts_profile.py @@ -4,7 +4,7 @@ from typing import Dict, List, Optional -from app.agents.chat.output_rules import chat_output_rules +from app.agents.chat.output_rules import chat_output_rules, chat_voice_style PROFILE_FIELD_NAMES = { "birth_year": "出生年份", @@ -22,12 +22,14 @@ def get_profile_greeting_prompt(missing_fields: List[str], nickname: str = "") - missing_str = "、".join(missing_names) name_part = f",{nickname}" if nickname else "" - return 
f"""你是「岁月知己」,一位温暖真诚的人生故事访谈者。你正在和用户初次见面{name_part}。 + return f"""你是「岁月知己」,像最懂我的老朋友。你正在和用户初次见面{name_part}。 + +{chat_voice_style()} 在正式聊人生故事之前,你需要先了解一些基本信息。还需要了解的信息有:{missing_str}。 ## 你的任务 -用自然、亲切的方式,像老朋友聊天一样,向用户询问这些基础信息。 +用自然、亲切的方式,像老朋友聊天一样,向用户询问这些基础信息。如果用户已经开始讲回忆,先接住他的故事,再自然地穿插资料问题。 ## 规则 1. 不要一次问所有问题,每次只问 1-2 个 @@ -42,7 +44,6 @@ def get_profile_greeting_prompt(missing_fields: List[str], nickname: str = "") - ## 回复格式 - 如果内容较多,可以用 [SPLIT] 分隔成多条消息 -- 像微信聊天一样自然 直接输出你要说的话:""" @@ -104,17 +105,21 @@ def get_profile_followup_prompt( if interview_stage_hint else "问一个与**用户刚才关注点**或人生故事相关的**具体、好回答**的问题作为开场。" ) - return f"""你是「岁月知己」。用户的基本信息已经收集完毕: + return f"""你是「岁月知己」,像最懂我的老朋友。用户的基本信息已经收集完毕: {filled_str} -用户本轮消息在对话末尾。请对用户的回答做出温暖的回应,然后自然地过渡到人生故事的访谈。 +{chat_voice_style()} + +用户本轮消息在对话末尾。先接住用户刚说的那个细节(带一点画面感),然后自然地过渡到人生故事的访谈。 过渡语自拟,勿机械套话;{stage_hint} **不要**默认只问童年,除非用户刚才聊的正是童年。 回复格式:多条消息用 [SPLIT] 分隔。 直接输出你要说的话:""" - return f"""你是「岁月知己」,正在和用户聊天收集基本信息。 + return f"""你是「岁月知己」,像最懂我的老朋友。你正在和用户聊天,同时自然地了解一些基本信息。 + +{chat_voice_style()} ## 已知信息(严禁再次询问以下任何一项) {filled_str} @@ -125,8 +130,8 @@ def get_profile_followup_prompt( 用户本轮原话在历史里(末尾 HumanMessage),勿在脑中丢开。 ## 你怎么说 -1. **先接住**:对用户说的内容做自然回应,像朋友在听。 -2. **话题优先**:若用户正在讲一段故事、回忆或情绪,**优先**顺着问一个与**当前话题**相关的具体小问题;不要为凑字段打断叙事。 +1. **先接住**:用对方刚说的那个具体细节回应,带一点画面感,像朋友在跟着想象。不要写成泛泛的"听起来很好"。 +2. **话题优先**:若用户正在讲一段故事、回忆或情绪,**优先**顺着那个画面往里走一层;不要为凑字段打断叙事。 3. **资料穿插**:仅当用户本轮主要在确认、闲聊或话题与缺失资料完全无关时,再在末尾**温和插入 0~1 个**「还需要了解」里的问题。 4. **轮换**:若上一轮你已就某一类资料追问过(见历史里助手发言),本轮**不要再问同一类**;改问其他缺失项,或本轮只承接、不提资料。 5. 
每次最多 **1~2 个**资料相关问点;能用推断就不要重复确认已知地/年。 diff --git a/api/app/agents/memoir/orchestrator.py b/api/app/agents/memoir/orchestrator.py index d6be64b..7afa8a7 100644 --- a/api/app/agents/memoir/orchestrator.py +++ b/api/app/agents/memoir/orchestrator.py @@ -76,7 +76,7 @@ class MemoirOrchestrator: segment_chapter_category: Dict[str, str] = {} classify_extract_llm = llm_fast if llm_fast is not None else llm - # 仅 MEMOIR_PHASE1_BATCH_LLM_ENABLED=true 时走批处理;关则与旧版一致逐段(含多段一批) + # batch 路径为默认主路径(需 LLM + 开关),失败自动回退逐段 use_batch = ( bool(segments) and classify_extract_llm is not None @@ -84,12 +84,17 @@ class MemoirOrchestrator: ) if use_batch: try: - return self._prepare_batches_via_batch_llm( + result = self._prepare_batches_via_batch_llm( segments=segments, state=state, classify_extract_llm=classify_extract_llm, update_slot=update_slot, ) + logger.info( + "event=phase1_batch_path_used segment_count={}", + len(segments), + ) + return result except Exception as e: logger.warning( "MemoirOrchestrator.prepare_batches batch LLM 失败,回退逐段: {}", diff --git a/api/app/agents/state_schema.py b/api/app/agents/state_schema.py index c997000..7f7c81b 100644 --- a/api/app/agents/state_schema.py +++ b/api/app/agents/state_schema.py @@ -18,6 +18,36 @@ class SlotData(BaseModel): segment_ids: List[str] = Field(default_factory=list) +class KnownFact(BaseModel): + """会话级已知事实:供 prompt 明确声明“不要再问这些”。""" + + label: str + value: str + source: str = "" + stage: str = "" + slot_name: str | None = None + + def prompt_line(self) -> str: + prefix = f"{self.label}:".strip(":") + if prefix: + return f"{prefix} {self.value}".strip() + return self.value.strip() + + +class PersonaThread(BaseModel): + """跨轮人物主线:用于持续呼应用户的稳定特质与动机。""" + + trait: str + evidence: str = "" + source: str = "" + stage: str = "" + + def prompt_line(self) -> str: + if self.evidence: + return f"{self.trait}(依据:{self.evidence})" + return self.trait + + class MemoirStateSchema(BaseModel): """回忆录状态""" @@ -25,6 +55,9 @@ class 
MemoirStateSchema(BaseModel): current_stage: str covered_stages: List[str] slots: Dict[str, Dict[str, SlotData]] + known_facts: List[KnownFact] = Field(default_factory=list) + persona_threads: List[PersonaThread] = Field(default_factory=list) + recent_questions: List[str] = Field(default_factory=list) def empty_slots_for_current_stage(self) -> List[str]: stage_slots = self.slots.get(self.current_stage, {}) @@ -34,6 +67,18 @@ class MemoirStateSchema(BaseModel): empty_keys.append(key) return empty_keys + def prompt_empty_slots_for_stage(self, stage: str) -> List[str]: + """生成 prompt 时可追问的槽位,排除已被 known_facts 覆盖的方向。""" + blocked = { + fact.slot_name + for fact in self.known_facts + if fact.slot_name and (not fact.stage or fact.stage == stage) + } + return [key for key in self.empty_slots_for_stage(stage) if key not in blocked] + + def prompt_empty_slots_for_current_stage(self) -> List[str]: + return self.prompt_empty_slots_for_stage(self.current_stage) + def empty_slots_for_stage(self, stage: str) -> List[str]: """获取指定阶段的空槽位""" stage_slots = self.slots.get(stage, {}) @@ -61,6 +106,33 @@ class MemoirStateSchema(BaseModel): } return coverage + def prompt_known_fact_lines(self, *, limit: int = 10) -> List[str]: + xs: List[str] = [] + for fact in self.known_facts[-limit:]: + line = fact.prompt_line().strip() + if line: + xs.append(line) + return xs + + def prompt_persona_thread_lines(self, *, limit: int = 6) -> List[str]: + xs: List[str] = [] + for item in self.persona_threads[-limit:]: + line = item.prompt_line().strip() + if line: + xs.append(line) + return xs + + def prompt_recent_question_lines(self, *, limit: int = 4) -> List[str]: + out: List[str] = [] + seen: set[str] = set() + for item in self.recent_questions[-limit:]: + s = str(item).strip() + if not s or s in seen: + continue + seen.add(s) + out.append(s) + return out + # 与 stage_constants.CHAT_STAGES 同一顺序;list() 避免与元组共享可变别名 DEFAULT_STAGE_ORDER: list[str] = list(CHAT_STAGES) diff --git 
a/api/app/core/celery_broker_dev.py b/api/app/core/celery_broker_dev.py new file mode 100644 index 0000000..4afba0a --- /dev/null +++ b/api/app/core/celery_broker_dev.py @@ -0,0 +1,65 @@ +"""开发环境:可选在 API 启动时清空 Celery broker 队列(queue_purge,不 FLUSH 整库)。""" + +from __future__ import annotations + +import asyncio +from typing import TYPE_CHECKING + +from app.core.config import settings +from app.core.logging import get_logger + +if TYPE_CHECKING: + from redis.asyncio import Redis + +logger = get_logger(__name__) + +_PURGE_GATE_KEY = "life_echo:celery_dev_broker_purge_gate" +_PURGE_GATE_TTL_SEC = 25 + + +def _is_production_environment() -> bool: + env = (settings.app_environment or "").strip().lower() + return env in ("production", "prod") + + +def _purge_celery_queues_sync() -> int: + from app.tasks.celery_app import celery_app + + names = set(celery_app.amqp.queues.keys()) + if not names: + return 0 + total = 0 + with celery_app.connection_for_write() as conn: + channel = conn.default_channel + for q in sorted(names): + try: + n = channel.queue_purge(q) + total += int(n or 0) + except conn.channel_errors: + logger.debug("Celery queue_purge 跳过 queue={}", q) + return total + + +async def maybe_purge_celery_broker_on_startup(redis_client: Redis) -> None: + """在已连接的 Redis 上抢门闩后清空已知任务队列;生产环境永不执行。""" + if _is_production_environment(): + return + if not settings.celery_purge_broker_on_startup: + return + got = await redis_client.set( + _PURGE_GATE_KEY, + "1", + nx=True, + ex=_PURGE_GATE_TTL_SEC, + ) + if not got: + logger.debug("Celery broker 清空跳过(短时门闩已由其他进程设置)") + return + try: + purged = await asyncio.to_thread(_purge_celery_queues_sync) + if purged: + logger.info("开发环境已清空 Celery broker 队列(移除 {} 条消息)", purged) + else: + logger.info("开发环境 Celery broker 无积压消息(已尝试 purge)") + except Exception as e: + logger.warning("清空 Celery broker 队列失败(可忽略): {}", e) diff --git a/api/app/core/config.py b/api/app/core/config.py index c0a8c66..6050cc2 100644 --- a/api/app/core/config.py +++ 
b/api/app/core/config.py @@ -9,7 +9,7 @@ Docker / 服务端由镜像与 compose 注入进程环境;此处仅固定读 import secrets -from pydantic import Field, field_validator +from pydantic import AliasChoices, Field, field_validator from pydantic_settings import BaseSettings, SettingsConfigDict @@ -34,6 +34,15 @@ class Settings(BaseSettings): redis_url: str = "redis://localhost:6379/0" redis_session_ttl: int = 86400 + # ── Runtime / Celery 开发体验 ───────────────────────────── + # APP_ENV:本地默认 development;Docker 生产栈请设为 production + app_environment: str = Field( + default="development", + validation_alias=AliasChoices("APP_ENV", "APP_ENVIRONMENT"), + ) + # 非 production 且为 True 时,在 main/internal_main 连接 Redis 后清空 Celery 队列(不 FLUSHDB,不影响会话键) + celery_purge_broker_on_startup: bool = False + # ── Auth / JWT ──────────────────────────────────────────── secret_key: str = Field(default_factory=lambda: secrets.token_urlsafe(32)) algorithm: str = "HS256" @@ -57,10 +66,10 @@ class Settings(BaseSettings): embedding_model: str = "embedding-3" # ── Chat 访谈(token 上限 + 代码截断,见 reply_limits)── - chat_interview_max_tokens: int = 380 + chat_interview_max_tokens: int = 512 chat_interview_max_segments: int = 2 - chat_interview_max_chars_per_segment: int = 260 - chat_opening_max_tokens: int = 256 + chat_interview_max_chars_per_segment: int = 380 + chat_opening_max_tokens: int = 380 chat_profile_followup_max_tokens: int = 280 # Redis 全量历史仅用于 turn 计数;注入 LLM 时截取最近若干轮与字符预算 chat_history_max_pairs: int = Field(default=15, ge=1, le=500) @@ -99,7 +108,7 @@ class Settings(BaseSettings): chat_profile_max_turns: int = Field(default=8, ge=1, le=500) # Memoir Phase1:多 segment 一批一次 LLM 完成抽取+章节分类(失败回退逐段);单段且关时仍逐段 - memoir_phase1_batch_llm_enabled: bool = False + memoir_phase1_batch_llm_enabled: bool = True memoir_phase1_batch_llm_max_tokens: int = Field(default=4096, ge=512, le=32_768) # Memoir agents:`invoke_json_object` / `llm_json_call` 的 max_tokens(原硬编码迁至配置) memoir_extraction_max_tokens: int = Field(default=1024, ge=64, le=8192) @@ 
-177,6 +186,15 @@ class Settings(BaseSettings): celery_log_level: str = "" httpx_log_level: str = "" + @field_validator("celery_purge_broker_on_startup", mode="before") + @classmethod + def _coerce_celery_purge_broker_on_startup(cls, v: object) -> bool: + if isinstance(v, bool): + return v + if v is None: + return False + return str(v).strip().lower() in ("1", "true", "yes", "on") + @field_validator("log_agent_verbose", mode="before") @classmethod def _coerce_log_agent_verbose(cls, v: object) -> bool: @@ -276,6 +294,9 @@ class Settings(BaseSettings): memoir_recompose_retry_on_lock_contention: bool = True # Phase2 立即派发使用固定 task_id,减少同类目重复入队(超时任务仍用独立 id) memoir_phase2_singleflight_immediate: bool = True + # True:Phase2 首稿后异步运行质量增强(fidelity recheck、标题润色、LLM 归一) + memoir_quality_pass_enabled: bool = True + memoir_quality_pass_delay_seconds: int = Field(default=5, ge=0, le=300) # ── Memory 检索与富化 ───────────────────────────────────── # True:query 为空时仍返回 rolling 摘要 + 最近事实/时间线(无 chunk 向量检索) @@ -326,6 +347,14 @@ class Settings(BaseSettings): eval_judge_base_url: str = "https://open.bigmodel.cn/api/paas/v4" eval_judge_model: str = "glm-5" eval_judge_temperature: float = 0.3 + # 评测评审:DeepSeek(OpenAI 兼容);默认 deepseek-reasoner 即官网 R1 + eval_judge_deepseek_model: str = "deepseek-reasoner" + eval_judge_deepseek_context_window_tokens: int = Field( + default=64_000, + ge=4096, + le=2_000_000, + description="DeepSeek 评审专用上下文预算(用于 transcript 截断;与 GLM 200K 分离)", + ) # GLM-5 输入上下文 200K(https://docs.bigmodel.cn) eval_judge_context_window_tokens: int = Field( default=200_000, ge=4096, le=2_000_000 diff --git a/api/app/core/dependencies.py b/api/app/core/dependencies.py index 779ed69..b8fa9d9 100644 --- a/api/app/core/dependencies.py +++ b/api/app/core/dependencies.py @@ -5,8 +5,11 @@ - Port DI factory:get_sms_sender / get_llm_provider / get_tts_provider / ... 
""" +from dataclasses import dataclass from functools import lru_cache -from typing import Optional +from typing import Any, Literal, Optional + +EvalJudgeProvider = Literal["zhipu", "deepseek"] from fastapi import Depends, HTTPException, status from fastapi.security import OAuth2PasswordBearer @@ -185,23 +188,84 @@ async def get_current_user( return user -def get_eval_judge_langchain_llm(): - """智谱 GLM-5(OpenAI 兼容 Chat Completions)用于评审 JSON;与访谈生产模型分离。""" - from langchain_openai import ChatOpenAI - - api_key = (settings.eval_judge_api_key or settings.zhipu_api_key or "").strip() - if not api_key: - return None - base = (settings.eval_judge_base_url or "").rstrip("/") +def _normalize_openai_compatible_base_url(raw: str, *, fallback: str) -> str: + base = (raw or "").strip().rstrip("/") or fallback for suffix in ("/v1/chat/completions", "/v1"): if base.endswith(suffix): base = base[: -len(suffix)] - return ChatOpenAI( - api_key=api_key, - base_url=base or "https://open.bigmodel.cn/api/paas/v4", - model=settings.eval_judge_model or "glm-5", - temperature=settings.eval_judge_temperature, + return base + + +@dataclass(slots=True, frozen=True) +class EvalJudgeLlmSpec: + """评测台评审:OpenAI 兼容 Chat Completions 的一条装配结果。""" + + llm: Any | None + provider: EvalJudgeProvider + resolved_model: str + context_window_tokens: int + + +def build_eval_judge_llm_spec( + provider: EvalJudgeProvider = "zhipu", + judge_model: str | None = None, +) -> EvalJudgeLlmSpec | None: + """按供应商装配 ChatOpenAI;密钥缺失时返回 None(llm 为 None)。""" + from langchain_openai import ChatOpenAI + + want = (judge_model or "").strip() + if provider == "deepseek": + api_key = (settings.deepseek_api_key or settings.llm_api_key or "").strip() + if not api_key: + return None + base = _normalize_openai_compatible_base_url( + settings.deepseek_base_url, + fallback="https://api.deepseek.com", + ) + model = want or ( + settings.eval_judge_deepseek_model or "deepseek-reasoner" + ) + ctx = 
int(settings.eval_judge_deepseek_context_window_tokens) + return EvalJudgeLlmSpec( + llm=ChatOpenAI( + api_key=api_key, + base_url=base, + model=model, + temperature=settings.eval_judge_temperature, + ), + provider="deepseek", + resolved_model=model, + context_window_tokens=ctx, + ) + + api_key = ( + settings.eval_judge_api_key or settings.zhipu_api_key or "" + ).strip() + if not api_key: + return None + base = _normalize_openai_compatible_base_url( + settings.eval_judge_base_url, + fallback="https://open.bigmodel.cn/api/paas/v4", ) + model = want or (settings.eval_judge_model or "glm-5") + ctx = int(settings.eval_judge_context_window_tokens) + return EvalJudgeLlmSpec( + llm=ChatOpenAI( + api_key=api_key, + base_url=base, + model=model, + temperature=settings.eval_judge_temperature, + ), + provider="zhipu", + resolved_model=model, + context_window_tokens=ctx, + ) + + +def get_eval_judge_langchain_llm(): + """兼容:等价于智谱供应商下的 `build_eval_judge_llm_spec(\"zhipu\", None).llm`。""" + spec = build_eval_judge_llm_spec("zhipu", None) + return spec.llm if spec else None async def get_optional_user( diff --git a/api/app/features/evaluation/admin_service.py b/api/app/features/evaluation/admin_service.py index a58130c..10fd3fa 100644 --- a/api/app/features/evaluation/admin_service.py +++ b/api/app/features/evaluation/admin_service.py @@ -1,281 +1,21 @@ -"""内部评测 REST 编排:事务与业务规则;数据访问经 repo。""" +"""内部评测:仅保留 Playground fixtures 列表等轻量能力。""" from __future__ import annotations -import json -from dataclasses import dataclass -from typing import Any - from sqlalchemy.ext.asyncio import AsyncSession -from app.features.evaluation import repo as eval_repo -from app.features.evaluation.errors import ( - EvaluationBadRequestError, - EvaluationNotFoundError, -) -from app.features.evaluation.importers.script_json import parse_script_json -from app.features.evaluation.importers.user_export_markdown import ( - extract_user_utterances_from_export_md, -) -from app.features.evaluation.models import ( 
- EvalCase, - EvalExperiment, - EvalGateVerdict, - EvalRegressionSet, - EvalRun, - EvalRunTurn, - EvalVersion, -) -from app.features.evaluation.presenters import run_out -from app.features.evaluation.schemas import ( - CaseCreate, - ExperimentCreate, - ImportJsonCaseBody, - ImportMarkdownBody, - RegressionSetCreate, - SessionEvalRunItem, - SessionEvalRunsOut, - SnapshotFromConversationBody, - VersionCreate, -) -from app.features.evaluation.session_catalog_service import SessionCatalogService from app.features.evaluation.user_export_fixtures import ( list_user_export_fixture_names as list_user_export_md_filenames, ) from app.features.evaluation.user_export_fixtures import ( read_user_export_fixture, ) -from app.tasks.evaluation_tasks import run_eval_experiment_task - - -@dataclass(frozen=True) -class ExperimentDetailBundle: - experiment: EvalExperiment - run_rows: list[tuple[EvalRun, list[EvalRunTurn]]] - gate: EvalGateVerdict | None class EvaluationAdminService: def __init__(self, db: AsyncSession) -> None: self._db = db - async def list_regression_sets(self) -> list[EvalRegressionSet]: - return await eval_repo.list_regression_sets(self._db) - - async def create_regression_set( - self, body: RegressionSetCreate - ) -> EvalRegressionSet: - row = await eval_repo.create_regression_set( - self._db, name=body.name, description=body.description - ) - await self._db.commit() - await self._db.refresh(row) - return row - - async def list_cases(self, set_id: str) -> list[EvalCase]: - parent = await eval_repo.get_regression_set(self._db, set_id) - if not parent: - raise EvaluationNotFoundError("regression set not found") - return await eval_repo.list_cases(self._db, set_id) - - async def create_case(self, set_id: str, body: CaseCreate) -> EvalCase: - parent = await eval_repo.get_regression_set(self._db, set_id) - if not parent: - raise EvaluationNotFoundError("regression set not found") - row = await eval_repo.create_case( - self._db, - regression_set_id=set_id, - 
user_utterances=body.user_utterances, - title=body.title, - source_conversation_id=body.source_conversation_id, - source_user_id=body.source_user_id, - reference_memoir_markdown=body.reference_memoir_markdown, - is_protected=body.is_protected, - meta=body.meta, - ) - await self._db.commit() - await self._db.refresh(row) - return row - - async def snapshot_from_conversation( - self, - set_id: str, - conversation_id: str, - body: SnapshotFromConversationBody, - ) -> EvalCase: - parent = await eval_repo.get_regression_set(self._db, set_id) - if not parent: - raise EvaluationNotFoundError("regression set not found") - catalog = SessionCatalogService(self._db) - tr = await catalog.get_transcript(conversation_id) - if not tr: - raise EvaluationNotFoundError("conversation not found") - utterances = ( - tr.user_utterances_from_messages - if body.use_messages - else tr.user_utterances_from_segments - ) - if not utterances: - raise EvaluationBadRequestError("no user utterances in session") - row = await eval_repo.create_case( - self._db, - regression_set_id=set_id, - user_utterances=utterances, - title=body.title, - source_conversation_id=conversation_id, - source_user_id=tr.user_id, - is_protected=body.is_protected, - meta={"source": "conversation_snapshot", "use_messages": body.use_messages}, - ) - await self._db.commit() - await self._db.refresh(row) - return row - - async def import_markdown_case( - self, set_id: str, body: ImportMarkdownBody - ) -> EvalCase: - parent = await eval_repo.get_regression_set(self._db, set_id) - if not parent: - raise EvaluationNotFoundError("regression set not found") - utterances = extract_user_utterances_from_export_md(body.markdown) - if not utterances: - raise EvaluationBadRequestError("no user lines parsed from markdown") - row = await eval_repo.create_case( - self._db, - regression_set_id=set_id, - user_utterances=utterances, - title=body.title, - is_protected=body.is_protected, - meta={"source": "markdown_import"}, - ) - await 
self._db.commit() - await self._db.refresh(row) - return row - - async def import_json_case(self, body: ImportJsonCaseBody) -> EvalCase: - parent = await eval_repo.get_regression_set(self._db, body.regression_set_id) - if not parent: - raise EvaluationNotFoundError("regression set not found") - meta_extra: dict[str, Any] - if body.utterances: - utt = [str(u).strip() for u in body.utterances if str(u).strip()] - meta_extra = {} - elif body.raw_json is not None: - raw = body.raw_json - payload_str = json.dumps(raw, ensure_ascii=False) - utt, meta_extra = parse_script_json(payload_str) - else: - raise EvaluationBadRequestError("utterances or raw_json required") - if not utt: - raise EvaluationBadRequestError("empty utterances") - row = await eval_repo.create_case( - self._db, - regression_set_id=body.regression_set_id, - user_utterances=utt, - title=body.title, - is_protected=body.is_protected, - meta={"source": "json_import", **meta_extra}, - ) - await self._db.commit() - await self._db.refresh(row) - return row - - async def list_versions(self) -> list[EvalVersion]: - return await eval_repo.list_versions(self._db) - - async def create_version(self, body: VersionCreate) -> EvalVersion: - row = await eval_repo.create_version( - self._db, - name=body.name, - runner_kind=body.runner_kind, - config_json=body.config_json, - ) - await self._db.commit() - await self._db.refresh(row) - return row - - async def list_experiments(self, *, limit: int) -> list[EvalExperiment]: - return await eval_repo.list_experiments(self._db, limit=limit) - - async def list_session_evaluation_runs( - self, conversation_id: str - ) -> SessionEvalRunsOut: - rows = await eval_repo.list_runs_for_source_conversation( - self._db, source_conversation_id=conversation_id - ) - items: list[SessionEvalRunItem] = [] - for run, _case, exp in rows: - turns = await eval_repo.list_turns(self._db, run.id) - items.append( - SessionEvalRunItem( - experiment_name=exp.name, - run=run_out(run, turns), - ) - ) - 
return SessionEvalRunsOut(conversation_id=conversation_id, items=items) - - async def create_experiment(self, body: ExperimentCreate) -> EvalExperiment: - rs = await eval_repo.get_regression_set(self._db, body.regression_set_id) - if not rs: - raise EvaluationNotFoundError("regression set not found") - bv = await eval_repo.get_version(self._db, body.baseline_version_id) - cv = await eval_repo.get_version(self._db, body.candidate_version_id) - if not bv or not cv: - raise EvaluationNotFoundError("version not found") - row = await eval_repo.create_experiment( - self._db, - name=body.name, - regression_set_id=body.regression_set_id, - baseline_version_id=body.baseline_version_id, - candidate_version_id=body.candidate_version_id, - rubric_pack=body.rubric_pack, - composite_weights_json=body.composite_weights_json, - ) - await self._db.commit() - await self._db.refresh(row) - return row - - async def get_experiment_detail(self, experiment_id: str) -> ExperimentDetailBundle: - exp = await eval_repo.get_experiment(self._db, experiment_id) - if not exp: - raise EvaluationNotFoundError("experiment not found") - runs = await eval_repo.list_runs_for_experiment(self._db, experiment_id) - run_rows: list[tuple[EvalRun, list[EvalRunTurn]]] = [] - for r in runs: - turns = await eval_repo.list_turns(self._db, r.id) - run_rows.append((r, turns)) - gv = await eval_repo.get_gate_verdict(self._db, experiment_id) - return ExperimentDetailBundle(experiment=exp, run_rows=run_rows, gate=gv) - - async def enqueue_experiment_run(self, experiment_id: str) -> EvalExperiment: - exp = await eval_repo.get_experiment(self._db, experiment_id) - if not exp: - raise EvaluationNotFoundError("experiment not found") - run_eval_experiment_task.delay(experiment_id) - await self._db.refresh(exp) - return exp - - async def experiment_stream_snapshot( - self, experiment_id: str - ) -> dict[str, Any] | None: - from app.features.evaluation.schemas import GateVerdictOut - - exp = await 
eval_repo.get_experiment(self._db, experiment_id) - if not exp: - return None - runs = await eval_repo.list_runs_for_experiment(self._db, experiment_id) - run_payload = [] - for r in runs: - turns = await eval_repo.list_turns(self._db, r.id) - run_payload.append(run_out(r, turns).model_dump()) - gv = await eval_repo.get_gate_verdict(self._db, experiment_id) - return { - "experiment_id": experiment_id, - "status": exp.status, - "runs": run_payload, - "gate": GateVerdictOut.model_validate(gv).model_dump() if gv else None, - } - def list_user_export_fixture_names(self) -> list[str]: return list_user_export_md_filenames() diff --git a/api/app/features/evaluation/candidate_runner.py b/api/app/features/evaluation/candidate_runner.py deleted file mode 100644 index 4cba4e4..0000000 --- a/api/app/features/evaluation/candidate_runner.py +++ /dev/null @@ -1,83 +0,0 @@ -"""独立候选回放:多轮 user 链式调用 LLM,不走路由 WS / ChatOrchestrator。""" - -from __future__ import annotations - -import time -from typing import Any - -from app.core.logging import get_logger - -logger = get_logger(__name__) - - -def _system_prompt_for_eval(version_config: dict | None) -> str: - cfg = version_config or {} - extra = (cfg.get("system_prompt_suffix") or "").strip() - base = """你是「岁月留书」老年友好访谈员。语气温暖、耐心,先承接情绪再 gently 追问事实与感受;回答简洁分段,避免术语。""" - if extra: - return f"{base}\n\n{extra}" - return base - - -def _model_override(version_config: dict | None) -> str | None: - if not version_config: - return None - m = version_config.get("model") - return str(m).strip() if m else None - - -class EvalCandidateRunner: - """使用 LangChain Chat 模型回放用户轮次。""" - - def __init__(self, llm: Any) -> None: - self._llm = llm - - async def replay_utterances( - self, - utterances: list[str], - *, - version_config: dict | None = None, - temperature: float = 0.7, - ) -> tuple[list[str], list[int]]: - """返回每轮 assistant 回复与耗时 ms。""" - if not self._llm: - raise RuntimeError(" replay: llm 未配置") - from langchain_core.messages import AIMessage, 
HumanMessage, SystemMessage - - sys_prompt = _system_prompt_for_eval(version_config) - model = _model_override(version_config) - lc_messages: list = [SystemMessage(content=sys_prompt)] - replies: list[str] = [] - latencies: list[int] = [] - bound = ( - self._llm.bind(model=model, temperature=temperature) - if model - else self._llm.bind(temperature=temperature) - ) - for u in utterances: - text = (u or "").strip() - if not text: - continue - lc_messages.append(HumanMessage(content=text)) - t0 = time.perf_counter() - result = await bound.ainvoke(lc_messages) - elapsed_ms = int((time.perf_counter() - t0) * 1000) - reply = str(getattr(result, "content", "") or "").strip() - replies.append(reply) - latencies.append(elapsed_ms) - lc_messages.append(AIMessage(content=reply)) - return replies, latencies - - -def simple_memoir_from_transcript(utterances: list[str], replies: list[str]) -> str: - """轻量成稿:供评审用占位(非生产叙事管线)。""" - lines = ["# 访谈摘录整理(评测占位稿)", ""] - for i, u in enumerate(utterances): - lines.append(f"## 片段 {i + 1}") - lines.append("") - lines.append(f"**用户:** {u.strip()}") - if i < len(replies): - lines.append("") - lines.append(f"**访谈者:** {replies[i].strip()}") - lines.append("") - return "\n".join(lines) diff --git a/api/app/features/evaluation/composite_score.py b/api/app/features/evaluation/composite_score.py new file mode 100644 index 0000000..d8a61f0 --- /dev/null +++ b/api/app/features/evaluation/composite_score.py @@ -0,0 +1,34 @@ +"""评测合成记分(与批量实验 Celery 解耦后的纯函数,供单测保留)。""" + +from __future__ import annotations + +from typing import Any + + +def composite_score( + conv: float | None, + mem: float | None, + weights: dict[str, Any] | None, +) -> float | None: + """合成总分;缺失的一侧不计为 0,避免把评审失败误标为极差。 + + 仅一侧有分:返回该侧原始分(不乘权重),表示当前 run 仅完成了部分评审维度。 + """ + w = weights or {} + wc = float(w.get("conversation", 0.5)) + wm = float(w.get("memoir", 0.5)) + has_c = conv is not None + has_m = mem is not None + if not has_c and not has_m: + return None + if has_c and has_m: + 
return float(wc) * float(conv) + float(wm) * float(mem) + if has_c: + return float(conv) + return float(mem) + + +# 兼容旧测试中的私有名 +_composite = composite_score + +__all__ = ["composite_score", "_composite"] diff --git a/api/app/features/evaluation/conversation_compare_summary.py b/api/app/features/evaluation/conversation_compare_summary.py new file mode 100644 index 0000000..766838b --- /dev/null +++ b/api/app/features/evaluation/conversation_compare_summary.py @@ -0,0 +1,176 @@ +"""Structured A/B compare summary for internal eval conversation judging.""" + +from __future__ import annotations + +from typing import Any + +from app.features.evaluation.judge_schemas import ConversationJudgeOutput + +_GROUP_KEYS: tuple[tuple[str, str], ...] = ( + ("emotion_score", "情绪与陪伴"), + ("information_score", "信息挖掘"), + ("persona_score", "人物建模"), + ("structure_score", "结构引导"), + ("question_score", "提问质量"), +) + +_LEAF_KEYS: tuple[tuple[str, str], ...] = ( + ("emotion_carry", "情绪承接"), + ("context_memory", "上下文记忆"), + ("rhythm_control", "节奏控制"), + ("persona_understanding", "人物理解"), + ("follow_up_depth", "追问深度"), + ("non_leading", "非引导性"), +) + +_REPEAT_ISSUE_MARKERS = ("重复盘问", "重复询问", "已答", "忽略上文", "同义重问") + + +def _round(x: float) -> float: + return round(float(x), 2) + + +def _issues_text(judge: ConversationJudgeOutput | None) -> list[str]: + if judge is None: + return [] + return [str(x).strip() for x in judge.major_issues if str(x).strip()] + + +def _has_repeat_issue(judge: ConversationJudgeOutput | None) -> bool: + return any( + marker in issue + for issue in _issues_text(judge) + for marker in _REPEAT_ISSUE_MARKERS + ) + + +def build_conversation_compare_summary( + *, + baseline_judge: ConversationJudgeOutput | None, + replay_judge: ConversationJudgeOutput | None, + baseline_transcript: str, + replay_transcript: str, + conv_cap: int, + compare_cap_each: int, + fixture_filename: str | None = None, +) -> dict[str, Any]: + truncation = { + "baseline_chars": len((baseline_transcript 
or "").strip()), + "replay_chars": len((replay_transcript or "").strip()), + "conversation_cap_chars": int(conv_cap), + "compare_cap_each_chars": int(compare_cap_each), + "baseline_truncated_for_conversation": len((baseline_transcript or "").strip()) + > int(conv_cap), + "replay_truncated_for_conversation": len((replay_transcript or "").strip()) + > int(conv_cap), + "baseline_truncated_for_compare": len((baseline_transcript or "").strip()) + > int(compare_cap_each), + "replay_truncated_for_compare": len((replay_transcript or "").strip()) + > int(compare_cap_each), + } + + if not replay_judge: + return { + "fixture_filename": fixture_filename, + "mode": "single", + "truncation": truncation, + "gate": { + "status": "insufficient_data", + "reasons": ["缺少回放整体评分,无法判断是否追平或超过 A。"], + }, + } + + if not baseline_judge: + return { + "fixture_filename": fixture_filename, + "mode": "single", + "replay_total": _round(replay_judge.total_score), + "truncation": truncation, + "gate": { + "status": "single_side_only", + "reasons": ["当前只有新对话单侧评分,可用于优化,但不能判定是否超过 A。"], + }, + } + + group_deltas = { + key: { + "label": label, + "baseline": _round(getattr(baseline_judge, key)), + "replay": _round(getattr(replay_judge, key)), + "delta": _round(getattr(replay_judge, key) - getattr(baseline_judge, key)), + } + for key, label in _GROUP_KEYS + } + leaf_deltas = { + key: { + "label": label, + "baseline": _round(getattr(baseline_judge, key)), + "replay": _round(getattr(replay_judge, key)), + "delta": _round(getattr(replay_judge, key) - getattr(baseline_judge, key)), + } + for key, label in _LEAF_KEYS + } + + key_regressions = [ + v["label"] for v in leaf_deltas.values() if float(v["delta"]) <= -0.75 + ] + key_gains = [v["label"] for v in leaf_deltas.values() if float(v["delta"]) >= 0.75] + total_delta = _round(replay_judge.total_score - baseline_judge.total_score) + has_repeat_regression = _has_repeat_issue(replay_judge) + parity_passed = ( + total_delta >= -1.0 + and 
float(leaf_deltas["context_memory"]["delta"]) >= -0.5 + and float(leaf_deltas["emotion_carry"]["delta"]) >= -0.5 + and not has_repeat_regression + ) + surpass_passed = ( + total_delta >= 1.5 + and float(leaf_deltas["context_memory"]["delta"]) >= 0 + and float(leaf_deltas["persona_understanding"]["delta"]) >= 0 + and float(leaf_deltas["rhythm_control"]["delta"]) >= -0.25 + and not has_repeat_regression + ) + if surpass_passed: + status = "surpass" + elif parity_passed: + status = "parity" + else: + status = "regressed" + + reasons: list[str] = [] + if total_delta >= 1.5: + reasons.append("总分已显著超过基线。") + elif total_delta >= -1.0: + reasons.append("总分已基本追平基线。") + else: + reasons.append("总分仍明显落后基线。") + if has_repeat_regression: + reasons.append("回放侧仍出现重复盘问或忽略已知信息的风险。") + if key_regressions: + reasons.append(f"关键回落维度:{'、'.join(key_regressions[:4])}。") + if key_gains: + reasons.append(f"关键提升维度:{'、'.join(key_gains[:4])}。") + if truncation["baseline_truncated_for_compare"] or truncation["replay_truncated_for_compare"]: + reasons.append("A/B 对比稿使用了截断 transcript,长对话结论需结合逐轮评分复核。") + + return { + "fixture_filename": fixture_filename, + "mode": "ab", + "baseline_total": _round(baseline_judge.total_score), + "replay_total": _round(replay_judge.total_score), + "total_delta": total_delta, + "group_deltas": group_deltas, + "leaf_deltas": leaf_deltas, + "key_regressions": key_regressions, + "key_gains": key_gains, + "repeat_issue_detected": has_repeat_regression, + "truncation": truncation, + "gate": { + "status": status, + "parity_passed": parity_passed, + "surpass_passed": surpass_passed, + "reasons": reasons, + "golden_set_note": "建议在固定黄金样本集上复跑该口径,再决定是否发布。", + }, + } + diff --git a/api/app/features/evaluation/execution_service.py b/api/app/features/evaluation/execution_service.py deleted file mode 100644 index b505e9c..0000000 --- a/api/app/features/evaluation/execution_service.py +++ /dev/null @@ -1,474 +0,0 @@ -"""执行单次评测 run 与整实验(供 Celery / 内联调试)。""" - -from __future__ import 
annotations - -from datetime import datetime, timezone -from typing import Any - -from sqlalchemy.ext.asyncio import AsyncSession - -from app.core.config import settings -from app.core.db import AsyncSessionLocal -from app.core.dependencies import get_eval_judge_langchain_llm, get_llm_provider -from app.core.logging import get_logger -from app.features.evaluation import repo as eval_repo -from app.features.evaluation.candidate_runner import ( - EvalCandidateRunner, - simple_memoir_from_transcript, -) -from app.features.evaluation.eval_trace_service import EvalTraceService -from app.features.evaluation.gate_report_service import gate_result_to_details -from app.features.evaluation.gating_service import compute_gate -from app.features.evaluation.judge_service import EvalJudgeService -from app.features.evaluation.models import EvalCase, EvalRun, EvalVersion -from app.features.evaluation.transcript_for_judge import ( - assistant_text_for_eval_display, - format_eval_turn_block, -) - -logger = get_logger(__name__) - -_MAX_JUDGE_MARKDOWN_CHARS = 20_000 -_MAX_EVAL_CHAPTERS = 30 -_MAX_EVAL_STORIES = 40 -_MAX_EVIDENCE_TRANSCRIPT_CHARS = 16_000 - - -def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) -> str: - s = (text or "").strip() - if len(s) <= max_chars: - return s - return f"{s[:max_chars]}\n\n…(已截断供评审)" - - -def _composite( - conv: float | None, mem: float | None, weights: dict[str, Any] | None -) -> float | None: - """合成总分;缺失的一侧不计为 0,避免把评审失败误标为极差。 - - 仅一侧有分:返回该侧原始分(不乘权重),表示当前 run 仅完成了部分评审维度。 - """ - w = weights or {} - wc = float(w.get("conversation", 0.5)) - wm = float(w.get("memoir", 0.5)) - has_c = conv is not None - has_m = mem is not None - if not has_c and not has_m: - return None - if has_c and has_m: - return float(wc) * float(conv) + float(wm) * float(mem) - if has_c: - return float(conv) - return float(mem) - - -def _utterances_for_case(case: EvalCase) -> list[str]: - raw = case.user_utterances or [] - return [str(u).strip() for u 
in raw if str(u).strip()] - - -def _trim_evidence_text(text: str, max_chars: int = _MAX_EVIDENCE_TRANSCRIPT_CHARS) -> str: - s = (text or "").strip() - if len(s) <= max_chars: - return s - return f"{s[:max_chars]}\n\n…(访谈证据已截断)" - - -async def execute_eval_run( - db: AsyncSession, - *, - run: EvalRun, - case: EvalCase, - version: EvalVersion, -) -> None: - fresh_run = await db.get(EvalRun, run.id) - if not fresh_run: - return - if (fresh_run.status or "").lower() == "completed": - logger.info("eval run skip already completed run_id={}", fresh_run.id) - return - run = fresh_run - - if not settings.eval_execution_enabled: - await eval_repo.update_run( - db, - run, - status="failed", - error_message="EVAL_EXECUTION_ENABLED=false", - completed_at=datetime.now(timezone.utc), - ) - return - - utterances = _utterances_for_case(case) - if not utterances: - await eval_repo.update_run( - db, - run, - status="failed", - error_message="empty user_utterances", - completed_at=datetime.now(timezone.utc), - ) - return - - await eval_repo.update_run( - db, - run, - status="running", - started_at=datetime.now(timezone.utc), - error_message=None, - ) - await db.commit() - - provider_llm = getattr(get_llm_provider(), "langchain_llm", None) - if provider_llm is None: - await eval_repo.update_run( - db, - run, - status="failed", - error_message="生产 LLM 未配置", - completed_at=datetime.now(timezone.utc), - ) - await db.commit() - return - - judge_llm = get_eval_judge_langchain_llm() - judge = EvalJudgeService(judge_llm) - runner = EvalCandidateRunner(provider_llm) - cfg = version.config_json if isinstance(version.config_json, dict) else None - - try: - replies, latencies = await runner.replay_utterances( - utterances, - version_config=cfg, - temperature=settings.eval_candidate_temperature, - ) - except Exception as e: - logger.exception("eval replay failed: {}", e) - await eval_repo.update_run( - db, - run, - status="failed", - error_message=str(e)[:2000], - 
completed_at=datetime.now(timezone.utc), - ) - await db.commit() - return - - transcript_parts: list[str] = [] - for i, u in enumerate(utterances): - if i >= len(replies): - break - transcript_parts.append( - format_eval_turn_block(i, u, assistant_text_for_eval_display(replies[i])) - ) - prior_blocks: list[str] = [] - for idx, u in enumerate(utterances): - if idx >= len(replies): - break - reply = assistant_text_for_eval_display(replies[idx]) - lat = latencies[idx] if idx < len(latencies) else None - prior = "\n\n".join(prior_blocks) - if len(prior) > 8000: - prior = prior[-8000:] - tj = await judge.judge_turn( - prior_transcript=prior, - user_utterance=u, - assistant_reply=reply, - turn_index_0=idx, - ) - scores = tj.model_dump() if tj else None - rationale = tj.rationale if tj else None - await eval_repo.add_turn( - db, - run_id=str(run.id), - turn_index=idx, - user_utterance=u, - assistant_reply=replies[idx], - duration_ms=lat, - judge_scores_json=scores, - judge_rationale=rationale, - ) - await db.commit() - prior_blocks.append(format_eval_turn_block(idx, u, reply)) - - full_transcript = "\n\n".join(transcript_parts) - conv_out = await judge.judge_conversation(full_transcript=full_transcript) - conv_total = conv_out.total_score if conv_out else None - - memoir_md = simple_memoir_from_transcript(utterances, replies) - source_transcript = _trim_evidence_text(full_transcript) - reference_memoir = (case.reference_memoir_markdown or "").strip() - synthetic_notes = ( - "本项为 replay 合成的短 memoir:证据闭包仅为重放对话 transcript(无 library artifact lineage)。" - f" turns={len(utterances)}" - ) - mem_out = await judge.judge_memoir( - memoir_markdown=memoir_md, - source_transcript=source_transcript, - structured_evidence=( - "(结构化记忆证据:自动化 replay 路径未绑定用户 memory chunk/fact/timeline/summary。)" - ), - reference_memoir_markdown=reference_memoir, - evidence_notes=synthetic_notes, - ) - - chapter_entries: list[dict[str, Any]] = [] - story_entries: list[dict[str, Any]] = [] - uid = 
(case.source_user_id or "").strip() - trace_svc = EvalTraceService(db) - - def _library_evidence_notes( - lineage_tier: str, - evidence_summary: str, - truncated: bool, - dropped: list[str], - ) -> str: - drops = ",".join(dropped[:12]) if dropped else "" - return ( - "library artifact 评审:以证据闭包为准;若 lineage 为 fallback 或不足须保守打分。" - f" lineage_tier={lineage_tier};summary={evidence_summary};" - f" prompt_truncated={truncated};dropped_sections={drops or 'none'}。" - " 单章节/单故事节选;跨篇上下文不足写入 insufficient_evidence。" - ) - - if uid: - from app.features.memoir.repo import get_chapters_for_memoir_list - from app.features.story.repo import get_stories_for_user - - try: - chapters = await get_chapters_for_memoir_list( - uid, db, active_only=True, is_new_only=None - ) - for ch in chapters[:_MAX_EVAL_CHAPTERS]: - body = (ch.canonical_markdown or "").strip() - if not body: - continue - md = f"# 章节:{ch.title}\n\n{_clip_md_for_judge(body)}" - cb = await trace_svc.build_chapter_bundle(uid, ch) - formatted, cb2 = await trace_svc.format_chapter_bundle(cb) - fm = formatted.format_meta - cj = await judge.judge_memoir( - memoir_markdown=md, - source_transcript=formatted.source_transcript, - structured_evidence=formatted.structured_evidence, - reference_memoir_markdown=reference_memoir, - evidence_notes=_library_evidence_notes( - cb2.lineage_tier, - formatted.evidence_summary, - fm.truncated, - fm.dropped_sections, - ), - ) - chapter_entries.append( - { - "id": ch.id, - "title": ch.title, - "order_index": ch.order_index, - "lineage_tier": cb2.lineage_tier, - "evidence_summary": formatted.evidence_summary, - "evidence_trace": cb2.model_dump(), - "format_meta": fm.model_dump(), - "judge": cj.model_dump() if cj else None, - } - ) - except Exception as e: - logger.warning("eval chapter judges skipped: {}", e) - - try: - stories = await get_stories_for_user(db, uid, status="active") - for st in stories[:_MAX_EVAL_STORIES]: - body = (st.canonical_markdown or "").strip() - if not body: - continue - 
md = f"# 故事:{st.title}\n\n{_clip_md_for_judge(body)}" - sb = await trace_svc.build_story_bundle(uid, str(st.id)) - formatted, sb2 = await trace_svc.format_story_bundle(sb) - fm = formatted.format_meta - sj = await judge.judge_memoir( - memoir_markdown=md, - source_transcript=formatted.source_transcript, - structured_evidence=formatted.structured_evidence, - reference_memoir_markdown=reference_memoir, - evidence_notes=_library_evidence_notes( - sb2.lineage_tier, - formatted.evidence_summary, - fm.truncated, - fm.dropped_sections, - ), - ) - story_entries.append( - { - "id": st.id, - "title": st.title, - "stage": st.stage, - "lineage_tier": sb2.lineage_tier, - "evidence_summary": formatted.evidence_summary, - "evidence_trace": sb2.model_dump(), - "format_meta": fm.model_dump(), - "judge": sj.model_dump() if sj else None, - } - ) - except Exception as e: - logger.warning("eval story judges skipped: {}", e) - - synth_scores: list[float] = [] - if mem_out is not None: - synth_scores.append(float(mem_out.total_score)) - - library_scores: list[float] = [] - for row in chapter_entries: - j = row.get("judge") - if isinstance(j, dict) and j.get("total_score") is not None: - library_scores.append(float(j["total_score"])) - for row in story_entries: - j = row.get("judge") - if isinstance(j, dict) and j.get("total_score") is not None: - library_scores.append(float(j["total_score"])) - - def _mean(xs: list[float]) -> float: - return sum(xs) / len(xs) if xs else 0.0 - - if synth_scores and library_scores: - mem_total = 0.5 * _mean(synth_scores) + 0.5 * _mean(library_scores) - elif synth_scores: - mem_total = _mean(synth_scores) - elif library_scores: - mem_total = _mean(library_scores) - else: - mem_total = None - - exp = await eval_repo.get_experiment(db, str(run.experiment_id)) - weights = ( - exp.composite_weights_json - if exp and isinstance(exp.composite_weights_json, dict) - else None - ) - comp = _composite(conv_total, mem_total, weights) - - bundle: dict[str, Any] = { - 
"conversation_judge": conv_out.model_dump() if conv_out else None, - "memoir_judge": mem_out.model_dump() if mem_out else None, - "synthetic_memoir_judge": mem_out.model_dump() if mem_out else None, - "chapters": chapter_entries, - "stories": story_entries, - "judge_meta": { - "conversation_judge_ok": conv_out is not None, - "memoir_synthetic_ok": mem_out is not None, - "memoir_synth_scores_n": len(synth_scores), - "memoir_library_scores_n": len(library_scores), - "synthetic_memoir_lineage_tier": "replay_transcript_only", - "synthetic_memoir_evidence_summary": ( - f"replay_turns={len(utterances)};structured_memory=unbound" - ), - "memoir_aggregate_rule": ( - "synthetic_memoir_judge_plus_library_memoir_judge_weighted_mean" - if synth_scores and library_scores - else ("synthetic_memoir_only" if synth_scores else "library_memoir_only") - ), - }, - } - await eval_repo.update_run( - db, - run, - status="completed", - memoir_markdown=memoir_md, - conversation_score_total=conv_total, - memoir_score_total=mem_total, - composite_score=comp, - judge_bundle_json=bundle, - completed_at=datetime.now(timezone.utc), - ) - await db.commit() - - -async def _finalize_experiment_gate(db: AsyncSession, experiment_id: str) -> None: - runs = await eval_repo.list_runs_for_experiment(db, experiment_id) - exp = await eval_repo.get_experiment(db, experiment_id) - if not exp: - return - cases = await eval_repo.list_cases(db, str(exp.regression_set_id)) - - incomplete = [r for r in runs if str(r.status) not in ("completed", "failed")] - if incomplete: - return - - failed = [r for r in runs if str(r.status) == "failed"] - if failed: - await eval_repo.update_experiment( - db, - exp, - status="failed", - error_message="部分 run 失败", - completed_at=datetime.now(timezone.utc), - ) - await db.commit() - return - - gr = compute_gate(cases=cases, runs=runs) - await eval_repo.upsert_gate_verdict( - db, - experiment_id=experiment_id, - passed=gr.passed, - mean_composite_delta=gr.mean_delta, - 
protected_regressions_json=gr.protected_regressions, - details_json=gate_result_to_details(gr), - ) - await eval_repo.update_experiment( - db, - exp, - status="completed", - completed_at=datetime.now(timezone.utc), - ) - await db.commit() - - -async def execute_experiment_full(experiment_id: str) -> None: - from app.core.redis_lock import acquire_redis_lock, release_redis_lock - - lock_key = f"lock:eval_experiment:{experiment_id}" - lock_handle = acquire_redis_lock(lock_key, ttl_seconds=7200) - if lock_handle is None: - logger.warning( - "eval experiment already running or lock busy experiment_id={}", - experiment_id, - ) - return - - try: - async with AsyncSessionLocal() as db: - exp = await eval_repo.get_experiment(db, experiment_id) - if not exp: - return - await eval_repo.update_experiment(db, exp, status="running") - await db.commit() - - cases = await eval_repo.list_cases(db, str(exp.regression_set_id)) - base_v = await eval_repo.get_version(db, str(exp.baseline_version_id)) - cand_v = await eval_repo.get_version(db, str(exp.candidate_version_id)) - if base_v is None or cand_v is None: - await eval_repo.update_experiment( - db, - exp, - status="failed", - error_message="version 不存在", - completed_at=datetime.now(timezone.utc), - ) - await db.commit() - return - - for case in cases: - for side, ver in ("baseline", base_v), ("candidate", cand_v): - run = await eval_repo.get_run(db, experiment_id, str(case.id), side) - if not run: - run = await eval_repo.create_run( - db, - experiment_id=experiment_id, - case_id=str(case.id), - side=side, - ) - await db.commit() - await execute_eval_run(db, run=run, case=case, version=ver) - - await _finalize_experiment_gate(db, experiment_id) - finally: - release_redis_lock(lock_handle) diff --git a/api/app/features/evaluation/judge_manual_service.py b/api/app/features/evaluation/judge_manual_service.py index 98865a4..0af17c4 100644 --- a/api/app/features/evaluation/judge_manual_service.py +++ 
b/api/app/features/evaluation/judge_manual_service.py @@ -1,7 +1,8 @@ -"""手动触发 GLM-5 评审(不写 eval_runs;Playground 对话评分写入 conversations 表)。""" +"""手动触发评测台评审(智谱 / DeepSeek;不写 eval_runs;Playground 对话评分写入 conversations 表)。""" from __future__ import annotations +import copy import re from collections.abc import AsyncIterator from datetime import datetime, timezone @@ -9,15 +10,26 @@ from typing import Any from sqlalchemy.ext.asyncio import AsyncSession -from app.core.dependencies import get_eval_judge_langchain_llm +from app.core.dependencies import ( + EvalJudgeProvider, + build_eval_judge_llm_spec, +) from app.core.logging import get_logger from app.features.conversation import repo as conversation_repo from app.features.evaluation.errors import ( EvaluationBadRequestError, EvaluationNotFoundError, ) +from app.features.evaluation.conversation_compare_summary import ( + build_conversation_compare_summary, +) from app.features.evaluation.eval_trace_service import EvalTraceService -from app.features.evaluation.judge_service import EvalJudgeService +from app.features.evaluation.judge_schemas import ConversationJudgeOutput +from app.features.evaluation.judge_service import ( + EvalJudgeService, + eval_judge_compare_transcript_each_max_chars_for_context, + eval_judge_conversation_transcript_max_chars_for_context, +) from app.features.evaluation.schemas import MemoirSectionBaselineOut from app.features.evaluation.session_catalog_service import SessionCatalogService from app.features.evaluation.transcript_for_judge import ( @@ -38,6 +50,43 @@ _MAX_EVAL_CHAPTERS = 30 _MAX_EVAL_STORIES = 40 _PRIOR_TRANSCRIPT_MAX_CHARS = 8000 +_JUDGE_CONFIG_HINT = ( + "评审未配置:智谱需 eval_judge_api_key 或 zhipu_api_key;" + "DeepSeek 需 deepseek_api_key(或 llm_api_key)" +) + + +def _make_eval_judge( + judge_provider: EvalJudgeProvider, + judge_model: str | None, +) -> tuple[EvalJudgeService | None, str]: + spec = build_eval_judge_llm_spec(judge_provider, judge_model) + if not spec or not spec.llm: + return 
None, "" + return ( + EvalJudgeService( + spec.llm, + context_window_tokens=spec.context_window_tokens, + ), + spec.resolved_model, + ) + + +def _strip_baseline_judge_errors(errs: list[Any]) -> list[str]: + out: list[str] = [] + for e in errs: + s = str(e) if e is not None else "" + if not s.strip(): + continue + if ( + "基准整体打分失败" in s + or s.startswith("baseline_glm5:") + or "baseline_glm5_failed:" in s + ): + continue + out.append(s) + return out + async def _iter_turn_judgments_for_turns( judge: EvalJudgeService, @@ -126,6 +175,9 @@ class EvalJudgeManualService: self, conversation_id: str, fixture_filename: str | None, + *, + judge_provider: EvalJudgeProvider = "zhipu", + judge_model: str | None = None, ) -> dict[str, Any]: cid = (conversation_id or "").strip() if not cid: @@ -154,8 +206,9 @@ class EvalJudgeManualService: raise EvaluationNotFoundError("fixture not found") from e errors: list[str] = [] - judge_llm = get_eval_judge_langchain_llm() - judge = EvalJudgeService(judge_llm) + judge, resolved_model = _make_eval_judge(judge_provider, judge_model) + if not judge: + raise EvaluationBadRequestError(_JUDGE_CONFIG_HINT) baseline_judge_dict: dict[str, Any] | None = None if baseline_transcript.strip(): baseline_result = await judge.judge_conversation_result( @@ -189,12 +242,27 @@ class EvalJudgeManualService: "replay_judge": replay_judge_dict, "baseline_turn_judges": {}, "replay_turn_judges": {}, + "compare_summary": build_conversation_compare_summary( + baseline_judge=bj, + replay_judge=rj, + baseline_transcript=baseline_transcript, + replay_transcript=replay_transcript, + conv_cap=eval_judge_conversation_transcript_max_chars_for_context( + judge._ctx_tokens + ), + compare_cap_each=eval_judge_compare_transcript_each_max_chars_for_context( + judge._ctx_tokens + ), + fixture_filename=fn, + ), "compare_markdown": "", "errors": list(errors), "warnings": [], "options": { "include_turn_judges": False, "include_baseline_turn_judges": False, + "judge_provider": 
judge_provider, + "judge_model": resolved_model, }, } await self._persist_playground_conversation_judge(cid, bundle) @@ -206,6 +274,7 @@ class EvalJudgeManualService: "replay_transcript": replay_transcript, "baseline_judge": baseline_judge_dict, "replay_judge": replay_judge_dict, + "compare_summary": bundle.get("compare_summary"), "errors": errors, } @@ -216,6 +285,8 @@ class EvalJudgeManualService: *, include_turn_judges: bool = False, include_baseline_turn_judges: bool = False, + judge_provider: EvalJudgeProvider = "zhipu", + judge_model: str | None = None, ) -> AsyncIterator[dict[str, Any]]: """供 SSE:先整体基准分、再整体回放分,可选逐轮分,再流式对比与建议;成功后写入 playground 字段。""" acc: dict[str, Any] = { @@ -225,12 +296,15 @@ class EvalJudgeManualService: "replay_judge": None, "baseline_turn_judges": {}, "replay_turn_judges": {}, + "compare_summary": None, "compare_markdown": "", "errors": [], "warnings": [], "options": { "include_turn_judges": include_turn_judges, "include_baseline_turn_judges": include_baseline_turn_judges, + "judge_provider": judge_provider, + "judge_model": "", }, } cid = (conversation_id or "").strip() @@ -278,20 +352,26 @@ class EvalJudgeManualService: } return - judge_llm = get_eval_judge_langchain_llm() - if not judge_llm: + judge, resolved_model = _make_eval_judge(judge_provider, judge_model) + if not judge: yield { "event": "error", "phase": "config", - "message": "评审 LLM 未配置(eval_judge_api_key / zhipu_api_key)", + "message": _JUDGE_CONFIG_HINT, } return + acc["options"]["judge_model"] = resolved_model acc["fixture_filename"] = fn persist = True try: - judge = EvalJudgeService(judge_llm) - yield {"event": "meta", "conversation_id": cid, "fixture_filename": fn} + yield { + "event": "meta", + "conversation_id": cid, + "fixture_filename": fn, + "judge_provider": judge_provider, + "judge_model": resolved_model, + } if not baseline_transcript.strip(): wmsg = "未提供基准 MD 或基准无文本:仅对回放对话打分并输出单侧改进建议" @@ -358,16 +438,30 @@ class EvalJudgeManualService: acc["replay_judge"] = ( 
replay_judge.model_dump() if replay_judge else None ) + acc["compare_summary"] = build_conversation_compare_summary( + baseline_judge=baseline_judge, + replay_judge=replay_judge, + baseline_transcript=baseline_transcript, + replay_transcript=replay_transcript, + conv_cap=eval_judge_conversation_transcript_max_chars_for_context( + judge._ctx_tokens + ), + compare_cap_each=eval_judge_compare_transcript_each_max_chars_for_context( + judge._ctx_tokens + ), + fixture_filename=fn, + ) yield { "event": "replay_judge", "ok": replay_judge is not None, "judge": acc["replay_judge"], } + yield {"event": "compare_summary", "summary": acc["compare_summary"]} if not replay_judge: err = ( - f"回放对话整体 GLM-5 打分失败:{replay_result.error}" + f"回放对话整体打分失败:{replay_result.error}" if replay_result.error - else "回放对话整体 GLM-5 打分失败(空密钥、限流或 JSON 解析失败,见服务端日志)" + else "回放对话整体打分失败(限流或 JSON 解析失败,见服务端日志)" ) acc["errors"].append(err) yield { @@ -411,17 +505,179 @@ class EvalJudgeManualService: acc["judged_at"] = datetime.now(timezone.utc).isoformat() await self._persist_playground_conversation_judge(cid, acc) + async def retry_baseline_conversation_judge( + self, + conversation_id: str, + fixture_filename: str | None, + *, + include_baseline_turn_judges: bool = False, + judge_provider: EvalJudgeProvider = "zhipu", + judge_model: str | None = None, + ) -> dict[str, Any]: + """仅重试导出基线整体 GLM 分(及可选基线逐轮),并基于已有 replay 分重新生成对比稿。""" + cid = (conversation_id or "").strip() + if not cid: + raise EvaluationBadRequestError("conversation_id is required") + + catalog = SessionCatalogService(self._db) + dialogue = await catalog.get_session_dialogue(cid) + if not dialogue: + raise EvaluationNotFoundError("conversation not found") + + replay_transcript = format_session_messages_with_turn_labels( + list(dialogue.messages) + ) + if not replay_transcript.strip(): + raise EvaluationBadRequestError("no messages to judge") + + fn = (fixture_filename or "").strip() or None + if not fn: + raise 
EvaluationBadRequestError("请选择基线 MD(fixture_filename)后再重试基准分") + + try: + turns, _ = read_user_export_fixture(fn) + export_turns = list(turns) + baseline_transcript = format_export_turns_with_labels(turns) + except ValueError as e: + raise EvaluationBadRequestError(str(e)) from e + except FileNotFoundError: + raise EvaluationNotFoundError("fixture not found") from None + + if not baseline_transcript.strip(): + raise EvaluationBadRequestError("baseline transcript is empty") + + prev = await catalog.get_playground_conversation_judge_json(cid) + if not prev or not isinstance(prev, dict): + raise EvaluationBadRequestError( + "服务端没有已保存的评分草稿:请先跑一次「自动评分(流式)」" + "直到回放侧打分完成,再使用本重试。" + ) + raw_replay = prev.get("replay_judge") + if not raw_replay or not isinstance(raw_replay, dict): + raise EvaluationBadRequestError( + "已保存结果中缺少回放侧整体分:请先完成流式评分中的回放打分阶段再重试基准。" + ) + + try: + replay_model = ConversationJudgeOutput.model_validate(raw_replay) + except Exception as e: + raise EvaluationBadRequestError( + "已保存的回放评分格式无效,请重新跑一次完整流式评分。" + ) from e + + judge, resolved_model = _make_eval_judge(judge_provider, judge_model) + if not judge: + raise EvaluationBadRequestError(_JUDGE_CONFIG_HINT) + baseline_result = await judge.judge_conversation_result( + full_transcript=baseline_transcript + ) + if not baseline_result.output: + err = baseline_result.error or "unknown error" + msg = f"基准整体打分失败:{err}" + errs = _strip_baseline_judge_errors(list(prev.get("errors") or [])) + errs.append(msg) + return { + "ok": False, + "error": err, + "message": msg, + "baseline_judge": None, + "replay_judge": raw_replay, + "compare_summary": build_conversation_compare_summary( + baseline_judge=None, + replay_judge=replay_model, + baseline_transcript=baseline_transcript, + replay_transcript=replay_transcript, + conv_cap=eval_judge_conversation_transcript_max_chars_for_context( + judge._ctx_tokens + ), + compare_cap_each=eval_judge_compare_transcript_each_max_chars_for_context( + judge._ctx_tokens + ), + 
fixture_filename=fn, + ), + "compare_markdown": "", + "baseline_turn_judges": {}, + "errors": errs, + } + + baseline_judge = baseline_result.output + acc: dict[str, Any] = copy.deepcopy(prev) + acc.setdefault("version", 1) + acc["baseline_judge"] = baseline_judge.model_dump() + acc["fixture_filename"] = fn + acc["errors"] = _strip_baseline_judge_errors(list(acc.get("errors") or [])) + opts = acc.setdefault("options", {}) + if isinstance(opts, dict): + opts["judge_provider"] = judge_provider + opts["judge_model"] = resolved_model + + if include_baseline_turn_judges and export_turns: + acc["baseline_turn_judges"] = {} + async for row in _iter_turn_judgments_for_turns( + judge, + export_turns, + sse_event="baseline_turn_judge", + ): + idx = row.get("turn_index") + if ( + isinstance(idx, (int, float)) + and row.get("judge") is not None + ): + acc["baseline_turn_judges"][str(int(idx))] = row["judge"] + + acc["compare_markdown"] = "" + acc["compare_summary"] = build_conversation_compare_summary( + baseline_judge=baseline_judge, + replay_judge=replay_model, + baseline_transcript=baseline_transcript, + replay_transcript=replay_transcript, + conv_cap=eval_judge_conversation_transcript_max_chars_for_context( + judge._ctx_tokens + ), + compare_cap_each=eval_judge_compare_transcript_each_max_chars_for_context( + judge._ctx_tokens + ), + fixture_filename=fn, + ) + async for piece in judge.stream_conversation_compare( + baseline_transcript=baseline_transcript, + replay_transcript=replay_transcript, + baseline_judge=baseline_judge, + replay_judge=replay_model, + ): + if piece: + acc["compare_markdown"] += piece + + acc["judged_at"] = datetime.now(timezone.utc).isoformat() + await self._persist_playground_conversation_judge(cid, acc) + + return { + "ok": True, + "error": None, + "message": None, + "baseline_judge": acc["baseline_judge"], + "replay_judge": acc.get("replay_judge"), + "compare_summary": acc.get("compare_summary"), + "compare_markdown": acc.get("compare_markdown") or 
"", + "baseline_turn_judges": acc.get("baseline_turn_judges") or {}, + "errors": acc["errors"], + } + async def judge_memoir_for_user( self, user_id: str, baseline_sections: list[MemoirSectionBaselineOut] | None, + *, + judge_provider: EvalJudgeProvider = "zhipu", + judge_model: str | None = None, ) -> dict[str, Any]: uid = (user_id or "").strip() if not uid: raise EvaluationBadRequestError("user_id is required") - judge_llm = get_eval_judge_langchain_llm() - judge = EvalJudgeService(judge_llm) + judge, _resolved = _make_eval_judge(judge_provider, judge_model) + if not judge: + raise EvaluationBadRequestError(_JUDGE_CONFIG_HINT) baselines = list(baseline_sections or []) trace_svc = EvalTraceService(self._db) diff --git a/api/app/features/evaluation/judge_service.py b/api/app/features/evaluation/judge_service.py index be0869f..bc03aab 100644 --- a/api/app/features/evaluation/judge_service.py +++ b/api/app/features/evaluation/judge_service.py @@ -1,4 +1,4 @@ -"""智谱 GLM-5 评审调用(结构化 JSON)。""" +"""评测台评审:智谱 / DeepSeek 等 OpenAI 兼容端点(结构化 JSON)。""" from __future__ import annotations @@ -36,10 +36,10 @@ _COMPARE_STREAM_MAX = 6144 _MEMOIR_EVIDENCE_MAX = 12000 -def _eval_judge_prompt_char_pool() -> int: +def _eval_judge_prompt_char_pool_for_context(context_window_tokens: int) -> int: """整段请求的字符预算(由评审模型 context window 推导,保守)。""" toks = ( - settings.eval_judge_context_window_tokens + int(context_window_tokens) - settings.eval_judge_completion_reserve_tokens - settings.eval_judge_prompt_budget_safety_tokens ) @@ -47,27 +47,68 @@ def _eval_judge_prompt_char_pool() -> int: return max(1, int(toks / settings.eval_judge_approx_tokens_per_char)) +def _eval_judge_prompt_char_pool() -> int: + return _eval_judge_prompt_char_pool_for_context( + settings.eval_judge_context_window_tokens + ) + + def eval_judge_conversation_transcript_max_chars() -> int: - """整段对话评审:【完整对话】transcript 最大字符数。""" + """整段对话评审:【完整对话】transcript 最大字符数(默认 GLM 上下文)。""" if settings.eval_judge_max_transcript_chars > 0: 
return settings.eval_judge_max_transcript_chars overhead = len(CONV_JUDGE_INSTRUCTIONS) + len(_CONV_HEADER) + 32 return max(1, _eval_judge_prompt_char_pool() - overhead) +def eval_judge_conversation_transcript_max_chars_for_context( + context_window_tokens: int, +) -> int: + if settings.eval_judge_max_transcript_chars > 0: + return settings.eval_judge_max_transcript_chars + overhead = len(CONV_JUDGE_INSTRUCTIONS) + len(_CONV_HEADER) + 32 + pool = _eval_judge_prompt_char_pool_for_context(context_window_tokens) + return max(1, pool - overhead) + + def eval_judge_turn_prior_transcript_max_chars() -> int: - """逐轮评审:截至上一轮的 transcript 节选上限(含与用户/助手正文头的固定开销)。""" + """逐轮评审:截至上一轮的 transcript 节选上限(默认 GLM 上下文)。""" if settings.eval_judge_max_transcript_chars > 0: return settings.eval_judge_max_transcript_chars static = len(TURN_JUDGE_INSTRUCTIONS) + 8800 return max(1, _eval_judge_prompt_char_pool() - static) +def eval_judge_turn_prior_transcript_max_chars_for_context( + context_window_tokens: int, +) -> int: + if settings.eval_judge_max_transcript_chars > 0: + return settings.eval_judge_max_transcript_chars + static = len(TURN_JUDGE_INSTRUCTIONS) + 8800 + pool = _eval_judge_prompt_char_pool_for_context(context_window_tokens) + return max(1, pool - static) + + def eval_judge_compare_transcript_each_max_chars() -> int: - """A/B 两段 transcript 同 prompt 时,每条 transcript 的上限(均分字符预算)。""" + """A/B 两段 transcript 同 prompt 时,每条 transcript 的上限(默认 GLM 上下文)。""" if settings.eval_judge_max_compare_transcript_chars_each > 0: return settings.eval_judge_max_compare_transcript_chars_each - pool = _eval_judge_prompt_char_pool() - settings.eval_judge_compare_prompt_overhead_chars + pool = ( + _eval_judge_prompt_char_pool() + - settings.eval_judge_compare_prompt_overhead_chars + ) + return max(1, pool // 2) + + +def eval_judge_compare_transcript_each_max_chars_for_context( + context_window_tokens: int, +) -> int: + if settings.eval_judge_max_compare_transcript_chars_each > 0: + return 
settings.eval_judge_max_compare_transcript_chars_each + pool = ( + _eval_judge_prompt_char_pool_for_context(context_window_tokens) + - settings.eval_judge_compare_prompt_overhead_chars + ) return max(1, pool // 2) @@ -140,8 +181,31 @@ def _build_memoir_judge_prompt( class EvalJudgeService: - def __init__(self, judge_llm: Any | None) -> None: + def __init__( + self, + judge_llm: Any | None, + *, + context_window_tokens: int | None = None, + ) -> None: self._llm = judge_llm + self._ctx_tokens = int( + context_window_tokens or settings.eval_judge_context_window_tokens + ) + + def _conv_transcript_cap(self) -> int: + return eval_judge_conversation_transcript_max_chars_for_context( + self._ctx_tokens + ) + + def _turn_prior_cap(self) -> int: + return eval_judge_turn_prior_transcript_max_chars_for_context( + self._ctx_tokens + ) + + def _compare_each_cap(self) -> int: + return eval_judge_compare_transcript_each_max_chars_for_context( + self._ctx_tokens + ) async def judge_turn( self, @@ -159,7 +223,7 @@ class EvalJudgeService: 【本轮位置】完整对话中当前轮次为 Turn {t + 1}(与下方节选及全量 transcript 的 `[Turn ...]` 编号一致)。evidence_refs.turn_index 请使用该编号。 【截至上一轮的对话节选】(含 `[Turn k]` 标签) -{prior_transcript[: eval_judge_turn_prior_transcript_max_chars()]} +{prior_transcript[: self._turn_prior_cap()]} 【本轮用户】 {user_utterance[:4000]} @@ -183,11 +247,14 @@ class EvalJudgeService: self, *, full_transcript: str ) -> JudgeCallResult[ConversationJudgeOutput]: if not self._llm: - return JudgeCallResult(output=None, error="评审模型未配置") + return JudgeCallResult( + output=None, + error="评审模型未配置(智谱或 DeepSeek 密钥)", + ) prompt = f"""{CONV_JUDGE_INSTRUCTIONS} 【完整对话】(每轮以 `[Turn k]` 开头) -{full_transcript[: eval_judge_conversation_transcript_max_chars()]} +{full_transcript[: self._conv_transcript_cap()]} """ try: out = await allm_json_call( @@ -219,10 +286,10 @@ class EvalJudgeService: ) -> AsyncIterator[str]: """流式输出中文对比与建议(非 JSON)。""" if not self._llm: - yield "[错误] 未配置评审模型 API Key(eval_judge_api_key / zhipu_api_key)" + 
yield "[错误] 未配置评审模型 API Key(智谱:eval_judge_api_key / zhipu_api_key;DeepSeek:deepseek_api_key)" return - cap_each = eval_judge_compare_transcript_each_max_chars() - cap_single = eval_judge_conversation_transcript_max_chars() + cap_each = self._compare_each_cap() + cap_single = self._conv_transcript_cap() b_tr = (baseline_transcript or "").strip()[:cap_each] r_tr = (replay_transcript or "").strip()[:cap_each] b_json = ( @@ -308,7 +375,10 @@ class EvalJudgeService: evidence_notes: str = "", ) -> JudgeCallResult[MemoirJudgeOutput]: if not self._llm: - return JudgeCallResult(output=None, error="评审模型未配置") + return JudgeCallResult( + output=None, + error="评审模型未配置(智谱或 DeepSeek 密钥)", + ) prompt = _build_memoir_judge_prompt( memoir_markdown=memoir_markdown, source_transcript=source_transcript, diff --git a/api/app/features/evaluation/memoir_readiness_service.py b/api/app/features/evaluation/memoir_readiness_service.py index cd2cd89..87d2927 100644 --- a/api/app/features/evaluation/memoir_readiness_service.py +++ b/api/app/features/evaluation/memoir_readiness_service.py @@ -6,11 +6,12 @@ from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession from app.features.conversation.models import Conversation, Segment +from app.features.conversation.ws.pipeline import background_runner from app.features.evaluation.errors import ( EvaluationBadRequestError, EvaluationNotFoundError, ) -from app.features.evaluation.schemas import MemoirPhase1ReadyOut +from app.features.evaluation.schemas import MemoirPhase1ReadyOut, MemoirSubmitOut class MemoirReadinessService: @@ -55,3 +56,44 @@ class MemoirReadinessService: checked_segment_ids=ids, pending_segment_ids=pending, ) + + async def submit_memoir_phase1_for_conversation( + self, + *, + conversation_id: str, + ) -> MemoirSubmitOut: + """本会话内尚待 Phase1 的 segment 合并提交 Celery(与对话结束 flush 语义对齐)。""" + cid = (conversation_id or "").strip() + if not cid: + raise EvaluationBadRequestError("conversation_id is required") + conv = 
await self._db.get(Conversation, cid) + if not conv or conv.deleted_at is not None: + raise EvaluationNotFoundError("conversation not found") + uid = str(conv.user_id) + stmt = ( + select(Segment.id) + .where( + Segment.conversation_id == cid, + Segment.processed.is_(False), + Segment.topic_category.is_(None), + ) + .order_by(Segment.created_at.asc()) + ) + result = await self._db.execute(stmt) + segment_ids = [str(i) for i in result.scalars().all()] + if not segment_ids: + return MemoirSubmitOut( + conversation_id=cid, + user_id=uid, + segment_ids=[], + celery_task_id=None, + ) + task_id = await background_runner.flush_pending( + uid, extra_segment_ids=segment_ids + ) + return MemoirSubmitOut( + conversation_id=cid, + user_id=uid, + segment_ids=segment_ids, + celery_task_id=task_id, + ) diff --git a/api/app/features/evaluation/presenters.py b/api/app/features/evaluation/presenters.py deleted file mode 100644 index 7a655b4..0000000 --- a/api/app/features/evaluation/presenters.py +++ /dev/null @@ -1,26 +0,0 @@ -"""ORM → API schema 映射(供 REST / SSE 共用)。""" - -from __future__ import annotations - -from app.features.evaluation.schemas import CaseOut, EvalRunOut, RunTurnOut - - -def case_out(row) -> CaseOut: - return CaseOut.model_validate(row) - - -def run_out(row, turns: list) -> EvalRunOut: - return EvalRunOut( - id=row.id, - experiment_id=row.experiment_id, - case_id=row.case_id, - side=row.side, - status=row.status, - error_message=row.error_message, - memoir_markdown=row.memoir_markdown, - conversation_score_total=row.conversation_score_total, - memoir_score_total=row.memoir_score_total, - composite_score=row.composite_score, - judge_bundle_json=row.judge_bundle_json, - turns=[RunTurnOut.model_validate(t) for t in turns], - ) diff --git a/api/app/features/evaluation/replay_service.py b/api/app/features/evaluation/replay_service.py index 3e0874e..47f2dcd 100644 --- a/api/app/features/evaluation/replay_service.py +++ b/api/app/features/evaluation/replay_service.py @@ 
-97,6 +97,7 @@ class ReplayConversationService: conversation_id: str, fixture_filename: str, flush_memoir_after: bool, + skip_memoir: bool, skip_tts: bool, ) -> tuple[int, list[str], list[str]]: try: @@ -112,6 +113,7 @@ class ReplayConversationService: conversation_id=conversation_id, utterances=utterances, flush_memoir_after=flush_memoir_after, + skip_memoir=skip_memoir, skip_tts=skip_tts, ) return n, utterances, segment_ids @@ -122,6 +124,7 @@ class ReplayConversationService: conversation_id: str, utterances: list[str], flush_memoir_after: bool, + skip_memoir: bool, skip_tts: bool, ) -> tuple[int, list[str]]: cid = (conversation_id or "").strip() @@ -144,11 +147,12 @@ class ReplayConversationService: segment = await conv_service.create_user_segment(conv, conv.user_id, text) segment_ids.append(segment.id) ts = segment.created_at or conv.last_message_at - await background_runner.queue_message( - conv.user_id, - segment.id, - text_char_count=len(text), - ) + if not skip_memoir: + await background_runner.queue_message( + conv.user_id, + segment.id, + text_char_count=len(text), + ) await process_user_message( conversation_id=cid, user_message=text, @@ -161,14 +165,19 @@ class ReplayConversationService: ) count += 1 - if flush_memoir_after and conv.user_id: + if ( + flush_memoir_after + and conv.user_id + and (not skip_memoir) + ): await background_runner.flush_pending(conv.user_id) logger.info( - "eval replay done conversation_id={} turns={} flush={} skip_tts={}", + "eval replay done conversation_id={} turns={} flush={} skip_memoir={} skip_tts={}", cid, count, flush_memoir_after, + skip_memoir, skip_tts, ) return count, segment_ids diff --git a/api/app/features/evaluation/router.py b/api/app/features/evaluation/router.py index 7bb7ee7..0f37c42 100644 --- a/api/app/features/evaluation/router.py +++ b/api/app/features/evaluation/router.py @@ -28,45 +28,32 @@ from app.features.evaluation.importers.user_export_markdown import ( from app.features.evaluation.internal_auth 
import InternalEvalAuth from app.features.evaluation.judge_manual_service import EvalJudgeManualService from app.features.evaluation.memoir_readiness_service import MemoirReadinessService -from app.features.evaluation.presenters import case_out, run_out from app.features.evaluation.replay_service import ReplayConversationService from app.features.evaluation.schemas import ( - CaseCreate, - CaseOut, - EvalRunOut, EvalSandboxOut, - ExperimentCreate, - ExperimentDetailOut, - ExperimentOut, - GateVerdictOut, - ImportJsonCaseBody, - ImportMarkdownBody, ManualJudgeConversationBody, ManualJudgeConversationOut, ManualJudgeConversationStreamBody, ManualJudgeMemoirBody, ManualJudgeMemoirOut, - PlaygroundConversationJudgeOut, MemoirPhase1ReadyOut, MemoirSectionBaselineOut, - RegressionSetCreate, - RegressionSetOut, + MemoirSubmitOut, + PlaygroundConversationJudgeOut, ReplayBootstrapBody, ReplayBootstrapOut, ReplayConversationBody, ReplayConversationOut, + RetryBaselineJudgeBody, + RetryBaselineJudgeOut, SessionDialogueOut, - SessionEvalRunsOut, SessionListItem, SessionListResponse, SessionTranscriptOut, - SnapshotFromConversationBody, UserExportFixtureDetailOut, UserExportFixtureListOut, UserExportFixtureTurnOut, UserMemoirSnapshotOut, - VersionCreate, - VersionOut, ) from app.features.evaluation.session_catalog_service import SessionCatalogService from app.features.evaluation.user_export_fixtures import read_user_export_fixture @@ -88,73 +75,6 @@ def _eval_http_exc( return HTTPException(status_code=400, detail=e.detail) -@router.get("/regression-sets", response_model=list[RegressionSetOut]) -async def list_regression_sets( - _auth: InternalEvalAuth, - svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)], -): - rows = await svc.list_regression_sets() - return [RegressionSetOut.model_validate(r) for r in rows] - - -@router.post("/regression-sets", response_model=RegressionSetOut) -async def create_regression_set( - body: RegressionSetCreate, - _auth: 
InternalEvalAuth, - svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)], -): - try: - row = await svc.create_regression_set(body) - except (EvaluationNotFoundError, EvaluationBadRequestError) as e: - raise _eval_http_exc(e) from e - return RegressionSetOut.model_validate(row) - - -@router.get("/regression-sets/{set_id}/cases", response_model=list[CaseOut]) -async def list_cases( - set_id: str, - _auth: InternalEvalAuth, - svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)], -): - try: - rows = await svc.list_cases(set_id) - except EvaluationNotFoundError as e: - raise _eval_http_exc(e) from e - return [case_out(r) for r in rows] - - -@router.post("/regression-sets/{set_id}/cases", response_model=CaseOut) -async def create_case( - set_id: str, - body: CaseCreate, - _auth: InternalEvalAuth, - svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)], -): - try: - row = await svc.create_case(set_id, body) - except (EvaluationNotFoundError, EvaluationBadRequestError) as e: - raise _eval_http_exc(e) from e - return case_out(row) - - -@router.post( - "/regression-sets/{set_id}/snapshot-from-conversation/{conversation_id}", - response_model=CaseOut, -) -async def snapshot_from_conversation( - set_id: str, - conversation_id: str, - body: SnapshotFromConversationBody, - _auth: InternalEvalAuth, - svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)], -): - try: - row = await svc.snapshot_from_conversation(set_id, conversation_id, body) - except (EvaluationNotFoundError, EvaluationBadRequestError) as e: - raise _eval_http_exc(e) from e - return case_out(row) - - @router.get("/sessions", response_model=SessionListResponse) async def list_sessions( _auth: InternalEvalAuth, @@ -275,16 +195,25 @@ async def memoir_phase1_ready( raise _eval_http_exc(e) from e -@router.get( - "/sessions/{conversation_id}/evaluation-runs", - response_model=SessionEvalRunsOut, +@router.post( + 
"/sessions/{conversation_id}/memoir-submit", + response_model=MemoirSubmitOut, ) -async def list_session_evaluation_runs( +async def memoir_submit_phase1( conversation_id: str, _auth: InternalEvalAuth, - svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)], + svc: Annotated[ + MemoirReadinessService, Depends(get_memoir_readiness_service) + ], ): - return await svc.list_session_evaluation_runs(conversation_id) + try: + return await svc.submit_memoir_phase1_for_conversation( + conversation_id=conversation_id, + ) + except EvaluationNotFoundError as e: + raise _eval_http_exc(e) from e + except EvaluationBadRequestError as e: + raise _eval_http_exc(e) from e @router.post("/sessions/replay-bootstrap", response_model=ReplayBootstrapOut) @@ -342,6 +271,7 @@ async def replay_conversation( conversation_id=body.conversation_id, fixture_filename=fn, flush_memoir_after=body.flush_memoir_after, + skip_memoir=body.skip_memoir, skip_tts=body.skip_tts, ) elif body.user_utterances is not None: @@ -352,6 +282,7 @@ async def replay_conversation( conversation_id=body.conversation_id, utterances=utt, flush_memoir_after=body.flush_memoir_after, + skip_memoir=body.skip_memoir, skip_tts=body.skip_tts, ) echo = utt @@ -383,6 +314,8 @@ async def judge_conversation_manual( payload = await judge_svc.judge_conversation( body.conversation_id, body.fixture_filename, + judge_provider=body.judge_provider, + judge_model=body.judge_model, ) except EvaluationNotFoundError as e: raise _eval_http_exc(e) from e @@ -406,6 +339,8 @@ async def judge_conversation_manual_stream( body.fixture_filename, include_turn_judges=body.include_turn_judges, include_baseline_turn_judges=body.include_baseline_turn_judges, + judge_provider=body.judge_provider, + judge_model=body.judge_model, ): yield f"data: {json.dumps(evt, ensure_ascii=False)}\n\n" except Exception as e: @@ -427,6 +362,32 @@ async def judge_conversation_manual_stream( ) +@router.post( + "/judge/conversation-retry-baseline", + 
response_model=RetryBaselineJudgeOut, +) +async def retry_baseline_conversation_judge( + body: RetryBaselineJudgeBody, + _auth: InternalEvalAuth, + judge_svc: Annotated[ + EvalJudgeManualService, Depends(get_eval_judge_manual_service) + ], +): + try: + payload = await judge_svc.retry_baseline_conversation_judge( + body.conversation_id, + body.fixture_filename, + include_baseline_turn_judges=body.include_baseline_turn_judges, + judge_provider=body.judge_provider, + judge_model=body.judge_model, + ) + except EvaluationNotFoundError as e: + raise _eval_http_exc(e) from e + except EvaluationBadRequestError as e: + raise _eval_http_exc(e) from e + return RetryBaselineJudgeOut.model_validate(payload) + + @router.post("/judge/memoir-chapters", response_model=ManualJudgeMemoirOut) async def judge_memoir_chapters_manual( body: ManualJudgeMemoirBody, @@ -439,6 +400,8 @@ async def judge_memoir_chapters_manual( payload = await judge_svc.judge_memoir_for_user( body.user_id, body.baseline_sections, + judge_provider=body.judge_provider, + judge_model=body.judge_model, ) except EvaluationBadRequestError as e: raise _eval_http_exc(e) from e @@ -496,107 +459,3 @@ async def get_user_export_fixture( MemoirSectionBaselineOut(title=t, body=b) for t, b in memoir_tuples ], ) - - -@router.post("/regression-sets/{set_id}/import-markdown", response_model=CaseOut) -async def import_markdown_case( - set_id: str, - body: ImportMarkdownBody, - _auth: InternalEvalAuth, - svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)], -): - try: - row = await svc.import_markdown_case(set_id, body) - except (EvaluationNotFoundError, EvaluationBadRequestError) as e: - raise _eval_http_exc(e) from e - return case_out(row) - - -@router.post("/import/json-case", response_model=CaseOut) -async def import_json_case( - body: ImportJsonCaseBody, - _auth: InternalEvalAuth, - svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)], -): - try: - row = await 
svc.import_json_case(body) - except (EvaluationNotFoundError, EvaluationBadRequestError) as e: - raise _eval_http_exc(e) from e - return case_out(row) - - -@router.get("/versions", response_model=list[VersionOut]) -async def list_versions( - _auth: InternalEvalAuth, - svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)], -): - rows = await svc.list_versions() - return [VersionOut.model_validate(r) for r in rows] - - -@router.post("/versions", response_model=VersionOut) -async def create_version( - body: VersionCreate, - _auth: InternalEvalAuth, - svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)], -): - try: - row = await svc.create_version(body) - except (EvaluationNotFoundError, EvaluationBadRequestError) as e: - raise _eval_http_exc(e) from e - return VersionOut.model_validate(row) - - -@router.get("/experiments", response_model=list[ExperimentOut]) -async def list_experiments( - _auth: InternalEvalAuth, - svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)], - limit: int = Query(50, ge=1, le=200), -): - rows = await svc.list_experiments(limit=limit) - return [ExperimentOut.model_validate(r) for r in rows] - - -@router.post("/experiments", response_model=ExperimentOut) -async def create_experiment( - body: ExperimentCreate, - _auth: InternalEvalAuth, - svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)], -): - try: - row = await svc.create_experiment(body) - except (EvaluationNotFoundError, EvaluationBadRequestError) as e: - raise _eval_http_exc(e) from e - return ExperimentOut.model_validate(row) - - -@router.get("/experiments/{experiment_id}", response_model=ExperimentDetailOut) -async def get_experiment_detail( - experiment_id: str, - _auth: InternalEvalAuth, - svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)], -): - try: - bundle = await svc.get_experiment_detail(experiment_id) - except EvaluationNotFoundError as e: - raise 
_eval_http_exc(e) from e - run_outs: list[EvalRunOut] = [run_out(r, turns) for r, turns in bundle.run_rows] - gate = GateVerdictOut.model_validate(bundle.gate) if bundle.gate else None - return ExperimentDetailOut( - experiment=ExperimentOut.model_validate(bundle.experiment), - runs=run_outs, - gate=gate, - ) - - -@router.post("/experiments/{experiment_id}/run", response_model=ExperimentOut) -async def enqueue_experiment_run( - experiment_id: str, - _auth: InternalEvalAuth, - svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)], -): - try: - exp = await svc.enqueue_experiment_run(experiment_id) - except EvaluationNotFoundError as e: - raise _eval_http_exc(e) from e - return ExperimentOut.model_validate(exp) diff --git a/api/app/features/evaluation/schemas.py b/api/app/features/evaluation/schemas.py index 9c5685b..d8e5795 100644 --- a/api/app/features/evaluation/schemas.py +++ b/api/app/features/evaluation/schemas.py @@ -3,89 +3,11 @@ from __future__ import annotations from datetime import datetime -from typing import Any +from typing import Any, Literal from pydantic import BaseModel, ConfigDict, Field - -class RegressionSetCreate(BaseModel): - name: str - description: str | None = None - - -class RegressionSetOut(BaseModel): - model_config = ConfigDict(from_attributes=True) - - id: str - name: str - description: str | None - created_at: datetime - - -class CaseCreate(BaseModel): - title: str | None = None - user_utterances: list[str] - source_conversation_id: str | None = None - source_user_id: str | None = None - reference_memoir_markdown: str | None = None - is_protected: bool = False - meta: dict[str, Any] | None = None - - -class CaseOut(BaseModel): - model_config = ConfigDict(from_attributes=True) - - id: str - regression_set_id: str - source_conversation_id: str | None - source_user_id: str | None - title: str | None - user_utterances: list[Any] - is_protected: bool - created_at: datetime - - -class VersionCreate(BaseModel): - name: 
str - runner_kind: str = "llm_chat_v1" - config_json: dict[str, Any] | None = None - - -class VersionOut(BaseModel): - model_config = ConfigDict(from_attributes=True) - - id: str - name: str - runner_kind: str - config_json: dict[str, Any] | None - created_at: datetime - - -class ExperimentCreate(BaseModel): - name: str - regression_set_id: str - baseline_version_id: str - candidate_version_id: str - rubric_pack: str = "conversation_v1+memoir_v1" - composite_weights_json: dict[str, Any] | None = Field( - default=None, - description='默认 {"conversation":0.5,"memoir":0.5}', - ) - - -class ExperimentOut(BaseModel): - model_config = ConfigDict(from_attributes=True) - - id: str - name: str - regression_set_id: str - baseline_version_id: str - candidate_version_id: str - rubric_pack: str - status: str - error_message: str | None - created_at: datetime - completed_at: datetime | None +EvalJudgeProviderLiteral = Literal["zhipu", "deepseek"] class SessionDialogueMessageOut(BaseModel): @@ -167,6 +89,9 @@ class ReplayConversationBody(BaseModel): fixture_filename: str | None = None user_utterances: list[str] | None = None flush_memoir_after: bool = True + """为 True 且 skip_memoir 为 False 时,本批结束后 flush 回忆录队列。""" + skip_memoir: bool = False + """为 True 时不向回忆录防抖队列入队、不 flush(供 Playground 先只测对话)。""" skip_tts: bool = True @@ -186,10 +111,20 @@ class MemoirPhase1ReadyOut(BaseModel): pending_segment_ids: list[str] = Field(default_factory=list) +class MemoirSubmitOut(BaseModel): + conversation_id: str + user_id: str + segment_ids: list[str] = Field(default_factory=list) + celery_task_id: str | None = None + + class ManualJudgeConversationBody(BaseModel): conversation_id: str """与当前评测台选中的 MD 一致,供基准 transcript / 整体打分。""" fixture_filename: str | None = None + judge_provider: EvalJudgeProviderLiteral = "zhipu" + judge_model: str | None = None + """空则用该供应商默认模型(智谱:eval_judge_model;DeepSeek:eval_judge_deepseek_model)。""" class ManualJudgeConversationStreamBody(BaseModel): @@ -199,6 +134,29 @@ 
class ManualJudgeConversationStreamBody(BaseModel): """对当前会话逐轮调用评审 LLM(在整体分之后)。""" include_baseline_turn_judges: bool = False """对导出基线逐轮调用评审 LLM(需 fixture + 整体基线分成功)。""" + judge_provider: EvalJudgeProviderLiteral = "zhipu" + judge_model: str | None = None + + +class RetryBaselineJudgeBody(BaseModel): + conversation_id: str + fixture_filename: str | None = None + include_baseline_turn_judges: bool = False + """与流式评分一致:成功重试基准整体分后是否补跑基线逐轮。""" + judge_provider: EvalJudgeProviderLiteral = "zhipu" + judge_model: str | None = None + + +class RetryBaselineJudgeOut(BaseModel): + ok: bool + error: str | None = None + message: str | None = None + baseline_judge: dict[str, Any] | None = None + replay_judge: dict[str, Any] | None = None + compare_summary: dict[str, Any] | None = None + compare_markdown: str = "" + baseline_turn_judges: dict[str, Any] = Field(default_factory=dict) + errors: list[str] = Field(default_factory=list) class ManualJudgeConversationOut(BaseModel): @@ -208,6 +166,7 @@ class ManualJudgeConversationOut(BaseModel): replay_transcript: str baseline_judge: dict[str, Any] | None = None replay_judge: dict[str, Any] | None = None + compare_summary: dict[str, Any] | None = None errors: list[str] = Field(default_factory=list) @@ -221,6 +180,8 @@ class PlaygroundConversationJudgeOut(BaseModel): class ManualJudgeMemoirBody(BaseModel): user_id: str baseline_sections: list[MemoirSectionBaselineOut] | None = None + judge_provider: EvalJudgeProviderLiteral = "zhipu" + judge_model: str | None = None class ManualJudgeMemoirOut(BaseModel): @@ -248,81 +209,3 @@ class UserMemoirSnapshotOut(BaseModel): user_id: str chapters: list[MemoirChapterSnapOut] stories: list[MemoirStorySnapOut] - - -class SnapshotFromConversationBody(BaseModel): - title: str | None = None - use_messages: bool = False - is_protected: bool = False - - -class ImportMarkdownBody(BaseModel): - markdown: str - title: str | None = None - is_protected: bool = False - - -class ImportJsonCaseBody(BaseModel): - 
regression_set_id: str - utterances: list[str] | None = None - raw_json: dict[str, Any] | list[Any] | None = Field( - default=None, - description="与 utterances 二选一:对象含 utterances 键或根数组", - ) - title: str | None = None - is_protected: bool = False - - -class RunTurnOut(BaseModel): - model_config = ConfigDict(from_attributes=True) - - id: str - turn_index: int - user_utterance: str - assistant_reply: str | None - duration_ms: int | None - judge_scores_json: dict[str, Any] | None - judge_rationale: str | None - - -class EvalRunOut(BaseModel): - model_config = ConfigDict(from_attributes=True) - - id: str - experiment_id: str - case_id: str - side: str - status: str - error_message: str | None - memoir_markdown: str | None - conversation_score_total: float | None - memoir_score_total: float | None - composite_score: float | None - judge_bundle_json: dict[str, Any] | None = None - turns: list[RunTurnOut] = [] - - -class SessionEvalRunItem(BaseModel): - experiment_name: str - run: EvalRunOut - - -class SessionEvalRunsOut(BaseModel): - conversation_id: str - items: list[SessionEvalRunItem] - - -class GateVerdictOut(BaseModel): - model_config = ConfigDict(from_attributes=True) - - passed: bool - mean_composite_delta: float | None - protected_regressions_json: list[dict[str, Any]] | None - details_json: dict[str, Any] | None - computed_at: datetime - - -class ExperimentDetailOut(BaseModel): - experiment: ExperimentOut - runs: list[EvalRunOut] - gate: GateVerdictOut | None diff --git a/api/app/features/evaluation/stream_router.py b/api/app/features/evaluation/stream_router.py deleted file mode 100644 index 621e67a..0000000 --- a/api/app/features/evaluation/stream_router.py +++ /dev/null @@ -1,45 +0,0 @@ -"""实验进度 SSE(轮询 DB,轻量实现)。""" - -from __future__ import annotations - -import asyncio -import json - -from fastapi import APIRouter, Header, Query -from fastapi.responses import StreamingResponse - -from app.core.db import AsyncSessionLocal -from 
app.features.evaluation.admin_service import EvaluationAdminService -from app.features.evaluation.internal_auth import verify_internal_eval_key - -router = APIRouter(tags=["internal-evaluation-stream"]) - - -@router.get("/experiments/{experiment_id}/stream") -async def experiment_event_stream( - experiment_id: str, - key: str | None = Query( - default=None, - description="等同 X-Internal-Eval-Key,供 EventSource 使用", - ), - x_internal_eval_key: str | None = Header(default=None, alias="X-Internal-Eval-Key"), -): - verify_internal_eval_key( - header_value=x_internal_eval_key, - query_value=key, - ) - - async def event_gen(): - while True: - async with AsyncSessionLocal() as session: - svc = EvaluationAdminService(session) - payload = await svc.experiment_stream_snapshot(experiment_id) - if payload is None: - yield f"data: {json.dumps({'error': 'not_found'})}\n\n" - break - yield f"data: {json.dumps(payload, default=str)}\n\n" - if payload.get("status") in ("completed", "failed"): - break - await asyncio.sleep(1.0) - - return StreamingResponse(event_gen(), media_type="text/event-stream") diff --git a/api/app/features/memoir/state_service.py b/api/app/features/memoir/state_service.py index 331e86e..6f06696 100644 --- a/api/app/features/memoir/state_service.py +++ b/api/app/features/memoir/state_service.py @@ -4,7 +4,7 @@ """ import uuid -from typing import Dict, List +from typing import Dict, List, cast from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession @@ -14,10 +14,18 @@ from app.agents.stage_constants import ( chat_bucket, normalize_chat_stage, ) -from app.agents.state_schema import MemoirStateSchema, SlotData, default_state +from app.agents.state_schema import ( + KnownFact, + MemoirStateSchema, + PersonaThread, + SlotData, + default_state, +) from app.core.config import settings from app.features.memoir.models import MemoirState as MemoirStateModel +_INTERVIEW_STATE_META_KEY = "__interview_state__" + def _slots_snapshot_for_merge(raw: 
Dict[str, Dict] | None) -> Dict[str, Dict]: """浅拷贝 slots,避免就地改 JSON 列同一 dict 引用导致 ORM 不标记 dirty。""" @@ -26,15 +34,67 @@ def _slots_snapshot_for_merge(raw: Dict[str, Dict] | None) -> Dict[str, Dict]: return {k: dict(v or {}) for k, v in raw.items()} +def _extract_interview_state_meta( + raw_slots: Dict[str, Dict] | None, +) -> tuple[list[KnownFact], list[PersonaThread], list[str]]: + if not raw_slots or not isinstance(raw_slots, dict): + return [], [], [] + meta = raw_slots.get(_INTERVIEW_STATE_META_KEY) + if not isinstance(meta, dict): + return [], [], [] + known = meta.get("known_facts") if isinstance(meta.get("known_facts"), list) else [] + persona = ( + meta.get("persona_threads") + if isinstance(meta.get("persona_threads"), list) + else [] + ) + recent = ( + meta.get("recent_questions") + if isinstance(meta.get("recent_questions"), list) + else [] + ) + return ( + [KnownFact.model_validate(x) for x in known if isinstance(x, dict)], + [PersonaThread.model_validate(x) for x in persona if isinstance(x, dict)], + [str(x).strip() for x in recent if str(x).strip()], + ) + + +def _inject_interview_state_meta( + *, + slots: Dict[str, Dict], + known_facts: list[KnownFact], + persona_threads: list[PersonaThread], + recent_questions: list[str], +) -> Dict[str, Dict]: + out = dict(slots) + out[_INTERVIEW_STATE_META_KEY] = cast( + Dict, + { + "known_facts": [x.model_dump() for x in known_facts], + "persona_threads": [x.model_dump() for x in persona_threads], + "recent_questions": list(recent_questions), + }, + ) + return out + + def coerce_memoir_state(model: MemoirStateModel) -> MemoirStateSchema: + raw_slots = model.slots if isinstance(model.slots, dict) else None + known_facts, persona_threads, recent_questions = _extract_interview_state_meta( + raw_slots + ) + clean_slots = dict(raw_slots) if raw_slots else dict(default_state().slots) + clean_slots.pop(_INTERVIEW_STATE_META_KEY, None) return MemoirStateSchema.model_validate( { "stage_order": model.stage_order or 
default_state().stage_order, "current_stage": model.current_stage, "covered_stages": model.covered_stages or [], - "slots": model.slots - if isinstance(model.slots, dict) - else default_state().slots, + "slots": clean_slots, + "known_facts": known_facts, + "persona_threads": persona_threads, + "recent_questions": recent_questions, } ) @@ -187,6 +247,40 @@ async def switch_stage( return coerce_memoir_state(state) +async def save_interview_state_meta( + user_id: str, + *, + known_facts: list[KnownFact], + persona_threads: list[PersonaThread], + recent_questions: list[str], + db: AsyncSession, +) -> MemoirStateSchema: + stmt = ( + select(MemoirStateModel) + .where(MemoirStateModel.user_id == user_id) + .with_for_update() + ) + result = await db.execute(stmt) + state = result.scalar_one_or_none() + if not state: + await get_or_create_state(user_id, db) + result = await db.execute(stmt) + state = result.scalar_one() + + slots = _slots_snapshot_for_merge( + state.slots if isinstance(state.slots, dict) else None + ) + state.slots = _inject_interview_state_meta( + slots=slots, + known_facts=known_facts, + persona_threads=persona_threads, + recent_questions=recent_questions, + ) + await db.commit() + await db.refresh(state) + return coerce_memoir_state(state) + + def get_or_create_state_sync(user_id: str, db: Session) -> MemoirStateSchema: stmt = select(MemoirStateModel).where(MemoirStateModel.user_id == user_id) result = db.execute(stmt) diff --git a/api/app/features/memoir/story_pipeline_sync.py b/api/app/features/memoir/story_pipeline_sync.py index 6b73e7d..64db4a9 100644 --- a/api/app/features/memoir/story_pipeline_sync.py +++ b/api/app/features/memoir/story_pipeline_sync.py @@ -617,6 +617,242 @@ def _ensure_chapter_record( return chapter +def _resolve_append_target( + session: Session, + *, + route_decision: str, + route_target_story_id: str | None, + user_id: str, + chapter_category: str, + oral_norm: str, + candidate_stories: list, + story_meta: dict[str, dict[str, 
int]], + decision_source: str, + memoir_correlation_id: str | None, +) -> tuple[str | None, str, str]: + """Resolve append target and return (target_story_id, existing_for_narrative, decision_source).""" + max_chars = int(settings.story_append_max_canonical_chars) + max_ver = int(settings.story_append_max_versions) + target_story_id: str | None = None + existing_for_narrative = "" + + if route_decision == "append_story" and route_target_story_id: + st = session.get(Story, route_target_story_id) + if st and st.user_id == user_id: + canon = (st.canonical_markdown or "").strip() + vc = count_story_versions_sync(session, str(st.id)) + if len(canon) > max_chars or vc >= max_ver: + logger.info( + "event=append_overflow_to_new story_id={} canonical_chars={} " + "versions={} decision_source={}", + str(st.id), + len(canon), + vc, + decision_source, + ) + decision_source = "forced_new_due_to_append_limit" + else: + target_story_id = st.id + existing_for_narrative = canon + elif ( + route_decision == "new_story" + and chapter_category in APPEND_FIRST_CHAPTER_CATEGORIES + and candidate_stories + and len(oral_norm) + <= int(settings.memoir_story_route_append_guardrail_oral_chars) + ): + tid_g = default_append_target_story_id( + candidate_stories, story_meta, settings + ) + if tid_g: + st = session.get(Story, tid_g) + if st and st.user_id == user_id: + canon = (st.canonical_markdown or "").strip() + vc = count_story_versions_sync(session, str(st.id)) + if len(canon) <= max_chars and vc < max_ver: + target_story_id = st.id + existing_for_narrative = canon + decision_source = "append_guardrail_short_oral" + logger.info( + "event=story_route_append_guardrail memoir_correlation_id={} " + "chapter_category={} oral_len={} story_id={}", + memoir_correlation_id or "", + chapter_category, + len(oral_norm), + tid_g, + ) + + return target_story_id, existing_for_narrative, decision_source + + +def _execute_narrative_unit( + session: Session, + *, + oral_text: str, + evidence_text: str, + 
evidence: dict, + evidence_top_k: int, + chapter: Chapter, + chapter_category: str, + slot_snippets: dict[str, str], + user_id: str, + user_profile: str, + user_birth_year: int | None, + llm: Any, + narrative_agent: NarrativeAgent, + target_story_id: str | None, + existing_for_narrative: str, + decision_source: str, + route_decision: str, + route_type: str, + segment_ids: list[str], + category_segments: list, + background_voice: str = "default", + occupation: str = "", + memoir_correlation_id: str | None = None, +) -> tuple[str | None, bool]: + """ + Unified narrative unit executor: generate narrative, apply fidelity/safety, + persist story. Returns (story_id, is_append). + """ + t0 = time.perf_counter() + oral_norm = (oral_text or "").strip() + new_content_input = format_narrative_user_content(oral_text, evidence_text) + + raw_gen = narrative_agent.generate_narrative( + stage=chapter_category, + slots=slot_snippets, + new_content=new_content_input, + existing_content=existing_for_narrative, + user_profile=user_profile, + birth_year=user_birth_year, + llm=llm, + background_voice=background_voice, + occupation=occupation, + fallback_plain_oral=oral_norm, + ) + json_invalid = False + s0 = (raw_gen or "").strip() + if s0.startswith("{") and "paragraphs" in s0: + try: + json.loads(s0) + except json.JSONDecodeError: + json_invalid = True + + narrative_raw, fb_gate = _gate_narrative_fidelity( + oral_text, + raw_gen, + llm, + existing_canonical=existing_for_narrative or None, + ) + narrative_raw, fb_apply = _apply_narrative_fallbacks( + narrative_raw, + oral_text, + existing_for_narrative, + chapter_category=chapter_category, + ) + fallback_type = _merge_fallback_type(fb_gate, fb_apply) + if json_invalid and fallback_type == "none": + fallback_type = "json_invalid" + + md = _coalesce_story_markdown( + narrative_to_markdown(narrative_raw).strip(), + oral_text.strip(), + existing_for_narrative or "", + ) + md, inv_fb = _apply_narrative_body_safety( + md, + oral=oral_text, + 
existing_for_narrative=existing_for_narrative or "", + evidence_text=evidence_text, + chapter_category=chapter_category, + ) + if inv_fb != "none": + fallback_type = ( + inv_fb if fallback_type == "none" else f"{fallback_type}+{inv_fb}" + ) + + dlg = _dialogue_lineage_dict_for_segment_ids(category_segments, segment_ids) + + if target_story_id: + sid_s = str(target_story_id) + ver = append_story_version_sync(session, sid_s, md) + _persist_story_lineage_sync( + session, + story_id=sid_s, + version=ver, + evidence=evidence, + memoir_correlation_id=memoir_correlation_id, + top_k=evidence_top_k, + dialogue_lineage=dlg, + ) + ensure_chapter_story_link_sync( + session, chapter_id=str(chapter.id), story_id=sid_s + ) + sid_log = target_story_id + is_append = True + else: + story_title = _maybe_generate_title( + narrative_agent, + chapter_category=chapter_category, + md=md, + slot_snippets=slot_snippets, + user_profile=user_profile, + user_birth_year=user_birth_year, + llm=llm, + oral_scope=oral_norm, + ) + st = create_story_with_version_sync( + session, + user_id=user_id, + title=story_title, + canonical_markdown=md, + stage=chapter_category, + ) + ensure_chapter_story_link_sync( + session, chapter_id=str(chapter.id), story_id=str(st.id) + ) + sid_log = st.id + is_append = False + if st.current_version_id: + ver0 = session.get(StoryVersion, st.current_version_id) + if ver0: + _persist_story_lineage_sync( + session, + story_id=str(st.id), + version=ver0, + evidence=evidence, + memoir_correlation_id=memoir_correlation_id, + top_k=evidence_top_k, + dialogue_lineage=dlg, + ) + + elapsed = time.perf_counter() - t0 + logger.info( + "event=story_generated memoir_correlation_id={} route_type={} " + "decision_source={} route_decision={} " + "unit_segments={} used_evidence={} narrative_json_valid={} fidelity_passed={} " + "fallback_type={} oral_len={} md_len={} chapter_category={} is_append={} " + "story_id={} seconds={:.3f}", + memoir_correlation_id or "", + route_type, + 
decision_source, + "append_story" if is_append else "new_story", + len(segment_ids), + bool(evidence_text.strip()), + _is_json_narrative(raw_gen), + fb_gate == "none", + fallback_type, + len(oral_norm), + len(md.strip()), + chapter_category, + is_append, + sid_log, + elapsed, + ) + return str(sid_log), is_append + + def _run_batch_plan_writes( session: Session, *, @@ -640,210 +876,50 @@ def _run_batch_plan_writes( memoir_correlation_id: str | None = None, ) -> set[str]: dispatch_ids: set[str] = set() - max_chars = int(settings.story_append_max_canonical_chars) - max_ver = int(settings.story_append_max_versions) for unit in plan.units: - t0 = time.perf_counter() unit_text = _ordered_text_for_segment_ids(category_segments, unit.segment_ids) oral_unit = normalize_oral_for_memoir(unit_text, llm=llm) - ut_raw = (unit_text or "").strip() - ut_norm = (oral_unit or "").strip() - if ut_raw != ut_norm: - logger.info( - "event=oral_normalized context=batch_unit raw_len={} norm_len={}", - len(ut_raw), - len(ut_norm), - ) - new_content_input = format_narrative_user_content(oral_unit, evidence_text) - target_story_id: str | None = None - existing_for_narrative = "" - decision_source = "batch_plan" - if unit.decision == "append_story" and unit.target_story_id: - st = session.get(Story, unit.target_story_id) - if st and st.user_id == user_id: - canon = (st.canonical_markdown or "").strip() - vc = count_story_versions_sync(session, str(st.id)) - if len(canon) > max_chars or vc >= max_ver: - logger.info( - "event=append_overflow_to_new story_id={} canonical_chars={} " - "versions={} decision_source=batch_plan", - str(st.id), - len(canon), - vc, - ) - target_story_id = None - existing_for_narrative = "" - decision_source = "forced_new_due_to_append_limit" - else: - target_story_id = st.id - existing_for_narrative = canon - elif ( - unit.decision == "new_story" - and chapter_category in APPEND_FIRST_CHAPTER_CATEGORIES - and candidate_stories - and len(ut_norm) - <= 
int(settings.memoir_story_route_append_guardrail_oral_chars) - ): - tid_g = default_append_target_story_id( - candidate_stories, story_meta, settings - ) - if tid_g: - st = session.get(Story, tid_g) - if st and st.user_id == user_id: - canon = (st.canonical_markdown or "").strip() - vc = count_story_versions_sync(session, str(st.id)) - if len(canon) <= max_chars and vc < max_ver: - target_story_id = st.id - existing_for_narrative = canon - decision_source = "append_guardrail_short_oral" - logger.info( - "event=story_route_append_guardrail memoir_correlation_id={} " - "chapter_category={} oral_len={} story_id={}", - memoir_correlation_id or "", - chapter_category, - len(ut_norm), - tid_g, - ) + target_story_id, existing_for_narrative, decision_source = _resolve_append_target( + session, + route_decision=unit.decision, + route_target_story_id=unit.target_story_id, + user_id=user_id, + chapter_category=chapter_category, + oral_norm=(oral_unit or "").strip(), + candidate_stories=candidate_stories, + story_meta=story_meta, + decision_source="batch_plan", + memoir_correlation_id=memoir_correlation_id, + ) - raw_gen = narrative_agent.generate_narrative( - stage=chapter_category, - slots=slot_snippets, - new_content=new_content_input, - existing_content=existing_for_narrative, + sid, _ = _execute_narrative_unit( + session, + oral_text=oral_unit, + evidence_text=evidence_text, + evidence=evidence, + evidence_top_k=evidence_top_k, + chapter=chapter, + chapter_category=chapter_category, + slot_snippets=slot_snippets, + user_id=user_id, user_profile=user_profile, - birth_year=user_birth_year, + user_birth_year=user_birth_year, llm=llm, + narrative_agent=narrative_agent, + target_story_id=target_story_id, + existing_for_narrative=existing_for_narrative, + decision_source=decision_source, + route_decision=unit.decision, + route_type="batch", + segment_ids=list(unit.segment_ids), + category_segments=category_segments, background_voice=background_voice, occupation=occupation, - 
fallback_plain_oral=ut_norm, - ) - json_invalid = False - s0 = (raw_gen or "").strip() - if s0.startswith("{") and "paragraphs" in s0: - try: - json.loads(s0) - except json.JSONDecodeError: - json_invalid = True - - narrative_raw, fb_gate = _gate_narrative_fidelity( - oral_unit, - raw_gen, - llm, - existing_canonical=existing_for_narrative or None, - ) - narrative_raw, fb_apply = _apply_narrative_fallbacks( - narrative_raw, - oral_unit, - existing_for_narrative, - chapter_category=chapter_category, - ) - fallback_type = _merge_fallback_type(fb_gate, fb_apply) - if json_invalid and fallback_type == "none": - fallback_type = "json_invalid" - - md = _coalesce_story_markdown( - narrative_to_markdown(narrative_raw).strip(), - oral_unit.strip(), - existing_for_narrative or "", - ) - md, inv_fb = _apply_narrative_body_safety( - md, - oral=oral_unit, - existing_for_narrative=existing_for_narrative or "", - evidence_text=evidence_text, - chapter_category=chapter_category, - ) - if inv_fb != "none": - fallback_type = ( - inv_fb if fallback_type == "none" else f"{fallback_type}+{inv_fb}" - ) - - if target_story_id: - sid_s = str(target_story_id) - ver = append_story_version_sync(session, sid_s, md) - dlg = _dialogue_lineage_dict_for_segment_ids( - category_segments, list(unit.segment_ids) - ) - _persist_story_lineage_sync( - session, - story_id=sid_s, - version=ver, - evidence=evidence, - memoir_correlation_id=memoir_correlation_id, - top_k=evidence_top_k, - dialogue_lineage=dlg, - ) - dispatch_ids.add(sid_s) - ensure_chapter_story_link_sync( - session, chapter_id=str(chapter.id), story_id=sid_s - ) - sid_log = target_story_id - is_append = True - else: - story_title = _maybe_generate_title( - narrative_agent, - chapter_category=chapter_category, - md=md, - slot_snippets=slot_snippets, - user_profile=user_profile, - user_birth_year=user_birth_year, - llm=llm, - oral_scope=ut_norm, - ) - st = create_story_with_version_sync( - session, - user_id=user_id, - title=story_title, - 
canonical_markdown=md, - stage=chapter_category, - ) - dispatch_ids.add(str(st.id)) - ensure_chapter_story_link_sync( - session, chapter_id=str(chapter.id), story_id=str(st.id) - ) - sid_log = st.id - is_append = False - if st.current_version_id: - ver0 = session.get(StoryVersion, st.current_version_id) - if ver0: - dlg = _dialogue_lineage_dict_for_segment_ids( - category_segments, list(unit.segment_ids) - ) - _persist_story_lineage_sync( - session, - story_id=str(st.id), - version=ver0, - evidence=evidence, - memoir_correlation_id=memoir_correlation_id, - top_k=evidence_top_k, - dialogue_lineage=dlg, - ) - - elapsed = time.perf_counter() - t0 - logger.info( - "event=story_generated memoir_correlation_id={} route_type=batch " - "decision_source={} route_decision={} route_planned={} " - "unit_segments={} used_evidence={} narrative_json_valid={} fidelity_passed={} " - "fallback_type={} oral_len={} md_len={} chapter_category={} is_append={} " - "story_id={} seconds={:.3f} oral_normalize_changed={}", - memoir_correlation_id or "", - decision_source, - "append_story" if is_append else "new_story", - unit.decision, - len(unit.segment_ids), - bool(evidence_text.strip()), - _is_json_narrative(raw_gen), - fb_gate == "none", - fallback_type, - len(ut_norm), - len(md.strip()), - chapter_category, - is_append, - sid_log, - elapsed, - ut_raw != ut_norm, + memoir_correlation_id=memoir_correlation_id, ) + if sid: + dispatch_ids.add(sid) return dispatch_ids @@ -864,6 +940,7 @@ def run_story_pipeline_for_category_batch( """ 返回 (chapter, needs_cover_enqueue, story_ids_to_dispatch_after_commit)。 """ + pipeline_phase_timings: dict[str, float] = {} narrative_agent = NarrativeAgent() route_agent = StoryRouteAgent() dispatch_ids: set[str] = set() @@ -878,6 +955,7 @@ def run_story_pipeline_for_category_batch( top_k = int(settings.evidence_top_k_large_batch) emb = get_embedding_provider() embedding_available = emb.is_available() + _t0 = time.perf_counter() try: evidence = 
retrieve_evidence_sync( session, @@ -895,6 +973,7 @@ def run_story_pipeline_for_category_batch( "timeline_hints": [], "relevant_stories": [], } + pipeline_phase_timings["evidence"] = time.perf_counter() - _t0 logger.info( "memoir_evidence_retrieved user_id={} chunks={} facts={} summaries={} stories={} vector_ok={}", @@ -907,7 +986,9 @@ def run_story_pipeline_for_category_batch( ) evidence_text = format_evidence_chunks_for_prompt(evidence) + _t0 = time.perf_counter() oral_for_memoir = normalize_oral_for_memoir(combined_text, llm=llm) + pipeline_phase_timings["oral_normalize"] = time.perf_counter() - _t0 ct_raw = (combined_text or "").strip() om_norm = (oral_for_memoir or "").strip() if ct_raw != om_norm: @@ -959,6 +1040,7 @@ def run_story_pipeline_for_category_batch( calculated_order_index = STAGE_TO_ORDER.get(chapter_category, 999) + _t0 = time.perf_counter() use_batch_plan = ( llm and len(category_segments) >= 2 @@ -976,6 +1058,7 @@ def run_story_pipeline_for_category_batch( valid_story_ids=valid_ids, story_meta=story_meta, ) + pipeline_phase_timings["route"] = time.perf_counter() - _t0 chapter = _ensure_chapter_record( session, @@ -986,6 +1069,7 @@ def run_story_pipeline_for_category_batch( calculated_order_index=calculated_order_index, ) + _t0 = time.perf_counter() if plan is not None: dispatch_ids = _run_batch_plan_writes( session, @@ -1019,203 +1103,72 @@ def run_story_pipeline_for_category_batch( story_meta=story_meta, ) - t0 = time.perf_counter() - target_story_id: str | None = None - existing_for_narrative = "" decision_source = "fallback_no_llm" if not llm else "single_decide" - max_chars = int(settings.story_append_max_canonical_chars) - max_ver = int(settings.story_append_max_versions) - if route.decision == "append_story" and route.target_story_id: - st = session.get(Story, route.target_story_id) - if st and st.user_id == user_id: - canon = (st.canonical_markdown or "").strip() - vc = count_story_versions_sync(session, str(st.id)) - if len(canon) > 
max_chars or vc >= max_ver: - logger.info( - "event=append_overflow_to_new story_id={} canonical_chars={} " - "versions={} decision_source=single_decide", - str(st.id), - len(canon), - vc, - ) - target_story_id = None - existing_for_narrative = "" - decision_source = "forced_new_due_to_append_limit" - else: - target_story_id = st.id - existing_for_narrative = canon - elif ( - route.decision == "new_story" - and chapter_category in APPEND_FIRST_CHAPTER_CATEGORIES - and candidates - and len(om_norm) - <= int(settings.memoir_story_route_append_guardrail_oral_chars) - ): - tid_g = default_append_target_story_id(candidates, story_meta, settings) - if tid_g: - st = session.get(Story, tid_g) - if st and st.user_id == user_id: - canon = (st.canonical_markdown or "").strip() - vc = count_story_versions_sync(session, str(st.id)) - if len(canon) <= max_chars and vc < max_ver: - target_story_id = st.id - existing_for_narrative = canon - decision_source = "append_guardrail_short_oral" - logger.info( - "event=story_route_append_guardrail memoir_correlation_id={} " - "chapter_category={} oral_len={} story_id={} route_type=single", - memoir_correlation_id or "", - chapter_category, - len(om_norm), - tid_g, - ) + target_story_id, existing_for_narrative, decision_source = _resolve_append_target( + session, + route_decision=route.decision, + route_target_story_id=route.target_story_id, + user_id=user_id, + chapter_category=chapter_category, + oral_norm=om_norm, + candidate_stories=candidates, + story_meta=story_meta, + decision_source=decision_source, + memoir_correlation_id=memoir_correlation_id, + ) - raw_gen = narrative_agent.generate_narrative( - stage=chapter_category, - slots=slot_snippets, - new_content=new_content_input, - existing_content=existing_for_narrative, + sid, _ = _execute_narrative_unit( + session, + oral_text=oral_for_memoir, + evidence_text=evidence_text, + evidence=evidence, + evidence_top_k=top_k, + chapter=chapter, + chapter_category=chapter_category, + 
slot_snippets=slot_snippets, + user_id=user_id, user_profile=user_profile, - birth_year=user_birth_year, + user_birth_year=user_birth_year, llm=llm, + narrative_agent=narrative_agent, + target_story_id=target_story_id, + existing_for_narrative=existing_for_narrative, + decision_source=decision_source, + route_decision=route.decision, + route_type="single", + segment_ids=[str(s.id) for s in category_segments], + category_segments=category_segments, background_voice=background_voice, occupation=occupation, - fallback_plain_oral=om_norm, + memoir_correlation_id=memoir_correlation_id, ) - json_invalid = False - s0 = (raw_gen or "").strip() - if s0.startswith("{") and "paragraphs" in s0: - try: - json.loads(s0) - except json.JSONDecodeError: - json_invalid = True + if sid: + dispatch_ids.add(sid) - narrative_raw, fb_gate = _gate_narrative_fidelity( - oral_for_memoir, - raw_gen, - llm, - existing_canonical=existing_for_narrative or None, - ) - - narrative_raw, fb_apply = _apply_narrative_fallbacks( - narrative_raw, - oral_for_memoir, - existing_for_narrative, - chapter_category=chapter_category, - ) - fallback_type = _merge_fallback_type(fb_gate, fb_apply) - if json_invalid and fallback_type == "none": - fallback_type = "json_invalid" - - md = _coalesce_story_markdown( - narrative_to_markdown(narrative_raw).strip(), - oral_for_memoir.strip(), - existing_for_narrative or "", - ) - md, inv_fb = _apply_narrative_body_safety( - md, - oral=oral_for_memoir, - existing_for_narrative=existing_for_narrative or "", - evidence_text=evidence_text, - chapter_category=chapter_category, - ) - if inv_fb != "none": - fallback_type = ( - inv_fb if fallback_type == "none" else f"{fallback_type}+{inv_fb}" - ) - - do_append = target_story_id is not None - - dlg_single = _dialogue_lineage_dict_for_segment_ids( - category_segments, - [str(s.id) for s in category_segments], - ) - - if do_append: - sid_s = str(target_story_id) - ver = append_story_version_sync(session, sid_s, md) - 
_persist_story_lineage_sync( - session, - story_id=sid_s, - version=ver, - evidence=evidence, - memoir_correlation_id=memoir_correlation_id, - top_k=top_k, - dialogue_lineage=dlg_single, - ) - dispatch_ids.add(sid_s) - ensure_chapter_story_link_sync( - session, chapter_id=str(chapter.id), story_id=sid_s - ) - sid_log = target_story_id - is_append = True - else: - story_title = _maybe_generate_title( - narrative_agent, - chapter_category=chapter_category, - md=md, - slot_snippets=slot_snippets, - user_profile=user_profile, - user_birth_year=user_birth_year, - llm=llm, - oral_scope=om_norm, - ) - st = create_story_with_version_sync( - session, - user_id=user_id, - title=story_title, - canonical_markdown=md, - stage=chapter_category, - ) - dispatch_ids.add(str(st.id)) - ensure_chapter_story_link_sync( - session, chapter_id=str(chapter.id), story_id=str(st.id) - ) - sid_log = st.id - is_append = False - if st.current_version_id: - ver0 = session.get(StoryVersion, st.current_version_id) - if ver0: - _persist_story_lineage_sync( - session, - story_id=str(st.id), - version=ver0, - evidence=evidence, - memoir_correlation_id=memoir_correlation_id, - top_k=top_k, - dialogue_lineage=dlg_single, - ) - - elapsed = time.perf_counter() - t0 - logger.info( - "event=story_generated memoir_correlation_id={} route_type=single " - "decision_source={} route_decision={} " - "unit_segments={} used_evidence={} narrative_json_valid={} fidelity_passed={} " - "fallback_type={} oral_len={} md_len={} chapter_category={} is_append={} " - "story_id={} seconds={:.3f} oral_normalize_changed={}", - memoir_correlation_id or "", - decision_source, - route.decision, - len(category_segments), - bool(evidence_text.strip()), - _is_json_narrative(raw_gen), - fb_gate == "none", - fallback_type, - len(om_norm), - len(md.strip()), - chapter_category, - is_append, - sid_log, - elapsed, - ct_raw != om_norm, - ) + pipeline_phase_timings["narrative_writes"] = time.perf_counter() - _t0 + _t0 = time.perf_counter() 
reorder_chapter_story_links_by_life_order_sync(session, str(chapter.id)) mark_chapter_dirty_sync(session, str(chapter.id)) session.flush() refresh_chapter_evidence_snapshot_with_retry_sync(session, str(chapter.id)) + pipeline_phase_timings["finalize"] = time.perf_counter() - _t0 image_settings = MemoirImageSettings.from_env() needs_cover = image_settings.enabled and chapter_needs_cover_enqueue(chapter) + timing_parts = " ".join( + f"{k}_seconds={v:.3f}" for k, v in pipeline_phase_timings.items() + ) + logger.info( + "event=memoir_pipeline_phases memoir_correlation_id={} user_id={} " + "chapter_category={} segment_count={} route_type={} {}", + memoir_correlation_id or "", + user_id, + chapter_category, + len(category_segments), + "batch" if plan is not None else "single", + timing_parts, + ) + return chapter, needs_cover, dispatch_ids diff --git a/api/app/features/memory/service.py b/api/app/features/memory/service.py index c9c3fcf..1d5ae4f 100644 --- a/api/app/features/memory/service.py +++ b/api/app/features/memory/service.py @@ -289,10 +289,8 @@ def ingest_transcript_sync( ) embedding_provider = None - # 向量写入与 enrichment 任一失败时,此前若在**同一事务内**已开始失败的 SQL, - # 会导致 session 进入「必须 rollback」状态,进而让后续 commit 抛出 - # PendingRollbackError,污染 Celery 里共用的 `db`。 - # 用 SAVEPOINT 包一层:失败仅回滚本段,source/chunks 主体仍可由外层提交。 + # 向量写入在 SAVEPOINT 内;失败仅回滚本段,source/chunks 主体仍可由外层提交。 + # enrichment 已迁移到独立异步任务 (memory_enrichment_tasks.enrich_memory_source)。 try: with session.begin_nested(): if chunk_records and embedding_provider is not None: @@ -302,25 +300,29 @@ def ingest_transcript_sync( if emb: vectors_written += 1 update_chunk_embedding_sync(session, chunk_id, emb) - if settings.memory_enrichment_enabled: - from app.features.memory.enrichment import ( - enrich_memory_after_ingest_sync, - ) - - enrich_memory_after_ingest_sync( - session, user_id, source.id, llm=None - ) - enrichment_ok = True except Exception as e: logger.warning( - "memory embedding/enrichment 跳过(sync): {} exc_type={}", + 
"memory embedding 跳过(sync): {} exc_type={}", e, type(e).__name__, ) - if settings.memory_enrichment_enabled: - enrichment_ok = False session.commit() + + if settings.memory_enrichment_enabled: + try: + from app.tasks.memory_enrichment_tasks import enrich_memory_source + + enrich_memory_source.delay(user_id, source.id) + enrichment_ok = True + except Exception as e: + enrichment_ok = False + logger.warning( + "memory enrichment 任务派发失败: {} exc_type={}", + e, + type(e).__name__, + ) + logger.info( "event=memory_ingest_done user_id={} conversation_id={} source_id={} " "chunks={} vectors_written={} embedding_available={} enrichment_enabled={} enrichment_ok={} sync=1", diff --git a/api/app/features/story/post_commit.py b/api/app/features/story/post_commit.py index 71068f4..00ba729 100644 --- a/api/app/features/story/post_commit.py +++ b/api/app/features/story/post_commit.py @@ -43,6 +43,7 @@ class PostCommitResult: enqueued_story_image_count: int = 0 enqueued_chapter_recompose_count: int = 0 compaction_scheduled: bool = False + quality_pass_scheduled: bool = False errors: list[str] = field(default_factory=list) @@ -55,10 +56,13 @@ def enqueue_story_post_commit_effects( need_image: bool = True, need_recompose: bool = True, need_compaction: bool = False, + need_quality_pass: bool = False, compaction_extra: dict[str, Any] | None = None, + memoir_correlation_id: str | None = None, ) -> PostCommitResult: """ - story_ids 为空则跳过 image;chapter_ids 为空则跳过 recompose。 + Unified post-commit fan-out: story images, chapter recompose, compaction, + and quality pass. 
story_ids 为空则跳过 image;chapter_ids 为空则跳过 recompose。 need_compaction=True 时仅按 user_id 调度 compaction(不依赖 story/chapter 集合)。 """ result = PostCommitResult() @@ -145,4 +149,26 @@ def enqueue_story_post_commit_effects( ) result.errors.append(f"compaction:{exc}") + if need_quality_pass and settings.memoir_quality_pass_enabled and story_ids: + try: + from app.tasks.memoir_quality_pass_tasks import ( + memoir_quality_pass as quality_pass_task, + ) + + cd = int(settings.memoir_quality_pass_delay_seconds) + cast(Any, quality_pass_task).apply_async( + args=[user_id, sorted(story_ids), sorted(chapter_ids)], + kwargs={"memoir_correlation_id": memoir_correlation_id}, + countdown=max(0, cd), + ) + result.quality_pass_scheduled = True + except Exception as exc: + logger.warning( + "memoir_quality_pass enqueue failed user_id={} trigger={}: {}", + user_id, + trigger_source, + exc, + ) + result.errors.append(f"quality_pass:{exc}") + return result diff --git a/api/app/internal_main.py b/api/app/internal_main.py index 3f0bd72..179b4fb 100644 --- a/api/app/internal_main.py +++ b/api/app/internal_main.py @@ -24,7 +24,6 @@ from app.core.errors import register_exception_handlers from app.core.middleware import RequestIdMiddleware from app.features.evaluation import models as _eval_models # noqa: F401 from app.features.evaluation.router import router as eval_router -from app.features.evaluation.stream_router import router as eval_stream_router logger = get_logger(__name__) @@ -84,10 +83,12 @@ async def _startup(): logger.info("内部评测 API 启动中…") await asyncio.to_thread(run_alembic_upgrade_at_startup) try: + from app.core.celery_broker_dev import maybe_purge_celery_broker_on_startup from app.core.redis import redis_service - await redis_service.get_client() + _redis = await redis_service.get_client() logger.info("Redis 已连接(评测任务可用)") + await maybe_purge_celery_broker_on_startup(_redis) except Exception as e: logger.warning("Redis 连接失败: {}", e) @@ -104,7 +105,6 @@ async def _shutdown(): 
internal_app.include_router(eval_router, prefix="/internal/api/evaluation") -internal_app.include_router(eval_stream_router, prefix="/internal/api/evaluation") _static_dir = Path(__file__).resolve().parent.parent / "static" if _static_dir.is_dir(): diff --git a/api/app/main.py b/api/app/main.py index 670a998..9c8cbf3 100644 --- a/api/app/main.py +++ b/api/app/main.py @@ -77,10 +77,12 @@ async def startup_event(): await asyncio.to_thread(run_alembic_upgrade_at_startup) try: + from app.core.celery_broker_dev import maybe_purge_celery_broker_on_startup from app.core.redis import redis_service - await redis_service.get_client() + _redis = await redis_service.get_client() logger.info("Redis 连接已建立") + await maybe_purge_celery_broker_on_startup(_redis) except Exception as e: logger.warning("Redis 连接失败(会话存储将不可用): {}", e) diff --git a/api/app/tasks/celery_app.py b/api/app/tasks/celery_app.py index 0a61827..445246c 100644 --- a/api/app/tasks/celery_app.py +++ b/api/app/tasks/celery_app.py @@ -36,7 +36,8 @@ celery_app = Celery( "app.tasks.chapter_cover_tasks", "app.tasks.chapter_compose_tasks", "app.tasks.memory_compaction_tasks", - "app.tasks.evaluation_tasks", + "app.tasks.memory_enrichment_tasks", + "app.tasks.memoir_quality_pass_tasks", ], ) diff --git a/api/app/tasks/evaluation_tasks.py b/api/app/tasks/evaluation_tasks.py deleted file mode 100644 index afb3de8..0000000 --- a/api/app/tasks/evaluation_tasks.py +++ /dev/null @@ -1,26 +0,0 @@ -"""评测实验 Celery 任务。""" - -from __future__ import annotations - -import asyncio - -from celery import shared_task - -from app.core.logging import get_logger - -logger = get_logger(__name__) - - -@shared_task( - bind=True, - name="evaluation.run_experiment", - max_retries=1, - soft_time_limit=1800, - time_limit=2400, -) -def run_eval_experiment_task(self, experiment_id: str) -> None: - from app.features.evaluation.execution_service import execute_experiment_full - - logger.info("evaluation task start experiment_id={}", experiment_id) - 
asyncio.run(execute_experiment_full(experiment_id)) - logger.info("evaluation task done experiment_id={}", experiment_id) diff --git a/api/app/tasks/memoir_quality_pass_tasks.py b/api/app/tasks/memoir_quality_pass_tasks.py new file mode 100644 index 0000000..884a74d --- /dev/null +++ b/api/app/tasks/memoir_quality_pass_tasks.py @@ -0,0 +1,177 @@ +""" +Memoir quality pass — runs after fast draft commit to apply expensive quality +enhancements without blocking the user-visible first draft. + +Enhancements: +- Fidelity recheck on stories that skipped it during fast draft +- Title polishing for stories with placeholder titles +- LLM oral normalize rewrite (when memoir_oral_normalize_mode=llm) +""" + +import time + +from celery import shared_task +from celery.exceptions import Retry +from sqlalchemy import select +from sqlalchemy.orm import Session + +from app.agents.memoir.narrative_agent import NarrativeAgent +from app.core.config import settings +from app.core.db import get_sync_db +from app.core.dependencies import get_llm_provider +from app.core.logging import get_logger +from app.features.memoir.models import Chapter +from app.features.memoir.repo import mark_chapter_dirty_sync +from app.features.story.models import Story +from app.features.story.sync_write import append_story_version_sync + +logger = get_logger(__name__) + + +def _get_llm(): + try: + return getattr(get_llm_provider(), "langchain_llm", None) + except Exception: + return None + + +def _polish_story_title( + session: Session, + story: Story, + llm, + *, + chapter_category: str, +) -> bool: + """Re-generate title if current title is a placeholder. 
Returns True if updated.""" + from app.agents.stage_constants import CHAPTER_CATEGORIES + from app.features.memoir.story_pipeline_sync import _placeholder_title + + current = (story.title or "").strip() + placeholder = _placeholder_title(chapter_category) + if current and current != placeholder: + return False + + body = (story.canonical_markdown or "").strip() + if len(body) < settings.story_title_min_body_chars: + return False + + narrative_agent = NarrativeAgent() + content_excerpt = body[:300] + new_title = narrative_agent.generate_title( + stage=chapter_category, + emotion="neutral", + slots={"content_excerpt": content_excerpt}, + user_profile="", + birth_year=None, + llm=llm, + ) + new_title = (new_title or "").strip() + if not new_title or new_title == placeholder: + return False + + story.title = new_title + return True + + +@shared_task(bind=True, max_retries=2, default_retry_delay=30) +def memoir_quality_pass( + self, + user_id: str, + story_ids: list[str], + chapter_ids: list[str], + memoir_correlation_id: str | None = None, +): + """ + Post-draft quality pass: polish titles, recheck fidelity on flagged stories. + Runs asynchronously after the fast draft is committed and visible. 
+ """ + if not settings.memoir_quality_pass_enabled: + return {"status": "disabled"} + + t0 = time.perf_counter() + logger.info( + "event=quality_pass_start user_id={} stories={} chapters={} " + "memoir_correlation_id={}", + user_id, + len(story_ids), + len(chapter_ids), + memoir_correlation_id or "", + ) + + try: + llm = _get_llm() + if not llm: + logger.warning("event=quality_pass_no_llm user_id={}", user_id) + return {"status": "no_llm"} + + titles_polished = 0 + chapters_dirtied: set[str] = set() + + with get_sync_db() as db: + for sid in story_ids: + story = db.get(Story, sid) + if not story or story.user_id != user_id: + continue + + chapter_category = story.stage or "summary" + if _polish_story_title( + db, story, llm, chapter_category=chapter_category + ): + titles_polished += 1 + stmt = ( + select(Chapter.id) + .where( + Chapter.user_id == user_id, + Chapter.category == chapter_category, + Chapter.is_active == True, # noqa: E712 + ) + ) + ch_id = db.execute(stmt).scalar_one_or_none() + if ch_id: + chapters_dirtied.add(str(ch_id)) + + for ch_id in chapters_dirtied: + mark_chapter_dirty_sync(db, ch_id) + + if titles_polished > 0: + db.commit() + + elapsed = time.perf_counter() - t0 + logger.info( + "event=quality_pass_done user_id={} titles_polished={} " + "chapters_dirtied={} seconds={:.3f} memoir_correlation_id={}", + user_id, + titles_polished, + len(chapters_dirtied), + elapsed, + memoir_correlation_id or "", + ) + + if chapters_dirtied: + from app.tasks.chapter_compose_tasks import ( + recompose_chapter as recompose_chapter_task, + ) + + for ch_id in sorted(chapters_dirtied): + try: + recompose_chapter_task.apply_async(args=[ch_id], countdown=2) + except Exception as exc: + logger.warning( + "quality_pass recompose enqueue failed chapter={}: {}", + ch_id, + exc, + ) + + return { + "status": "success", + "titles_polished": titles_polished, + "chapters_dirtied": len(chapters_dirtied), + } + + except Retry: + raise + except Exception as e: + logger.error( 
+ "event=quality_pass_failed user_id={} exc={}", user_id, e + ) + raise self.retry(exc=e) from e diff --git a/api/app/tasks/memoir_tasks.py b/api/app/tasks/memoir_tasks.py index a85d3e2..afe88e9 100644 --- a/api/app/tasks/memoir_tasks.py +++ b/api/app/tasks/memoir_tasks.py @@ -3,6 +3,7 @@ """ import json +import time import uuid from datetime import datetime, timezone from typing import Dict, List, Set @@ -67,6 +68,56 @@ logger = get_logger(__name__) _REDIS_CLIENTS: dict[bool, redis.Redis] = {} +def _run_post_pipeline_commit( + *, + user_id: str, + story_dispatch_ids: set[str], + recompose_chapter_ids: set[str], + cover_chapter_ids: set[str], + trigger_source: str, + need_compaction: bool, + need_quality_pass: bool = False, + memoir_correlation_id: str | None = None, + compaction_extra: dict | None = None, +) -> None: + """Shared post-commit dispatch: images, recompose, compaction, quality pass, covers.""" + from app.features.story.post_commit import enqueue_story_post_commit_effects + + pc = enqueue_story_post_commit_effects( + user_id=user_id, + story_ids=set(story_dispatch_ids), + chapter_ids=recompose_chapter_ids, + trigger_source=trigger_source, + need_compaction=need_compaction, + need_quality_pass=need_quality_pass, + memoir_correlation_id=memoir_correlation_id, + compaction_extra=compaction_extra, + ) + logger.info( + "event=story_post_commit user_id={} trigger={} " + "enqueued_story_image_count={} enqueued_chapter_recompose_count={} " + "compaction_scheduled={} quality_pass_scheduled={} errors={}", + user_id, + trigger_source, + pc.enqueued_story_image_count, + pc.enqueued_chapter_recompose_count, + pc.compaction_scheduled, + pc.quality_pass_scheduled, + pc.errors, + ) + + if cover_chapter_ids: + image_settings = MemoirImageSettings.from_env() + if image_settings.enabled: + from app.tasks.chapter_cover_enqueue import ( + try_enqueue_generate_chapter_cover, + ) + + for ch_id in sorted(cover_chapter_ids): + if try_enqueue_generate_chapter_cover(ch_id, 
source=trigger_source): + logger.info("派发章节封面任务: chapter={}", ch_id) + + def _get_llm(): """Celery 任务内获取 LangChain LLM(通过 port)""" try: @@ -352,6 +403,7 @@ def process_memoir_phase2( cid = effective_correlation_id( explicit=memoir_correlation_id, celery_task_id=str(task_id) ) + phase2_t0 = time.perf_counter() logger.info( "event=memoir_phase2_start user_id={} task_id={} chapter_category={} " "memoir_correlation_id={}", @@ -408,9 +460,11 @@ def process_memoir_phase2( chapters_to_enqueue: Set[str] = set() affected_chapter_ids: Set[str] = set() + lock_t0 = time.perf_counter() lock_handle = _acquire_chapter_lock( user_id, chapter_category, ttl_seconds=_chapter_lock_ttl() ) + lock_elapsed = time.perf_counter() - lock_t0 if lock_handle is None: logger.warning( "event=memoir_phase2_lock_busy user_id={} chapter_category={}", @@ -426,6 +480,7 @@ def process_memoir_phase2( return {"status": "noop"} state = get_or_create_state_sync(user_id, db) + pipeline_t0 = time.perf_counter() chapter, needs_cover, disp = run_story_pipeline_for_category_batch( db, user_id=user_id, @@ -439,6 +494,7 @@ def process_memoir_phase2( occupation=user_occupation, memoir_correlation_id=cid, ) + pipeline_elapsed = time.perf_counter() - pipeline_t0 story_dispatch_ids |= disp db.flush() if chapter is None: @@ -489,16 +545,15 @@ def process_memoir_phase2( db.commit() - from app.features.story.post_commit import ( - enqueue_story_post_commit_effects, - ) - - pc = enqueue_story_post_commit_effects( + _run_post_pipeline_commit( user_id=user_id, - story_ids=set(story_dispatch_ids), - chapter_ids=affected_chapter_ids, + story_dispatch_ids=story_dispatch_ids, + recompose_chapter_ids=affected_chapter_ids, + cover_chapter_ids=chapters_to_enqueue, trigger_source="pipeline_phase2", need_compaction=True, + need_quality_pass=True, + memoir_correlation_id=cid, compaction_extra={ "pipeline_run_id": str(task_id), "memoir_correlation_id": cid, @@ -507,35 +562,21 @@ def process_memoir_phase2( "chapter_category": 
chapter_category, }, ) - logger.info( - "event=story_post_commit user_id={} trigger=pipeline_phase2 " - "enqueued_story_image_count={} enqueued_chapter_recompose_count={} " - "compaction_scheduled={} errors={}", - user_id, - pc.enqueued_story_image_count, - pc.enqueued_chapter_recompose_count, - pc.compaction_scheduled, - pc.errors, - ) - - from app.tasks.chapter_cover_enqueue import ( - try_enqueue_generate_chapter_cover, - ) - - for chapter_id in sorted(chapters_to_enqueue): - if try_enqueue_generate_chapter_cover( - chapter_id, source="pipeline_phase2" - ): - logger.info(f"派发章节封面任务: chapter={chapter_id}") + phase2_elapsed = time.perf_counter() - phase2_t0 logger.info( "event=memoir_phase2_done user_id={} task_id={} chapter_category={} " - "segment_count={} memoir_correlation_id={}", + "segment_count={} memoir_correlation_id={} " + "lock_seconds={:.3f} pipeline_seconds={:.3f} " + "phase2_total_seconds={:.3f}", user_id, task_id, chapter_category, len(category_segments), cid, + lock_elapsed, + pipeline_elapsed, + phase2_elapsed, ) return { "status": "success", @@ -574,6 +615,7 @@ def process_memoir_phase1(self, user_id: str, segment_ids: List[str]): memoir_correlation_id, ) _update_task_status_sync(user_id, task_id, "running") + phase1_t0 = time.perf_counter() try: with get_sync_db() as db: @@ -595,6 +637,7 @@ def process_memoir_phase1(self, user_id: str, segment_ids: List[str]): ) return {"status": "no_segments"} + ingest_t0 = time.perf_counter() for seg in segments: conv_id = getattr(seg, "conversation_id", None) or "" text = (seg.user_input_text or "").strip() @@ -629,15 +672,17 @@ def process_memoir_phase1(self, user_id: str, segment_ids: List[str]): e, type(e).__name__, ) + ingest_elapsed = time.perf_counter() - ingest_t0 llm = _get_llm() - llm_fast = _get_llm_fast() + llm_fast = _get_llm_fast() or llm if (settings.llm_fast_model or "").strip(): logger.info( "event=llm_fast_tier_used pipeline=memoir_prepare_batches model={}", settings.llm_fast_model, ) + 
prep_t0 = time.perf_counter() memoir_orchestrator = MemoirOrchestrator() prepared = memoir_orchestrator.prepare_batches( segments=list(segments), @@ -654,6 +699,7 @@ def process_memoir_phase1(self, user_id: str, segment_ids: List[str]): memoir_batch=True, ), ) + prep_elapsed = time.perf_counter() - prep_t0 skip_ids = prepared.segment_skip_story_ids missing_cat = [ @@ -709,6 +755,7 @@ def process_memoir_phase1(self, user_id: str, segment_ids: List[str]): _schedule_phase2_timeout(user_id, cc, memoir_correlation_id) categories_processed = sorted(prepared.category_to_segments.keys()) + phase1_elapsed = time.perf_counter() - phase1_t0 _update_task_status_sync( user_id, task_id, @@ -721,12 +768,17 @@ def process_memoir_phase1(self, user_id: str, segment_ids: List[str]): ) logger.info( "event=memoir_phase1_done user_id={} task_id={} segment_count={} " - "categories={} memoir_correlation_id={}", + "categories={} memoir_correlation_id={} " + "memory_ingest_seconds={:.3f} prepare_batches_seconds={:.3f} " + "phase1_total_seconds={:.3f}", user_id, task_id, len(segments), categories_processed, memoir_correlation_id, + ingest_elapsed, + prep_elapsed, + phase1_elapsed, ) return { "status": "success", @@ -818,38 +870,22 @@ def generate_chapter_content(self, user_id: str, stage: str, new_content: str): db.commit() db.refresh(chapter) - from app.features.story.post_commit import enqueue_story_post_commit_effects - ch_ids: set[str] = {str(chapter.id)} - pc = enqueue_story_post_commit_effects( + cover_ids = ( + ch_ids + if chapter_needs_cover_enqueue(chapter) + else set() + ) + _run_post_pipeline_commit( user_id=user_id, - story_ids=set(dispatch_ids), - chapter_ids=ch_ids, - trigger_source="pipeline", + story_dispatch_ids=set(dispatch_ids), + recompose_chapter_ids=ch_ids, + cover_chapter_ids=cover_ids, + trigger_source="pipeline_generate_chapter", need_compaction=False, + need_quality_pass=True, + memoir_correlation_id=cid, ) - logger.info( - "event=story_post_commit user_id={} 
trigger=pipeline_generate_chapter " - "enqueued_story_image_count={} enqueued_chapter_recompose_count={} " - "compaction_scheduled={} errors={}", - user_id, - pc.enqueued_story_image_count, - pc.enqueued_chapter_recompose_count, - pc.compaction_scheduled, - pc.errors, - ) - - image_settings = MemoirImageSettings.from_env() - if ( - image_settings.enabled - and chapter - and chapter_needs_cover_enqueue(chapter) - ): - from app.tasks.chapter_cover_enqueue import ( - try_enqueue_generate_chapter_cover, - ) - - try_enqueue_generate_chapter_cover(chapter.id, source="pipeline") return {"status": "success"} except Retry: diff --git a/api/app/tasks/memory_enrichment_tasks.py b/api/app/tasks/memory_enrichment_tasks.py new file mode 100644 index 0000000..f08ed45 --- /dev/null +++ b/api/app/tasks/memory_enrichment_tasks.py @@ -0,0 +1,45 @@ +""" +Memory enrichment Celery task — runs asynchronously after ingest to generate +summaries, facts, and timeline events without blocking the memoir hot path. +""" + +from celery import shared_task +from sqlalchemy.orm import Session + +from app.core.config import settings +from app.core.db import get_sync_db +from app.core.logging import get_logger + +logger = get_logger(__name__) + + +@shared_task(bind=True, max_retries=2, default_retry_delay=30) +def enrich_memory_source(self, user_id: str, source_id: str): + """ + Post-ingest enrichment: session summary, rolling summary, facts, timeline. + Runs outside the memoir Phase1 hot path so narrative generation isn't blocked. 
+ """ + if not settings.memory_enrichment_enabled: + return {"status": "disabled"} + + try: + with get_sync_db() as db: + from app.features.memory.enrichment import enrich_memory_after_ingest_sync + + enrich_memory_after_ingest_sync(db, user_id, source_id, llm=None) + db.commit() + logger.info( + "event=memory_enrichment_done user_id={} source_id={}", + user_id, + source_id, + ) + return {"status": "success", "source_id": source_id} + except Exception as e: + logger.warning( + "event=memory_enrichment_failed user_id={} source_id={} exc={} exc_type={}", + user_id, + source_id, + e, + type(e).__name__, + ) + raise self.retry(exc=e) from e diff --git a/api/development.sh b/api/development.sh index a869a19..eca1b62 100755 --- a/api/development.sh +++ b/api/development.sh @@ -16,6 +16,10 @@ PYTHON_BIN="${VENV_DIR}/bin/python" UVICORN_BIN="${VENV_DIR}/bin/uvicorn" CELERY_BIN="${VENV_DIR}/bin/celery" +# 本地全栈:默认可在 API 启动时 purge Celery 队列;生产请勿使用此脚本 +export APP_ENV="${APP_ENV:-development}" +export CELERY_PURGE_BROKER_ON_STARTUP="${CELERY_PURGE_BROKER_ON_STARTUP:-1}" + API_HOST="${API_HOST:-0.0.0.0}" API_PORT="${API_PORT:-8000}" CELERY_POOL="${CELERY_POOL:-solo}" diff --git a/api/docker-compose.yml b/api/docker-compose.yml index e5e2030..a6178e8 100644 --- a/api/docker-compose.yml +++ b/api/docker-compose.yml @@ -65,6 +65,7 @@ services: environment: - ASR_MODEL_CACHE_DIR=/app/models/whisper - ALEMBIC_STARTUP_FAIL_FAST=true + - APP_ENV=production volumes: - /root/apiclient_key.pem:/app/certs/apiclient_key.pem:ro restart: always @@ -97,6 +98,8 @@ services: command: uv run celery -A app.tasks.celery_app worker --loglevel=info --concurrency=4 env_file: - .env + environment: + - APP_ENV=production restart: always depends_on: postgres: @@ -128,6 +131,8 @@ services: command: uv run celery -A app.tasks.celery_app beat --loglevel=info env_file: - .env + environment: + - APP_ENV=production restart: always depends_on: postgres: diff --git a/api/docs/internal-eval.md 
b/api/docs/internal-eval.md index 995d3ee..e0e315b 100644 --- a/api/docs/internal-eval.md +++ b/api/docs/internal-eval.md @@ -34,14 +34,19 @@ SKIP_INFRA=1 SKIP_INSTALL=1 EVAL_ATTACH_ONLY=1 ./internal-eval.sh cd api export INTERNAL_EVAL_API_KEY='your-long-random-secret' export INTERNAL_EVAL_ENABLE_DOCS=1 # 可选,开 /docs -# GLM-5 评审(默认复用智谱 key,也可单独配置) -export EVAL_JUDGE_API_KEY='...' # 可选,默认 ZHIPU_API_KEY -export EVAL_JUDGE_MODEL='glm-5' # 与 Settings 默认一致 +# 评测评审(Playground / Memoir 手动的对话与成稿打分) +# 智谱:默认 EVAL_JUDGE_API_KEY,否则回退 ZHIPU_API_KEY +export EVAL_JUDGE_API_KEY='...' # 可选 +export EVAL_JUDGE_MODEL='glm-5' +# DeepSeek(API 模型名 deepseek-reasoner 即 R1):与访谈主链路密钥一致,独立默认模型名 +export DEEPSEEK_API_KEY='...' # 选用 DeepSeek 评审时必填(或回退 LLM_API_KEY) +export EVAL_JUDGE_DEEPSEEK_MODEL='deepseek-reasoner' # 可选 +export EVAL_JUDGE_DEEPSEEK_CONTEXT_WINDOW_TOKENS='64000' # 可选;用于 transcript 截断,避免按 GLM 200K 估长 uv run uvicorn app.internal_main:internal_app --host 0.0.0.0 --port 8001 ``` -Celery worker 需已包含 `app.tasks.evaluation_tasks`(仓库 `celery_app.include` 已注册)。跑实验前: +Celery worker 与主站共用(`celery_app` 已 `include` 回忆录等任务;**不再**包含已下线的 `evaluation_tasks` 实验批量跑批)。需 Phase1 / 叙事推进时请启动 worker: ```bash uv run celery -A app.tasks.celery_app worker -l info @@ -57,14 +62,20 @@ VITE_EVAL_API_BASE=http://127.0.0.1:8001 VITE_EVAL_API_KEY=与上同 npm run dev 或使用仓库根目录 `npm run eval-web`(需本地已 `npm install` 在 `app-eval-web`)。 -## SSE / EventSource +## 流式评审 -浏览器 `EventSource` 无法带自定义 Header,流式端点支持 **query** `?key=`,与 `X-Internal-Eval-Key` 等效。 +`POST /internal/api/evaluation/judge/conversation-stream` 使用 **fetch 读取 SSE**(chunk),请求头携带 `X-Internal-Eval-Key` 即可;不要求浏览器 `EventSource`。Body 可选 **`judge_provider`**:`zhipu`(默认)| `deepseek`,以及 **`judge_model`**(空则用该供应商环境默认)。首轮 `meta` 事件会回显 `judge_provider` / `judge_model`。 -## 评测 Web:两大模块 +新增事件: -- **对话评测**:选 `api/tests/user_exports/*.md` 为基准 →「新建评测会话」或填写已有 `conversation_id` →「执行回放」→「GLM-5 评审对话」。 -- **回忆录章节**:同一套 fixture 会带上导出 MD 中的 `source_user_id` 与 
`memoir_sections`;「刷新库中章节/故事」拉 DB 快照 →「GLM-5 评审章节」(基线节选与当前成稿一并送评)。 +- `compare_summary`:结构化 A/B 对比摘要,包含 `group_deltas`、关键回落维度、是否出现重复盘问风险,以及 transcript 截断提示。 +- `compare_delta`:原有自由文案流,适合人读;不替代结构化结论。 + +## 评测 Web(`app-eval-web`) + +- **Playground · 分步测评**:选用户导出 MD 为基线 → `eval-sandbox` + 逐轮 `replay/conversation`(**`skip_memoir: true`** 时只做对话)→ **`memoir-submit`** 再可选轮询 **`memoir-phase1-ready`** → 跳转 **Memoir / Stories** 看成稿;支持 **智谱 / DeepSeek R1** 对话流式评分(工具栏「评审模型」)。 +- **Memoir**:按 `user_id` 拉库中章节快照与基线对照评审。 +- **Stories**:故事列表与评审。 ## 真实链路透传回放(与 App 一致) @@ -72,14 +83,36 @@ VITE_EVAL_API_BASE=http://127.0.0.1:8001 VITE_EVAL_API_KEY=与上同 npm run dev |------|------|------| | `POST` | `/internal/api/evaluation/sessions/eval-sandbox` | 无 body:新建**临时用户**(`eval_` 伪手机号)+ 空白 `conversation_id` | | `POST` | `/internal/api/evaluation/sessions/replay-bootstrap` | body:`{ "user_id" }`,在已有用户下返回新 `conversation_id` | -| `POST` | `/internal/api/evaluation/replay/conversation` | body:`conversation_id`、`fixture_filename` **或** `user_utterances`;可选 `flush_memoir_after`(默认 true)、`skip_tts`(默认 true)。响应增加 `segment_ids`(本批创建的用户 segment,顺序与处理一致) | -| `GET` | `/internal/api/evaluation/sessions/{conversation_id}/memoir-phase1-ready` | query:`segment_ids` 可重复(`?segment_ids=id1&segment_ids=id2`)。当所列 segment 均已写入 `topic_category`(Phase1 完成)时返回 `ready: true` | +| `POST` | `/internal/api/evaluation/replay/conversation` | body:`conversation_id`、`fixture_filename` **或** `user_utterances`;可选 **`skip_memoir`**(默认 false;为 true 时不 `queue_message`、且不会仅因 `flush_memoir_after` 而 `flush_pending`)、`flush_memoir_after`(默认 true)、`skip_tts`(默认 true)。响应含 `segment_ids`(本批创建的用户 segment) | +| `POST` | `/internal/api/evaluation/sessions/{conversation_id}/memoir-submit` | 无 body:收集本会话内 `topic_category IS NULL` 且 `processed` 为 false 的 segment,调用 `flush_pending(user_id, extra_segment_ids=…)`;返回 `segment_ids`、`celery_task_id` | +| `GET` | `/internal/api/evaluation/sessions/{conversation_id}/memoir-phase1-ready` | 
query:`segment_ids` 可重复。所列 segment 均已写入 `topic_category` 时 `ready: true` | -每轮等价于 WebSocket 文本路径:`create_user_segment` → `process_user_message`(内部可 `force_skip_tts`)→ `background_runner.queue_message`。 +**默认(`skip_memoir: false`)**:每轮仍相当于主站路径:`create_user_segment` → `process_user_message` → `background_runner.queue_message`;末尾可 `flush_pending`。 -- **TTS**:回放默认 `skip_tts: true`,不在评测台跑语音合成。 -- **Memory / 回忆录管线**:`queue_message` 与末尾 `flush_pending` 依赖 **Celery worker**(`process_memoir_phase1` 等);仅起 internal API 未起 worker 时,对话会落库但章节异步不会推进。 -- **app-eval-web Playground**:默认在**每轮**回放后轮询 `memoir-phase1-ready`,再发送下一轮;单轮等待默认最长 **10 分钟**(环境变量 `VITE_MEMOIR_PHASE1_WAIT_MAX_MS` 覆盖)。需 worker 正常消费任务。可通过「等待 Phase1」勾选关闭以做快速冒烟。中断或 Phase1 超时会将进度写入浏览器 localStorage,可在同一基线下用「继续未完成重放」接续同一 `conversation_id`(含先补完未就绪的 segment)。 +**Playground 分步(`skip_memoir: true` + `flush_memoir_after: false`)**:只做 `create_user_segment` 与 `process_user_message`,**不**入回忆录队列;对话结束后再调 **`memoir-submit`** 统一 flush。 + +- **TTS**:回放默认 `skip_tts: true`。 +- **Celery**:Phase1 / 叙事仍依赖 worker;仅起 HTTP 未起 worker 时,`memoir-submit` 后任务会堆积。 +- **Playground**:第 2 步可选轮询 `memoir-phase1-ready`(前端默认最长约 **10 分钟**,`VITE_MEMOIR_PHASE1_WAIT_MAX_MS` 可覆盖)。中断时本地草稿可「继续未完成重放」接续同一 `conversation_id`(仅对话进度;旧版「每轮等待 Phase1」草稿会被跳过并提示改走 `memoir-submit`)。 + +## A/B 发布口径(追平 A / 超过 A) + +Playground 的结构化摘要里,后端会给出一份 `gate`: + +- `regressed`:仍明显落后 A,或 `context_memory` / `emotion_carry` 等关键项明显回落,或再次出现“重复盘问 / 忽略已答信息”。 +- `parity`:总分基本追平 A,且关键维度未明显退步。 +- `surpass`:总分显著高于 A,同时 `context_memory`、人物建模等关键项不退步,且未出现重复盘问风险。 + +建议发布前不要只看单个 case: + +1. 先固定一组 **黄金样本 fixture**(覆盖童年、求学、职业、家庭、价值观,以及长对话样本)。 +2. 每次 prompt / state / anti-repeat 改动后,用同一组 fixture 全量重放。 +3. 
要求整组样本里: + - 不得出现 `regressed` 的受保护样本; + - 大多数样本至少达到 `parity`; + - 目标样本才以 `surpass` 作为升级完成标志。 + +如果 `compare_summary.truncation.*_truncated_for_compare = true`,说明 A/B prose 只看到了截断 transcript;此时应结合逐轮评分与关键样本人工复核,而不要直接用单次 prose 文案做发布决策。 ## 手动 GLM-5(不写 `eval_runs` 表) @@ -93,7 +126,7 @@ VITE_EVAL_API_BASE=http://127.0.0.1:8001 VITE_EVAL_API_KEY=与上同 npm run dev **产品与 tier 口径(strict / partial / fallback)、synthetic vs library 分表、PM 对齐规则、backlog** 见同目录 **[traceable-memoir-lineage.md](./traceable-memoir-lineage.md)**。 -手动 `/judge/memoir-chapters` 与自动化 `eval_runs.judge_bundle_json` 已按 **artifact 绑定证据** 组 prompt,而不再默认拼接「最近 N 个会话全文」: +手动 `/judge/memoir-chapters` 与历史自动化 run 的 `judge_bundle_json` 已按 **artifact 绑定证据** 组 prompt,而不再默认拼接「最近 N 个会话全文」: - **`lineage_tier`**:`strict` / `partial` / `fallback`(章节:**有可解析 transcript 链 + 结构化记忆为 strict**;**仅有结构化记忆、无绑定 segment/transcript = partial**,与标注口径一致)。故事侧以 `StoryEvidenceLink` 与章节推导为主;`fallback` = 显式降级最近会话 transcript,避免静默当 strict。 - **`evidence_trace`**:bundle 完整 JSON(segment / conversation / chunk / fact / timeline / summary、`notes` 等)。内审计一般够用;若需按类型深链 UI 再排期。 @@ -111,9 +144,3 @@ VITE_EVAL_API_BASE=http://127.0.0.1:8001 VITE_EVAL_API_KEY=与上同 npm run dev - `source_user_id`:导出抬头中的 User ID - `memoir_sections`:`## 回忆录章节(生成正文)` 下按标题切分的基线正文(已去掉 `{{IMAGE:...}}` 占位) -## 门禁规则(v1) - -- 所有 case 的合成均分:候选须 **严格高于** 基线。 -- `is_protected=true` 的 case:合成份跌幅不得超过 `EVAL_GATE_PROTECTED_REGRESSION_THRESHOLD`(默认 2 分)。 - -结果写入 `eval_gate_verdicts`,不影响 `git`;后续可接 pre-commit / CI。 diff --git a/api/tests/evaluation/test_internal_router_auth.py b/api/tests/evaluation/test_internal_router_auth.py index 1066afb..3d6b7b2 100644 --- a/api/tests/evaluation/test_internal_router_auth.py +++ b/api/tests/evaluation/test_internal_router_auth.py @@ -4,11 +4,12 @@ import pytest from httpx import ASGITransport, AsyncClient from app.features.evaluation.internal_auth import get_internal_eval_principal -from app.features.evaluation.router import router @pytest.mark.asyncio -async 
def test_internal_eval_list_sets_requires_config(monkeypatch: pytest.MonkeyPatch): +async def test_internal_eval_list_fixtures_requires_config( + monkeypatch: pytest.MonkeyPatch, +): from fastapi import FastAPI monkeypatch.setattr( @@ -16,16 +17,20 @@ async def test_internal_eval_list_sets_requires_config(monkeypatch: pytest.Monke "", raising=False, ) + from app.features.evaluation.router import router + app = FastAPI() app.include_router(router, prefix="/internal/api/evaluation") transport = ASGITransport(app=app) async with AsyncClient(transport=transport, base_url="http://t") as client: - r = await client.get("/internal/api/evaluation/regression-sets") + r = await client.get("/internal/api/evaluation/fixtures/user-exports") assert r.status_code == 503 @pytest.mark.asyncio -async def test_internal_eval_with_override_lists_empty(monkeypatch: pytest.MonkeyPatch): +async def test_internal_eval_with_override_lists_fixtures( + monkeypatch: pytest.MonkeyPatch, +): from fastapi import FastAPI monkeypatch.setattr( @@ -33,6 +38,17 @@ async def test_internal_eval_with_override_lists_empty(monkeypatch: pytest.Monke "secret", raising=False, ) + def _empty_fixtures() -> list[str]: + return [] + + monkeypatch.setattr( + "app.features.evaluation.admin_service.list_user_export_md_filenames", + _empty_fixtures, + raising=False, + ) + + from app.features.evaluation.router import router + app = FastAPI() app.include_router(router, prefix="/internal/api/evaluation") @@ -42,24 +58,12 @@ async def test_internal_eval_with_override_lists_empty(monkeypatch: pytest.Monke return InternalEvalPrincipal() app.dependency_overrides[get_internal_eval_principal] = _override_auth - from app.core.db import get_async_db - from unittest.mock import AsyncMock, MagicMock - - mock_session = AsyncMock() - mock_result = MagicMock() - mock_result.scalars.return_value.unique.return_value.all.return_value = [] - mock_session.execute = AsyncMock(return_value=mock_result) - - async def _db(): - yield 
mock_session - - app.dependency_overrides[get_async_db] = _db transport = ASGITransport(app=app) async with AsyncClient(transport=transport, base_url="http://t") as client: r = await client.get( - "/internal/api/evaluation/regression-sets", + "/internal/api/evaluation/fixtures/user-exports", headers={"X-Internal-Eval-Key": "secret"}, ) assert r.status_code == 200 - assert r.json() == [] + assert r.json() == {"items": []} diff --git a/api/tests/test_eval_composite.py b/api/tests/test_eval_composite.py index 94a034e..d9e1ece 100644 --- a/api/tests/test_eval_composite.py +++ b/api/tests/test_eval_composite.py @@ -1,6 +1,6 @@ """评测合成分:评审缺失侧不得被当作 0 分。""" -from app.features.evaluation.execution_service import _composite +from app.features.evaluation.composite_score import _composite def test_composite_none_when_both_missing() -> None: diff --git a/api/tests/test_eval_judge_llm_spec.py b/api/tests/test_eval_judge_llm_spec.py new file mode 100644 index 0000000..6e3482d --- /dev/null +++ b/api/tests/test_eval_judge_llm_spec.py @@ -0,0 +1,53 @@ +"""评测评审 LLM 装配:多供应商与上下文预算。""" + +import pytest + +from app.core.config import settings +from app.core.dependencies import build_eval_judge_llm_spec +from app.features.evaluation.judge_service import ( + eval_judge_compare_transcript_each_max_chars_for_context, + eval_judge_conversation_transcript_max_chars_for_context, +) + + +def test_build_eval_judge_zhipu_uses_bigmodel_defaults(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(settings, "eval_judge_api_key", "") + monkeypatch.setattr(settings, "zhipu_api_key", "z-test") + monkeypatch.setattr(settings, "eval_judge_model", "glm-5") + spec = build_eval_judge_llm_spec("zhipu", None) + assert spec is not None + assert spec.provider == "zhipu" + assert spec.resolved_model == "glm-5" + assert spec.llm is not None + assert spec.context_window_tokens == settings.eval_judge_context_window_tokens + + +def test_build_eval_judge_zhipu_request_model_override(monkeypatch: 
pytest.MonkeyPatch) -> None: + monkeypatch.setattr(settings, "eval_judge_api_key", "e-test") + monkeypatch.setattr(settings, "eval_judge_model", "glm-5") + spec = build_eval_judge_llm_spec("zhipu", "glm-4-plus") + assert spec is not None + assert spec.resolved_model == "glm-4-plus" + + +def test_build_eval_judge_deepseek_requires_key(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(settings, "deepseek_api_key", "") + monkeypatch.setattr(settings, "llm_api_key", "") + assert build_eval_judge_llm_spec("deepseek", None) is None + + +def test_build_eval_judge_deepseek_context_budget(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(settings, "deepseek_api_key", "d-test") + monkeypatch.setattr(settings, "eval_judge_deepseek_model", "deepseek-reasoner") + monkeypatch.setattr(settings, "eval_judge_deepseek_context_window_tokens", 64_000) + spec = build_eval_judge_llm_spec("deepseek", None) + assert spec is not None + assert spec.provider == "deepseek" + assert spec.resolved_model == "deepseek-reasoner" + assert spec.context_window_tokens == 64_000 + n = eval_judge_conversation_transcript_max_chars_for_context(64_000) + glm_n = eval_judge_conversation_transcript_max_chars_for_context(200_000) + assert n < glm_n + each_ds = eval_judge_compare_transcript_each_max_chars_for_context(64_000) + each_glm = eval_judge_compare_transcript_each_max_chars_for_context(200_000) + assert each_ds < each_glm diff --git a/api/tests/test_interview_prompts.py b/api/tests/test_interview_prompts.py index ec4cb6d..def0aa2 100644 --- a/api/tests/test_interview_prompts.py +++ b/api/tests/test_interview_prompts.py @@ -2,6 +2,11 @@ from langchain_core.messages import AIMessage, HumanMessage, SystemMessage +from app.agents.chat.interview_state_hints import ( + apply_duplicate_question_guard, + extract_scene_cues, +) +from app.agents.state_schema import KnownFact, MemoirStateSchema, PersonaThread, default_slots from app.agents.chat.helpers import format_history_string from 
app.agents.chat.personas import normalize_interview_persona from app.agents.chat.prompts_conversation import ( @@ -32,7 +37,7 @@ def test_guided_prompt_does_not_embed_raw_user_message_in_system_text(): assert "__USER_SECRET_PROFILE__" in p2 -def test_guided_prompt_mentions_empathy_and_self_judgment(): +def test_guided_prompt_mentions_empathy_and_scene_strategy(): p = get_guided_conversation_prompt( current_stage="childhood", empty_slots=["place"], @@ -41,10 +46,10 @@ def test_guided_prompt_mentions_empathy_and_self_judgment(): user_profile_context="", persona="default", ) - assert "接住对方" in p - assert "你自己判断" in p or "该追问" in p - assert "共情与轻量自我表露" in p - assert "意义向深挖" in p + assert "接住" in p + assert "画面" in p or "细节" in p + assert "深挖" in p + assert "串联" in p def test_guided_prompt_era_popculture_open_questions_when_birth_year(): @@ -132,6 +137,85 @@ def test_guided_prompt_military_tone_in_system(): assert "简洁" in p or "利落" in p or "得体" in p +def test_guided_prompt_includes_known_facts_persona_threads_and_recent_questions(): + p = get_guided_conversation_prompt( + current_stage="career", + empty_slots=["job", "decision"], + filled_slots={"growth": "越做越确定自己适合产品"}, + detected_user_stage="career", + user_profile_context="", + persona="default", + known_facts=[ + KnownFact(label="本轮新信息", value="我后来去了瑞士读书", stage="education"), + ], + persona_threads=[ + PersonaThread(trait="执着坚持", evidence="为了训练咬牙坚持了很多年"), + ], + recent_questions=["你当时为什么会想去瑞士?"], + ) + assert "已确认事实" in p + assert "我后来去了瑞士读书" in p + assert "人物主线" in p + assert "执着坚持" in p + assert "最近已经问过的问题" in p + assert "为什么会想去瑞士" in p + + +def test_prompt_empty_slots_excludes_slots_already_covered_by_known_facts(): + state = MemoirStateSchema( + stage_order=["education"], + current_stage="education", + covered_stages=[], + slots={"education": default_slots()["education"]}, + known_facts=[ + KnownFact( + label="求学城市", + value="后来在瑞士读书", + stage="education", + slot_name="city", + ) + ], + ) + assert "city" not in 
state.prompt_empty_slots_for_current_stage() + assert "school" in state.prompt_empty_slots_for_current_stage() + + +def test_duplicate_question_guard_downgrades_recent_repeat_question(): + state = MemoirStateSchema( + stage_order=["education"], + current_stage="education", + covered_stages=[], + slots={"education": default_slots()["education"]}, + known_facts=[ + KnownFact(label="本轮新信息", value="我后来去了瑞士读书", stage="education") + ], + ) + cleaned, touched = apply_duplicate_question_guard( + ["我记住了。你后来去了瑞士读书吗?"], + state=state, + recent_questions=["你后来去了瑞士读书吗?"], + ) + assert touched is True + assert cleaned == ["我记住了。"] + + +def test_extract_scene_cues_picks_up_sensory_keywords(): + cues = extract_scene_cues("我们小时候在河里游泳,冬天溜冰") + assert any("凉" in c or "水" in c for c in cues) + assert any("冰" in c or "咔嚓" in c for c in cues) + + +def test_extract_scene_cues_empty_for_abstract_text(): + assert extract_scene_cues("我觉得人生需要坚持") == [] + + +def test_default_persona_now_has_tone_hint(): + from app.agents.chat.personas import get_interview_persona_tone_hint + hint = get_interview_persona_tone_hint("default") + assert hint + assert "画面" in hint or "细节" in hint + + def test_opening_prompt_military_style_rules_not_dialogue_samples() -> None: p = get_opening_prompt( current_stage="childhood", diff --git a/api/tests/test_judge_service.py b/api/tests/test_judge_service.py index b0fd6fe..b6d6aee 100644 --- a/api/tests/test_judge_service.py +++ b/api/tests/test_judge_service.py @@ -4,6 +4,9 @@ import pytest from app.core.config import settings from app.core.llm_call import LLMCallError +from app.features.evaluation.conversation_compare_summary import ( + build_conversation_compare_summary, +) from app.features.evaluation.judge_schemas import ConversationJudgeOutput from app.features.evaluation.judge_service import ( EvalJudgeService, @@ -36,6 +39,12 @@ def _conversation_payload() -> dict: } +def _conversation_payload_variant(**overrides: float | str) -> dict: + data = 
_conversation_payload() + data.update(overrides) + return data + + @pytest.mark.asyncio async def test_judge_conversation_result_preserves_validation_error( monkeypatch: pytest.MonkeyPatch, @@ -116,3 +125,63 @@ def test_build_memoir_prompt_requires_conservative_scoring_without_evidence() -> assert "无可用局部对话证据" in prompt assert "必须保守打分" in prompt assert "【结构化记忆证据】" in prompt + + +def test_compare_summary_surpass_gate_and_truncation_flags() -> None: + baseline = ConversationJudgeOutput.model_validate(_conversation_payload()) + replay = ConversationJudgeOutput.model_validate( + _conversation_payload_variant( + emotion_carry=10, + empathy_depth=8, + emotion_safety=6, + emotion_guidance=6, + fact_mining=8, + info_completeness_guide=8, + info_depth_mining=9, + persona_understanding=7, + persona_consistency_verify=4, + persona_expression_guide=4, + interview_structure=6, + context_memory=5, + rhythm_control=4, + question_quality=7, + follow_up_depth=5, + non_leading=3, + rationale="更稳定。", + ) + ) + summary = build_conversation_compare_summary( + baseline_judge=baseline, + replay_judge=replay, + baseline_transcript="A" * 400, + replay_transcript="B" * 1200, + conv_cap=1000, + compare_cap_each=500, + fixture_filename="golden.md", + ) + assert summary["mode"] == "ab" + assert summary["gate"]["status"] in {"parity", "surpass"} + assert summary["truncation"]["replay_truncated_for_compare"] is True + assert "group_deltas" in summary + + +def test_compare_summary_flags_repeat_issue_as_regression() -> None: + baseline = ConversationJudgeOutput.model_validate(_conversation_payload()) + replay = ConversationJudgeOutput.model_validate( + _conversation_payload_variant( + context_memory=3, + rhythm_control=3, + total_score=0, + major_issues=["存在重复盘问,忽略已答信息"], + ) + ) + summary = build_conversation_compare_summary( + baseline_judge=baseline, + replay_judge=replay, + baseline_transcript="[Turn 1]", + replay_transcript="[Turn 1]", + conv_cap=1000, + compare_cap_each=500, + ) + assert 
summary["repeat_issue_detected"] is True + assert summary["gate"]["status"] == "regressed" diff --git a/api/tests/test_memoir_pipeline_optimization.py b/api/tests/test_memoir_pipeline_optimization.py new file mode 100644 index 0000000..9c8896b --- /dev/null +++ b/api/tests/test_memoir_pipeline_optimization.py @@ -0,0 +1,242 @@ +"""Validation tests for memoir pipeline optimization (Phase A/B/C). + +Tests: +- Phase1 batch path is now the default +- Memory enrichment is dispatched asynchronously +- Unified narrative unit executor produces correct results +- Post-commit fan-out includes quality pass +- Quality pass task handles title polishing +""" + +from __future__ import annotations + +from unittest.mock import MagicMock, patch + +import pytest + +from app.agents.memoir.extraction_agent import ExtractionResult +from app.agents.memoir.classification_agent import ChapterClassifyResult +from app.agents.memoir.orchestrator import MemoirOrchestrator +from app.agents.state_schema import MemoirStateSchema + + +# --------------------------------------------------------------------------- +# Phase1 batch path defaults +# --------------------------------------------------------------------------- + +def test_phase1_batch_enabled_by_default() -> None: + """memoir_phase1_batch_llm_enabled should default to True after optimization.""" + from app.core.config import Settings + + s = Settings() + assert s.memoir_phase1_batch_llm_enabled is True + + +def test_quality_pass_enabled_by_default() -> None: + from app.core.config import Settings + + s = Settings() + assert s.memoir_quality_pass_enabled is True + + +# --------------------------------------------------------------------------- +# Phase1 orchestrator selects batch path when available +# --------------------------------------------------------------------------- + +def test_orchestrator_tries_batch_first(monkeypatch: pytest.MonkeyPatch) -> None: + """When batch LLM is enabled and LLM is available, batch path should be 
attempted.""" + monkeypatch.setattr( + "app.agents.memoir.orchestrator.settings.memoir_phase1_batch_llm_enabled", + True, + ) + + orch = MemoirOrchestrator() + batch_called = {"flag": False} + + def fake_batch(*args, **kwargs): + batch_called["flag"] = True + return MagicMock( + state=MemoirStateSchema( + stage_order=["childhood"], + current_stage="childhood", + covered_stages=[], + slots={}, + ), + category_to_segments={}, + segment_skip_story_ids=set(), + segment_chapter_category={}, + ) + + orch._prepare_batches_via_batch_llm = fake_batch + + class _Seg: + def __init__(self, sid: str) -> None: + self.id = sid + self.user_input_text = "test" + + st = MemoirStateSchema( + stage_order=["childhood"], + current_stage="childhood", + covered_stages=[], + slots={}, + ) + orch.prepare_batches( + segments=[_Seg("s1")], + llm=MagicMock(), + llm_fast=MagicMock(), + get_or_create_state=lambda: st, + update_slot=lambda *a: st, + ) + assert batch_called["flag"] is True + + +def test_orchestrator_fallback_to_sequential(monkeypatch: pytest.MonkeyPatch) -> None: + """If batch path raises, should fall back to sequential extraction.""" + monkeypatch.setattr( + "app.agents.memoir.orchestrator.settings.memoir_phase1_batch_llm_enabled", + True, + ) + + orch = MemoirOrchestrator() + + def fail_batch(*args, **kwargs): + raise RuntimeError("batch LLM unavailable") + + orch._prepare_batches_via_batch_llm = fail_batch + orch.extraction_agent.extract = MagicMock( + return_value=ExtractionResult(detected_stage="childhood", slots={"toy": "ball"}) + ) + orch.classification_agent.classify = MagicMock( + return_value=ChapterClassifyResult(category="childhood", llm_said_none=False) + ) + + st = MemoirStateSchema( + stage_order=["childhood"], + current_stage="childhood", + covered_stages=[], + slots={}, + ) + + class _Seg: + def __init__(self, sid: str, text: str) -> None: + self.id = sid + self.user_input_text = text + + result = orch.prepare_batches( + segments=[_Seg("s1", "我小时候玩球")], + 
llm=MagicMock(), + llm_fast=MagicMock(), + get_or_create_state=lambda: st, + update_slot=lambda *a: st, + ) + assert "s1" in result.segment_chapter_category + + +# --------------------------------------------------------------------------- +# Memory enrichment decoupled from ingest +# --------------------------------------------------------------------------- + +def test_ingest_transcript_sync_no_longer_calls_enrichment_inline() -> None: + """After decoupling, ingest_transcript_sync should NOT import enrichment inline.""" + import inspect + from app.features.memory.service import ingest_transcript_sync + + source = inspect.getsource(ingest_transcript_sync) + assert "enrich_memory_after_ingest_sync" not in source + assert "enrich_memory_source" in source + + +# --------------------------------------------------------------------------- +# Post-commit unified fan-out +# --------------------------------------------------------------------------- + +def test_post_commit_result_includes_quality_pass() -> None: + """PostCommitResult should have quality_pass_scheduled field.""" + from app.features.story.post_commit import PostCommitResult + + r = PostCommitResult() + assert hasattr(r, "quality_pass_scheduled") + assert r.quality_pass_scheduled is False + + +def test_post_commit_signature_accepts_quality_pass() -> None: + """enqueue_story_post_commit_effects should accept need_quality_pass kwarg.""" + import inspect + from app.features.story.post_commit import enqueue_story_post_commit_effects + + sig = inspect.signature(enqueue_story_post_commit_effects) + assert "need_quality_pass" in sig.parameters + assert "memoir_correlation_id" in sig.parameters + + +# --------------------------------------------------------------------------- +# resolve_append_target +# --------------------------------------------------------------------------- + +def test_resolve_append_target_forced_new_on_overflow() -> None: + """When canonical exceeds limit, should force new story.""" + from 
app.features.memoir.story_pipeline_sync import _resolve_append_target + + session = MagicMock() + big_story = MagicMock() + big_story.user_id = "u1" + big_story.id = "story-1" + big_story.canonical_markdown = "x" * 200_000 + session.get.return_value = big_story + + with patch( + "app.features.memoir.story_pipeline_sync.count_story_versions_sync", + return_value=1, + ): + tid, existing, dsrc = _resolve_append_target( + session, + route_decision="append_story", + route_target_story_id="story-1", + user_id="u1", + chapter_category="childhood", + oral_norm="short text", + candidate_stories=[], + story_meta={}, + decision_source="test", + memoir_correlation_id=None, + ) + assert tid is None + assert dsrc == "forced_new_due_to_append_limit" + + +# --------------------------------------------------------------------------- +# _run_post_pipeline_commit helper +# --------------------------------------------------------------------------- + +def test_run_post_pipeline_commit_calls_post_commit() -> None: + """Shared helper should call enqueue_story_post_commit_effects.""" + from app.tasks.memoir_tasks import _run_post_pipeline_commit + + with patch( + "app.features.story.post_commit.enqueue_story_post_commit_effects" + ) as mock_pc, patch( + "app.features.memoir.memoir_images.settings.MemoirImageSettings" + ) as mock_img: + mock_pc.return_value = MagicMock( + enqueued_story_image_count=0, + enqueued_chapter_recompose_count=0, + compaction_scheduled=False, + quality_pass_scheduled=True, + errors=[], + ) + mock_img.from_env.return_value = MagicMock(enabled=False) + + _run_post_pipeline_commit( + user_id="u1", + story_dispatch_ids={"s1"}, + recompose_chapter_ids={"c1"}, + cover_chapter_ids=set(), + trigger_source="test", + need_compaction=False, + need_quality_pass=True, + memoir_correlation_id="cid-1", + ) + mock_pc.assert_called_once() + call_kwargs = mock_pc.call_args + assert call_kwargs.kwargs["need_quality_pass"] is True + assert call_kwargs.kwargs["memoir_correlation_id"] 
== "cid-1" diff --git a/app-eval-web/README.md b/app-eval-web/README.md index 8642dcc..d0eee4a 100644 --- a/app-eval-web/README.md +++ b/app-eval-web/README.md @@ -1,6 +1,6 @@ # 内部评测 Web(Life Echo) -独立 Vite + React 控制台,对接 `app.internal_main:internal_app`。 +独立 Vite + React 控制台,对接 `app.internal_main:internal_app`。路由仅 **Playground(分步测评)**、**Memoir**、**Memoir · Stories**:先对话重放(`skip_memoir`)→ `memoir-submit` → 查看成稿。 ## 环境变量 @@ -22,4 +22,4 @@ VITE_EVAL_API_BASE=http://127.0.0.1:8001 VITE_EVAL_API_KEY=your-secret npm run d npm run build ``` -产物在 `dist/`,可挂任意静态服务器。SSE 使用 `?key=` 传评测密钥(见后端文档)。 +产物在 `dist/`,可挂任意静态服务器。对话流式评审使用带 `X-Internal-Eval-Key` 的 `fetch`(见 `api/docs/internal-eval.md`)。 diff --git a/app-eval-web/src/App.tsx b/app-eval-web/src/App.tsx index 9bf9dd2..4fc071b 100644 --- a/app-eval-web/src/App.tsx +++ b/app-eval-web/src/App.tsx @@ -7,23 +7,14 @@ import { Sidebar } from "./components/Sidebar"; import { useHashRoute } from "./hooks/useHashRoute"; import { useNotices } from "./hooks/useNotices"; import type { AppRoute } from "./types"; -import DatasetsPage from "./pages/DatasetsPage"; -import ExperimentsPage from "./pages/ExperimentsPage"; import MemoirPage from "./pages/MemoirPage"; import MemoirStoriesPage from "./pages/MemoirStoriesPage"; import PlaygroundPage from "./pages/PlaygroundPage"; -import VersionsPage from "./pages/VersionsPage"; function RouteOutlet({ route }: { route: AppRoute }) { switch (route) { case "playground": return ; - case "datasets": - return ; - case "experiments": - return ; - case "versions": - return ; case "memoir": return ; case "memoir-stories": diff --git a/app-eval-web/src/api.ts b/app-eval-web/src/api.ts index 8e8074d..1700ccc 100644 --- a/app-eval-web/src/api.ts +++ b/app-eval-web/src/api.ts @@ -50,12 +50,3 @@ export async function api( }; } } - -export function experimentStreamUrl(experimentId: string): string { - const path = `/internal/api/evaluation/experiments/${encodeURIComponent(experimentId)}/stream`; - const base = 
`${apiBase}${path}`; - if (apiKey) { - return `${base}?key=${encodeURIComponent(apiKey)}`; - } - return base; -} diff --git a/app-eval-web/src/components/DiffTable.tsx b/app-eval-web/src/components/DiffTable.tsx deleted file mode 100644 index d90d65a..0000000 --- a/app-eval-web/src/components/DiffTable.tsx +++ /dev/null @@ -1,196 +0,0 @@ -import { useState } from "react"; -import type { EvalRunOut } from "../types"; -import { JsonPreview } from "./JsonPreview"; -import { MemoirScoreSummary } from "./ScoreCard"; - -function isRecord(v: unknown): v is Record { - return typeof v === "object" && v !== null && !Array.isArray(v); -} - -function judgeBundleHasMemoirLists(bundle: unknown): bundle is Record { - if (!isRecord(bundle)) return false; - const lists = [ - bundle.chapter_results, - bundle.chapters, - bundle.story_results, - bundle.stories, - ]; - return lists.some((x) => Array.isArray(x) && x.length > 0); -} - -type PairRow = { - caseId: string; - title: string; - baseline: number | null; - candidate: number | null; - delta: number | null; - bRun: EvalRunOut | null; - cRun: EvalRunOut | null; -}; - -function buildRows(runs: EvalRunOut[]): PairRow[] { - const byCase = new Map(); - for (const r of runs) { - const side = r.side.toLowerCase(); - let e = byCase.get(r.case_id); - if (!e) { - e = {}; - byCase.set(r.case_id, e); - } - if (side === "baseline") e.b = r; - if (side === "candidate") e.c = r; - } - const rows: PairRow[] = []; - for (const [caseId, { b, c }] of byCase) { - const baseline = b?.composite_score ?? null; - const candidate = c?.composite_score ?? null; - let delta: number | null = null; - if (baseline != null && candidate != null) { - delta = candidate - baseline; - } - rows.push({ - caseId, - title: - b?.judge_bundle_json && typeof b.judge_bundle_json === "object" - ? String( - (b.judge_bundle_json as { case_title?: string }).case_title ?? - "", - ) || caseId.slice(0, 8) - : caseId.slice(0, 8) + "…", - baseline, - candidate, - delta, - bRun: b ?? 
null, - cRun: c ?? null, - }); - } - return rows.sort((a, b) => a.caseId.localeCompare(b.caseId)); -} - -function rowStatus(delta: number | null): { label: string; cls: string } { - if (delta == null) return { label: "—", cls: "eval-diff__cell--na" }; - if (delta > 0.5) return { label: "改善", cls: "eval-diff__cell--up" }; - if (delta < -0.5) return { label: "退步", cls: "eval-diff__cell--down" }; - return { label: "持平", cls: "eval-diff__cell--flat" }; -} - -export function DiffTable({ runs }: { runs: EvalRunOut[] }) { - const rows = buildRows(runs); - const [openId, setOpenId] = useState(null); - - if (rows.length === 0) { - return ( -

暂无 run 数据(实验运行完成后会显示对比)

- ); - } - - return ( -
- - - - - - - - - - - - {rows.map((r) => { - const st = rowStatus(r.delta); - const active = openId === r.caseId; - return ( - - - - - - - - ); - })} - -
用例基线综合分候选综合分差值结论
- - {r.baseline != null ? r.baseline.toFixed(2) : "—"}{r.candidate != null ? r.candidate.toFixed(2) : "—"} - {r.delta != null ? (r.delta > 0 ? "+" : "") + r.delta.toFixed(2) : "—"} - {st.label}
- - {openId ? ( -
- {(() => { - const row = rows.find((x) => x.caseId === openId); - if (!row) return null; - return ( - <> -

Run 详情

-
-
-
baseline
- {row.bRun ? ( - <> - {judgeBundleHasMemoirLists(row.bRun.judge_bundle_json) ? ( - <> -
- 回忆录证据 / 章节评审 -
- - - ) : null} -
- 完整 run -
- - - ) : ( - - )} -
-
-
candidate
- {row.cRun ? ( - <> - {judgeBundleHasMemoirLists(row.cRun.judge_bundle_json) ? ( - <> -
- 回忆录证据 / 章节评审 -
- - - ) : null} -
- 完整 run -
- - - ) : ( - - )} -
-
- - ); - })()} -
- ) : null} -
- ); -} diff --git a/app-eval-web/src/components/ScoreCard.tsx b/app-eval-web/src/components/ScoreCard.tsx index b0ce9ec..6e74d03 100644 --- a/app-eval-web/src/components/ScoreCard.tsx +++ b/app-eval-web/src/components/ScoreCard.tsx @@ -8,6 +8,10 @@ const CONV_GROUPS: { key: string; label: string }[] = [ { key: "question_score", label: "提问质量" }, ]; +function formatSigned(v: number): string { + return `${v > 0 ? "+" : ""}${v.toFixed(1)}`; +} + function isRecord(v: unknown): v is Record { return typeof v === "object" && v !== null && !Array.isArray(v); } @@ -236,6 +240,80 @@ export function ScoreCard({ ); } +export function ConversationCompareSummary({ summary }: { summary: unknown }) { + if (!isRecord(summary)) return null; + const gate = isRecord(summary.gate) ? summary.gate : null; + const truncation = isRecord(summary.truncation) ? summary.truncation : null; + const groupDeltas = isRecord(summary.group_deltas) ? summary.group_deltas : null; + const totalDelta = + typeof summary.total_delta === "number" ? summary.total_delta : null; + const baselineTotal = + typeof summary.baseline_total === "number" ? summary.baseline_total : null; + const replayTotal = + typeof summary.replay_total === "number" ? summary.replay_total : null; + const gateStatus = typeof gate?.status === "string" ? gate.status : ""; + const reasons = Array.isArray(gate?.reasons) + ? gate.reasons.map((x) => String(x)).filter(Boolean) + : []; + const rows = groupDeltas + ? Object.entries(groupDeltas).filter(([, raw]) => isRecord(raw)) + : []; + return ( +
+
+ 结构化对比结论 + + {gateStatus === "surpass" + ? "超过 A" + : gateStatus === "parity" + ? "基本追平" + : gateStatus === "regressed" + ? "仍落后 A" + : "待判定"} + +
+ {baselineTotal != null && replayTotal != null && totalDelta != null ? ( +

+ 基线 {baselineTotal.toFixed(1)} 分,实际 {replayTotal.toFixed(1)} 分,差值{" "} + {formatSigned(totalDelta)} +

+ ) : null} + {rows.length ? ( +
    + {rows.map(([key, raw]) => { + const row = raw as Record; + const label = typeof row.label === "string" ? row.label : key; + const delta = typeof row.delta === "number" ? row.delta : null; + if (delta == null) return null; + return ( +
  • + {label} + {formatSigned(delta)} +
  • + ); + })} +
+ ) : null} + {reasons.length ? ( +
    + {reasons.map((reason, idx) => ( +
  • {reason}
  • + ))} +
+ ) : null} + {truncation ? ( +

+ 截断提示: + {truncation.baseline_truncated_for_compare === true || + truncation.replay_truncated_for_compare === true + ? " A/B 对比稿已发生 transcript 截断。" + : " A/B 对比稿未发生 transcript 截断。"} +

+ ) : null} +
+ ); +} + /** 手工评审 API 用 chapter_results/story_results;自动化 run 的 judge_bundle_json 用 chapters/stories。 */ function pickMemoirChapterList(data: Record): unknown[] { const manual = data.chapter_results; @@ -259,7 +337,7 @@ export function MemoirScoreSummary({ showRawJson = true, }: { data: unknown; - /** 为 false 时仅渲染结构化章节/故事块(供 DiffTable 等外层再贴完整 run JSON) */ + /** 为 false 时仅渲染结构化章节/故事块(供外层再贴完整 JSON) */ showRawJson?: boolean; }) { if (!isRecord(data)) { diff --git a/app-eval-web/src/components/Sidebar.tsx b/app-eval-web/src/components/Sidebar.tsx index 2123d47..3ff4973 100644 --- a/app-eval-web/src/components/Sidebar.tsx +++ b/app-eval-web/src/components/Sidebar.tsx @@ -1,10 +1,7 @@ import type { AppRoute } from "../types"; const NAV: { route: AppRoute; label: string; sub?: string }[] = [ - { route: "playground", label: "Playground", sub: "交互测试" }, - { route: "datasets", label: "Datasets", sub: "数据集" }, - { route: "experiments", label: "Experiments", sub: "实验" }, - { route: "versions", label: "Versions", sub: "模型版本" }, + { route: "playground", label: "Playground", sub: "分步测评" }, { route: "memoir", label: "Memoir", sub: "章节对照" }, { route: "memoir-stories", label: "Stories", sub: "故事成稿" }, ]; diff --git a/app-eval-web/src/eval.css b/app-eval-web/src/eval.css index c63d314..fe3e512 100644 --- a/app-eval-web/src/eval.css +++ b/app-eval-web/src/eval.css @@ -912,6 +912,97 @@ code { max-width: 72ch; } +.eval-wizard { + margin: var(--s-4) 0; +} + +.eval-wizard__steps { + list-style: none; + margin: 0; + padding: 0; + display: flex; + flex-wrap: wrap; + gap: var(--s-2); +} + +.eval-wizard__step { + display: inline-flex; + align-items: center; + gap: var(--s-2); + padding: var(--s-2) var(--s-3); + border-radius: var(--r-lg); + border: 1px solid var(--border); + background: var(--bg-elevated); + font-size: var(--text-sm); + color: var(--text-muted); +} + +.eval-wizard__step--current { + border-color: var(--link); + color: var(--text); + box-shadow: 0 0 0 1px color-mix(in srgb, 
var(--link) 35%, transparent); +} + +.eval-wizard__step--done { + border-color: color-mix(in srgb, var(--baseline-label) 50%, var(--border)); + color: var(--baseline-label); +} + +.eval-wizard__num { + display: inline-flex; + align-items: center; + justify-content: center; + width: 1.5rem; + height: 1.5rem; + border-radius: 999px; + font-size: var(--text-xs); + font-weight: 600; + background: var(--border); + color: var(--text); +} + +.eval-wizard__step--current .eval-wizard__num { + background: var(--link); + color: var(--bg); +} + +.eval-wizard__step--done .eval-wizard__num { + background: color-mix(in srgb, var(--baseline-label) 25%, var(--border)); +} + +.eval-wizard-panel { + margin: var(--s-4) 0 var(--s-3); + padding: var(--s-3) var(--s-4); + border: 1px solid var(--border); + border-radius: var(--r-lg); + background: color-mix(in srgb, var(--bg-elevated) 88%, var(--border)); +} + +.eval-wizard-panel__title { + margin: 0 0 var(--s-2); + font-size: var(--text-md); + font-weight: 600; + display: flex; + flex-wrap: wrap; + align-items: baseline; + gap: var(--s-2); +} + +.eval-wizard-panel__kicker { + font-size: var(--text-xs); + font-weight: 600; + text-transform: uppercase; + letter-spacing: 0.04em; + color: var(--link); +} + +.eval-wizard-panel__lede { + margin: 0; + font-size: var(--text-sm); + line-height: 1.55; + max-width: 72ch; +} + .eval-toolbar { display: flex; flex-wrap: wrap; diff --git a/app-eval-web/src/hooks/useExperimentStream.ts b/app-eval-web/src/hooks/useExperimentStream.ts deleted file mode 100644 index 734fd8c..0000000 --- a/app-eval-web/src/hooks/useExperimentStream.ts +++ /dev/null @@ -1,49 +0,0 @@ -import { useEffect, useRef } from "react"; -import { experimentStreamUrl } from "../api"; - -export type ExperimentStreamPayload = { - experiment_id?: string; - status?: string; - runs?: unknown[]; - gate?: unknown; - error?: string; -}; - -/** GET SSE:实验进度快照(约 1s 一条直至 completed/failed) */ -export function useExperimentStream( - experimentId: 
string | null, - enabled: boolean, - onPayload: (payload: ExperimentStreamPayload) => void, -) { - const onPayloadRef = useRef(onPayload); - onPayloadRef.current = onPayload; - - useEffect(() => { - if (!enabled || !experimentId) return; - - const url = experimentStreamUrl(experimentId); - const es = new EventSource(url); - - es.onmessage = (ev) => { - try { - const payload = JSON.parse(ev.data) as ExperimentStreamPayload; - if (payload.error === "not_found") { - es.close(); - return; - } - onPayloadRef.current(payload); - if (payload.status === "completed" || payload.status === "failed") { - es.close(); - } - } catch { - /* ignore parse errors */ - } - }; - - es.onerror = () => { - es.close(); - }; - - return () => es.close(); - }, [experimentId, enabled]); -} diff --git a/app-eval-web/src/hooks/useHashRoute.ts b/app-eval-web/src/hooks/useHashRoute.ts index 9c8d6f6..674e834 100644 --- a/app-eval-web/src/hooks/useHashRoute.ts +++ b/app-eval-web/src/hooks/useHashRoute.ts @@ -1,14 +1,7 @@ import { useCallback, useEffect, useState } from "react"; import type { AppRoute } from "../types"; -const ROUTES: AppRoute[] = [ - "playground", - "datasets", - "experiments", - "versions", - "memoir", - "memoir-stories", -]; +const ROUTES: AppRoute[] = ["playground", "memoir", "memoir-stories"]; function parseHash(): AppRoute { const raw = window.location.hash.slice(1).split("?")[0] || "playground"; @@ -28,7 +21,7 @@ export function useHashRoute(): [AppRoute, (r: AppRoute) => void] { }, []); const setRoute = useCallback((r: AppRoute) => { - if (window.location.hash.slice(1) !== r) { + if (window.location.hash.slice(1).split("?")[0] !== r) { window.location.hash = r; } }, []); diff --git a/app-eval-web/src/hooks/usePolling.ts b/app-eval-web/src/hooks/usePolling.ts deleted file mode 100644 index b817f6c..0000000 --- a/app-eval-web/src/hooks/usePolling.ts +++ /dev/null @@ -1,20 +0,0 @@ -import { useEffect, useRef } from "react"; - -/** 定時拉取;enabled 为 false 时不轮询。首次进入会立即执行一次。 */ 
-export function usePolling( - callback: () => void | Promise, - intervalMs: number, - enabled: boolean, -) { - const cbRef = useRef(callback); - cbRef.current = callback; - - useEffect(() => { - if (!enabled) return; - - const run = () => void cbRef.current(); - void run(); - const t = setInterval(run, intervalMs); - return () => clearInterval(t); - }, [enabled, intervalMs]); -} diff --git a/app-eval-web/src/pages/DatasetsPage.tsx b/app-eval-web/src/pages/DatasetsPage.tsx deleted file mode 100644 index 0230500..0000000 --- a/app-eval-web/src/pages/DatasetsPage.tsx +++ /dev/null @@ -1,301 +0,0 @@ -import { useCallback, useEffect, useState } from "react"; -import { api } from "../api"; -import { usePushNotice } from "../context/NoticeContext"; -import { EmptyState } from "../components/EmptyState"; -import { formatTime } from "../utils/formatTime"; -import type { CaseOut, RegressionSetOut } from "../types"; - -export default function DatasetsPage() { - const pushNotice = usePushNotice(); - const [sets, setSets] = useState([]); - const [selSetId, setSelSetId] = useState(null); - const [cases, setCases] = useState([]); - const [newSetName, setNewSetName] = useState("新数据集"); - const [importMd, setImportMd] = useState(""); - const [importJsonText, setImportJsonText] = useState('{"utterances":[]}'); - const [snapshotConvId, setSnapshotConvId] = useState(""); - - const loadSets = useCallback(async () => { - const r = await api( - "/internal/api/evaluation/regression-sets", - ); - if (r.ok && r.data) { - setSets(r.data); - setSelSetId((cur) => { - if (cur && r.data!.some((s) => s.id === cur)) return cur; - return r.data![0]?.id ?? null; - }); - } else { - pushNotice(r.error ?? "加载数据集失败", "error"); - } - }, [pushNotice]); - - const loadCases = useCallback( - async (setId: string) => { - const r = await api( - `/internal/api/evaluation/regression-sets/${setId}/cases`, - ); - if (r.ok && r.data) { - setCases(r.data); - } else { - setCases([]); - pushNotice(r.error ?? 
"加载用例失败", "error"); - } - }, - [pushNotice], - ); - - useEffect(() => { - void loadSets(); - }, [loadSets]); - - useEffect(() => { - if (selSetId) void loadCases(selSetId); - else setCases([]); - }, [selSetId, loadCases]); - - async function createSet() { - const r = await api( - "/internal/api/evaluation/regression-sets", - { - method: "POST", - body: JSON.stringify({ name: newSetName, description: "" }), - }, - ); - pushNotice( - r.ok ? "数据集已创建" : (r.error ?? "失败"), - r.ok ? "success" : "error", - ); - if (r.ok) { - await loadSets(); - if (r.data?.id) setSelSetId(r.data.id); - } - } - - async function importMarkdown() { - if (!selSetId) { - pushNotice("请先选择数据集", "error"); - return; - } - if (!importMd.trim()) { - pushNotice("请粘贴 Markdown 内容", "error"); - return; - } - const r = await api( - `/internal/api/evaluation/regression-sets/${selSetId}/import-markdown`, - { - method: "POST", - body: JSON.stringify({ - markdown: importMd, - title: null, - is_protected: false, - }), - }, - ); - pushNotice( - r.ok ? "已从 Markdown 导入用例" : (r.error ?? "导入失败"), - r.ok ? "success" : "error", - ); - if (r.ok) void loadCases(selSetId); - } - - async function importJson() { - if (!selSetId) { - pushNotice("请先选择数据集", "error"); - return; - } - let raw: Record | unknown[] | null = null; - try { - raw = JSON.parse(importJsonText) as Record | unknown[]; - } catch { - pushNotice("JSON 格式无效", "error"); - return; - } - const r = await api("/internal/api/evaluation/import/json-case", { - method: "POST", - body: JSON.stringify({ - regression_set_id: selSetId, - raw_json: raw, - title: null, - is_protected: false, - }), - }); - pushNotice( - r.ok ? "已从 JSON 导入用例" : (r.error ?? "导入失败"), - r.ok ? 
"success" : "error", - ); - if (r.ok) void loadCases(selSetId); - } - - async function snapshotFromConversation() { - const cid = snapshotConvId.trim(); - if (!selSetId || !cid) { - pushNotice("请选择数据集并填写会话 ID", "error"); - return; - } - const r = await api( - `/internal/api/evaluation/regression-sets/${selSetId}/snapshot-from-conversation/${encodeURIComponent(cid)}`, - { - method: "POST", - body: JSON.stringify({ - title: "", - use_messages: true, - is_protected: false, - }), - }, - ); - pushNotice( - r.ok ? "已从会话生成用例" : (r.error ?? "失败"), - r.ok ? "success" : "error", - ); - if (r.ok) void loadCases(selSetId); - } - - return ( -
-

Datasets · 数据集

-

- 管理回归用例集合:从 Markdown、JSON 或线上会话导入;用例供 Experiments 批量运行。 -

- -
-

数据集列表

-
- setNewSetName(e.target.value)} - aria-label="新数据集名称" - /> - -
- {sets.length === 0 ? ( - - ) : ( -
    - {sets.map((s) => ( -
  • - {" "} - {s.name} - - {" "} - · {formatTime(s.created_at)} ·{" "} - {s.id.slice(0, 8)}… - -
  • - ))} -
- )} -
- -
-

用例({cases.length})

- {!selSetId ? ( -

请选择上方数据集

- ) : cases.length === 0 ? ( - - ) : ( -
-
    - {cases.map((c) => ( -
  • - {c.id.slice(0, 8)}… - {c.title ? ` · ${c.title}` : ""} - - {" "} - · {(c.user_utterances as unknown[])?.length ?? 0} 句 - {c.is_protected ? " · 保护" : ""} - -
  • - ))} -
-
- )} -
- -
-

从会话快照

-
- setSnapshotConvId(e.target.value)} - placeholder="conversation_id" - aria-label="会话 ID" - /> - -
-
- -
-

从 Markdown 导入

-