feat(memoir): 回忆录分段两阶段管线(Phase1 分类 / Phase2 叙事)与配置、测试

This commit is contained in:
Kevin
2026-04-02 16:37:14 +08:00
parent 3ae39838c0
commit 6b930808a3
27 changed files with 1550 additions and 430 deletions

View File

@@ -208,6 +208,9 @@ def _build_era_context(current_stage: str, user_profile_context: str) -> str:
"career": (18, 50),
"family": (20, 50),
"belief": (30, 60),
# chapter / 防御性 key与 belief 同档年龄参照
"beliefs": (30, 60),
"summary": (30, 60),
}
age_range = stage_era_map.get(current_stage, (0, 30))

View File

@@ -12,6 +12,11 @@ from app.agents.chat.stage_prompts import (
get_chat_stage_detection_prompt,
life_stage_display_zh,
)
from app.agents.stage_constants import (
CHAT_STAGES,
STAGE_KEYWORD_WEIGHTS,
normalize_chat_stage,
)
from app.core.config import settings
from app.core.langchain_llm import ainvoke_json_object
from app.core.logging import get_logger
@@ -19,117 +24,26 @@ from app.core.json_utils import extract_json_payload
logger = get_logger(__name__)
# Keywords are scored per stage: each entry is (keyword, weight). No keyword is
# listed under more than one stage, which keeps a word such as 「父母」 from
# being monopolized by childhood.
# NOTE(review): this table has the same content as STAGE_KEYWORD_WEIGHTS, which
# this module also imports from app.agents.stage_constants — confirm whether
# the local copy is still needed.
_STAGE_KEYWORD_WEIGHTS: dict[str, list[tuple[str, int]]] = {
    "childhood": [
        ("童年", 3),
        ("小时候", 3),
        ("幼年", 2),
        ("出生", 2),
        ("家乡", 2),
        ("老家", 2),
        ("小镇", 1),
        ("幼儿园", 2),
        ("玩伴", 1),
    ],
    "education": [
        ("上学", 2),
        ("学校", 2),
        ("老师", 2),
        ("同学", 2),
        ("教育", 1),
        ("大学", 3),
        ("高中", 2),
        ("初中", 2),
        ("小学", 2),
        ("考试", 1),
        ("毕业", 2),
        ("读书", 1),
        ("高考", 2),
        ("课堂", 1),
        ("宿舍", 1),
    ],
    "career": [
        ("工作", 3),
        ("职业", 2),
        ("事业", 2),
        ("公司", 2),
        ("同事", 2),
        ("创业", 2),
        ("升职", 1),
        ("跳槽", 1),
        ("老板", 1),
        ("行业", 1),
        ("项目", 1),
        ("加班", 1),
        ("薪水", 1),
        ("面试", 1),
        ("职场", 2),
        ("离职", 1),
    ],
    "family": [
        ("伴侣", 2),
        ("孩子", 2),
        ("家庭", 2),
        ("家人", 2),
        ("结婚", 2),
        ("爱人", 1),
        ("老婆", 1),
        ("老公", 1),
        ("丈夫", 1),
        ("妻子", 1),
        ("儿子", 1),
        ("女儿", 1),
        ("婚礼", 1),
        ("恋爱", 1),
        ("父母", 2),
        ("爸妈", 2),
        ("父亲", 2),
        ("母亲", 2),
        ("爷爷", 1),
        ("奶奶", 1),
        ("外公", 1),
        ("外婆", 1),
    ],
    "belief": [
        ("信念", 2),
        ("价值观", 2),
        ("座右铭", 2),
        ("坚持", 1),
        ("原则", 1),
        ("信仰", 1),
        ("意义", 1),
        ("感悟", 1),
        ("遗憾", 1),
        ("骄傲", 1),
    ],
}
def normalize_life_stage(raw: Optional[str], fallback: str) -> str:
    """Normalize a life-stage key; legacy name kept for existing callers.

    This is a thin alias: all validation is delegated to
    ``normalize_chat_stage`` so that the old and new entry points cannot
    drift apart. (The previous inline check returned on every path, which
    left the delegation below it unreachable.)

    Args:
        raw: Stage key to validate; may be ``None``.
        fallback: Value handed to ``normalize_chat_stage`` as its fallback.

    Returns:
        The result of ``normalize_chat_stage(raw, fallback)``.
    """
    return normalize_chat_stage(raw, fallback)
def keyword_fallback_primary_stage(user_message: str) -> str:
"""多阶段打分,取最高分;平局stage_order 靠后的优先(更具体场景常后验)。"""
"""多阶段打分,取最高分;平局按 CHAT_STAGES 逆序优先(与历史 tie_order 派生一致,可能有小幅行为差异)。"""
if not (user_message or "").strip():
return ""
text = user_message
scores: dict[str, int] = {k: 0 for k in _STAGE_KEYWORD_WEIGHTS}
for stage, pairs in _STAGE_KEYWORD_WEIGHTS.items():
scores: dict[str, int] = {k: 0 for k in STAGE_KEYWORD_WEIGHTS}
for stage, pairs in STAGE_KEYWORD_WEIGHTS.items():
for word, w in pairs:
if word in text:
scores[stage] += w
best = max(scores.values())
if best <= 0:
return ""
# 平局education > career > family > belief > childhood避免童年默认胜出
tie_order = ["childhood", "belief", "family", "career", "education"]
tie_order = list(reversed(CHAT_STAGES))
candidates = [s for s, v in scores.items() if v == best]
for s in reversed(tie_order):
if s in candidates:
@@ -145,14 +59,14 @@ async def detect_primary_life_stage(
"""
返回合法的人生阶段 key失败时回退为 current_stage。
"""
fb = normalize_life_stage(current_stage, "childhood")
fb = normalize_chat_stage(current_stage, "childhood")
if not settings.chat_stage_detection_enabled:
k = keyword_fallback_primary_stage(user_message)
return normalize_life_stage(k, fb) if k else fb
return normalize_chat_stage(k, fb) if k else fb
if not llm:
k = keyword_fallback_primary_stage(user_message)
return normalize_life_stage(k, fb) if k else fb
return normalize_chat_stage(k, fb) if k else fb
try:
prompt = get_chat_stage_detection_prompt(user_message, fb)
@@ -164,16 +78,26 @@ async def detect_primary_life_stage(
)
if not raw.strip():
k = keyword_fallback_primary_stage(user_message)
return normalize_life_stage(k, fb) if k else fb
return normalize_chat_stage(k, fb) if k else fb
parsed = json.loads(extract_json_payload(raw))
detected = parsed.get("detected_stage", fb)
return normalize_life_stage(str(detected) if detected is not None else "", fb)
return normalize_chat_stage(str(detected) if detected is not None else "", fb)
except (json.JSONDecodeError, Exception) as e:
logger.warning("detect_primary_life_stage 解析失败,使用关键词回退: {}", e)
k = keyword_fallback_primary_stage(user_message)
return normalize_life_stage(k, fb) if k else fb
return normalize_chat_stage(k, fb) if k else fb
def life_stage_display_name(stage: str) -> str:
    """Return the Chinese display name of ``stage`` for use in prompts.

    Thin wrapper around ``life_stage_display_zh``.
    """
    display_name = life_stage_display_zh(stage)
    return display_name
# Public names of this module. VALID_CHAT_LIFE_STAGES is re-exported here for
# modules that still import it from stage_detection.
__all__ = [
    "VALID_CHAT_LIFE_STAGES",
    "detect_primary_life_stage",
    "keyword_fallback_primary_stage",
    "life_stage_display_name",
    "normalize_life_stage",
]

View File

@@ -2,9 +2,9 @@
访谈「人生阶段」判定专用短提示词(与回忆录五阶段 slots 一致)。
"""
from app.agents.stage_constants import CHAT_STAGES, STAGE_DISPLAY_ZH
from app.agents.stage_constants import CHAT_STAGES, STAGE_DISPLAY_ZH, VALID_CHAT_STAGES
VALID_CHAT_LIFE_STAGES = frozenset(CHAT_STAGES)
VALID_CHAT_LIFE_STAGES = VALID_CHAT_STAGES
def life_stage_display_zh(stage: str) -> str:

View File

@@ -14,8 +14,11 @@ from dataclasses import dataclass
from typing import Any
from app.agents.memoir.prompts import get_chapter_classification_json_prompt
from app.agents.stage_constants import CHAPTER_CATEGORIES
from app.agents.stage_constants import STAGE_TO_DEFAULT_CATEGORY
from app.agents.stage_constants import (
CHAPTER_CATEGORIES,
STAGE_KEYWORD_WEIGHTS,
STAGE_TO_DEFAULT_CATEGORY,
)
from app.core.json_utils import extract_json_payload
from app.core.langchain_llm import invoke_json_object
from app.core.logging import get_logger
@@ -40,21 +43,12 @@ _SHORT_HUKOU_STYLE = re.compile(
re.UNICODE,
)
# Keywords for the five stages (used as a fallback when the LLM fails).
# Avoid generic words that tend to co-occur with "year-only" sentences, so that
# a category is not inferred by mistake.
STAGE_KEYWORDS = {
    "childhood": ["童年", "小时候", "家乡", "小镇"],
    "education": ["上学", "学校", "老师", "同学", "教育", "大学"],
    "career": ["工作", "职业", "事业", "公司", "同事", "创业"],
    "family": ["伴侣", "孩子", "家庭", "家人", "结婚", "父母"],
    "belief": ["信念", "价值观", "座右铭", "坚持", "原则"],
}
def _detect_stage(text: str, fallback_stage: str) -> str:
    """Detect which of the five stages a message belongs to, by keyword.

    Keywords are read from ``STAGE_KEYWORD_WEIGHTS`` (the table shared with
    the chat path). Matching is a plain substring test; the weights are
    ignored here. Stages are tried in the mapping's iteration order and the
    first stage with any keyword present in the message wins.

    Args:
        text: Message text; ``None`` or empty is treated as an empty string.
        fallback_stage: Stage returned when no keyword matches.

    Returns:
        The first matching stage key, otherwise ``fallback_stage``.
    """
    message = (text or "").lower()
    for stage, pairs in STAGE_KEYWORD_WEIGHTS.items():
        # Only the keyword of each (keyword, weight) pair matters here.
        if any(word in message for word, _w in pairs):
            return stage
    return fallback_stage

View File

@@ -10,6 +10,7 @@ from dataclasses import dataclass
from typing import Any, Dict
from app.agents.memoir.prompts import get_state_extraction_prompt
from app.agents.stage_constants import normalize_chat_stage
from app.core.langchain_llm import invoke_json_object
from app.core.logging import get_logger
from app.core.json_utils import extract_json_payload
@@ -63,7 +64,11 @@ class ExtractionAgent:
agent="ExtractionAgent.extract",
)
parsed = json.loads(extract_json_payload(raw))
detected_stage = parsed.get("detected_stage", detected_stage)
raw_detected = parsed.get("detected_stage", detected_stage)
detected_stage = normalize_chat_stage(
str(raw_detected) if raw_detected is not None else None,
fallback=current_stage,
)
raw_slots = parsed.get("slots", {}) or {}
extracted_slots = {
k: v if isinstance(v, str) else str(v) for k, v in raw_slots.items()

View File

@@ -8,6 +8,7 @@ from __future__ import annotations
import json
from typing import Any, Dict, Optional
from app.agents.stage_constants import CHAPTER_CATEGORIES
from app.agents.memoir.prompts import (
get_creative_title_json_prompt,
get_narrative_json_prompt,
@@ -34,7 +35,7 @@ class NarrativeAgent:
) -> str:
"""生成创意标题。若无 LLM 则返回默认标题"""
if not llm:
return f"{stage} 回忆"
return f"{CHAPTER_CATEGORIES.get(stage, stage)} 回忆"
try:
prompt = get_creative_title_json_prompt(
stage=stage,
@@ -53,10 +54,10 @@ class NarrativeAgent:
title = (data.get("title") or "").strip() if isinstance(data, dict) else ""
if title:
return title.strip('"')
return f"{stage} 回忆"
return f"{CHAPTER_CATEGORIES.get(stage, stage)} 回忆"
except Exception as e:
logger.warning("NarrativeAgent 生成标题失败: {}", e)
return f"{stage} 回忆"
return f"{CHAPTER_CATEGORIES.get(stage, stage)} 回忆"
def generate_narrative(
self,

View File

@@ -33,6 +33,8 @@ class PreparedMemoirBatches:
category_to_segments: Dict[str, List[Segment]]
#: segment id 在「LLM 判 none 且 extraction slots 为空」时加入batch 级短路见 memoir_tasks
segment_skip_story_ids: Set[str]
#: 每个 segment → Phase 1 分类 chapter_category持久化到 Segment.topic_category
segment_chapter_category: Dict[str, str]
class MemoirOrchestrator:
@@ -64,6 +66,7 @@ class MemoirOrchestrator:
state = get_or_create_state()
category_to_segments: Dict[str, List[Segment]] = {}
segment_skip_story_ids: Set[str] = set()
segment_chapter_category: Dict[str, str] = {}
classify_extract_llm = llm_fast if llm_fast is not None else llm
for segment in segments:
@@ -103,6 +106,7 @@ class MemoirOrchestrator:
chapter_category = classify_result.category
if (not result.slots) and classify_result.llm_said_none:
segment_skip_story_ids.add(str(segment.id))
segment_chapter_category[str(segment.id)] = chapter_category
if agent_summary_enabled():
logger.info(
@@ -126,6 +130,7 @@ class MemoirOrchestrator:
state=state,
category_to_segments=category_to_segments,
segment_skip_story_ids=segment_skip_story_ids,
segment_chapter_category=segment_chapter_category,
)
def run(

View File

@@ -2,6 +2,12 @@
from __future__ import annotations
from typing import Any
from app.core.logging import get_logger
logger = get_logger(__name__)
# 访谈五阶段(与 MemoirStateSchema.default_slots 顺序一致)
CHAT_STAGES: tuple[str, ...] = (
"childhood",
@@ -11,6 +17,8 @@ CHAT_STAGES: tuple[str, ...] = (
"belief",
)
VALID_CHAT_STAGES: frozenset[str] = frozenset(CHAT_STAGES)
STAGE_DISPLAY_ZH = {
"childhood": "童年时光",
"education": "求学经历",
@@ -38,6 +46,8 @@ CHAPTER_CATEGORIES = {
"summary": "人生总结",
}
VALID_CHAPTER_CATEGORIES: frozenset[str] = frozenset(CHAPTER_CATEGORIES.keys())
CHAPTER_ORDER = [
"childhood",
"education",
@@ -49,6 +59,7 @@ CHAPTER_ORDER = [
"summary",
]
# career / career_early 与 belief / beliefs 共用序号:chat-stage 别名与 chapter category 的兼容映射
STAGE_TO_ORDER = {
"childhood": 0,
"education": 1,
@@ -72,3 +83,190 @@ CATEGORY_TO_CHAT_STAGE: dict[str, str] = {
"beliefs": "belief",
"summary": "belief",
}
# Weighted interview keywords, as (keyword, weight) pairs per chat stage.
# The chat path uses them for scoring; classification and substring detection
# share this same data source.
STAGE_KEYWORD_WEIGHTS: dict[str, list[tuple[str, int]]] = {
    "childhood": [
        ("童年", 3),
        ("小时候", 3),
        ("幼年", 2),
        ("出生", 2),
        ("家乡", 2),
        ("老家", 2),
        ("小镇", 1),
        ("幼儿园", 2),
        ("玩伴", 1),
    ],
    "education": [
        ("上学", 2),
        ("学校", 2),
        ("老师", 2),
        ("同学", 2),
        ("教育", 1),
        ("大学", 3),
        ("高中", 2),
        ("初中", 2),
        ("小学", 2),
        ("考试", 1),
        ("毕业", 2),
        ("读书", 1),
        ("高考", 2),
        ("课堂", 1),
        ("宿舍", 1),
    ],
    "career": [
        ("工作", 3),
        ("职业", 2),
        ("事业", 2),
        ("公司", 2),
        ("同事", 2),
        ("创业", 2),
        ("升职", 1),
        ("跳槽", 1),
        ("老板", 1),
        ("行业", 1),
        ("项目", 1),
        ("加班", 1),
        ("薪水", 1),
        ("面试", 1),
        ("职场", 2),
        ("离职", 1),
    ],
    "family": [
        ("伴侣", 2),
        ("孩子", 2),
        ("家庭", 2),
        ("家人", 2),
        ("结婚", 2),
        ("爱人", 1),
        ("老婆", 1),
        ("老公", 1),
        ("丈夫", 1),
        ("妻子", 1),
        ("儿子", 1),
        ("女儿", 1),
        ("婚礼", 1),
        ("恋爱", 1),
        ("父母", 2),
        ("爸妈", 2),
        ("父亲", 2),
        ("母亲", 2),
        ("爷爷", 1),
        ("奶奶", 1),
        ("外公", 1),
        ("外婆", 1),
    ],
    "belief": [
        ("信念", 2),
        ("价值观", 2),
        ("座右铭", 2),
        ("坚持", 1),
        ("原则", 1),
        ("信仰", 1),
        ("意义", 1),
        ("感悟", 1),
        ("遗憾", 1),
        ("骄傲", 1),
    ],
}
# Models/tasks occasionally use a chapter key where a chat stage is expected;
# such keys are collapsed onto the corresponding chat-stage bucket.
_CHAT_STAGE_SYNONYMS: dict[str, str] = {
    "beliefs": "belief",
}
def chat_bucket(stage_or_category: str | None) -> str:
    """Collapse a chat stage or chapter category onto one of ``CHAT_STAGES``.

    The input is stripped and lower-cased, then resolved in this order:
    a valid chat stage is returned as-is, then ``_CHAT_STAGE_SYNONYMS`` is
    consulted, then ``CATEGORY_TO_CHAT_STAGE``.

    Args:
        stage_or_category: Chat-stage or chapter-category key; may be ``None``.

    Returns:
        The resolved chat stage, or ``""`` when the input is empty, not a
        string, or unknown.
    """
    if not (stage_or_category and isinstance(stage_or_category, str)):
        return ""

    key = stage_or_category.strip().lower()
    if key in VALID_CHAT_STAGES:
        return key

    # Lookup tables are tried in priority order: synonyms before categories.
    for table in (_CHAT_STAGE_SYNONYMS, CATEGORY_TO_CHAT_STAGE):
        if key in table:
            return table[key]
    return ""
def normalize_chat_stage(
    raw: str | None,
    fallback: str,
    *,
    log_context: dict[str, Any] | None = None,
) -> str:
    """Validate and normalize a chat-stage key.

    Resolution order for a non-empty string ``raw`` (after strip + lower):
    valid chat stage, then ``_CHAT_STAGE_SYNONYMS``, then
    ``CATEGORY_TO_CHAT_STAGE``; otherwise the normalized fallback is used.

    Args:
        raw: Candidate stage key; may be ``None``.
        fallback: Stage to use when ``raw`` cannot be resolved.
        log_context: Optional fields bound onto the logger; when truthy, an
            info record is emitted for category mappings and for fallbacks.

    Returns:
        The resolved chat stage. Note that when ``raw`` is empty, blank or
        not a string, ``fallback`` is returned exactly as given (it is NOT
        normalized); only the "unknown non-empty raw" path normalizes the
        fallback, defaulting to ``"childhood"`` if it cannot be resolved.
    """

    def _emit(template: str, value: str) -> None:
        # Structured logging is opt-in: nothing is logged without a context.
        if log_context:
            logger.bind(**log_context).info(template, raw, value)

    if not (raw and isinstance(raw, str)):
        return fallback
    key = raw.strip().lower()
    if not key:
        return fallback

    if key in VALID_CHAT_STAGES:
        return key
    if key in _CHAT_STAGE_SYNONYMS:
        return _CHAT_STAGE_SYNONYMS[key]
    if key in CATEGORY_TO_CHAT_STAGE:
        mapped = CATEGORY_TO_CHAT_STAGE[key]
        _emit("event=normalize_chat_stage_mapped raw={} mapped={}", mapped)
        return mapped

    # raw is unknown: resolve the fallback through the same lookup chain.
    fb = (fallback.strip().lower() if isinstance(fallback, str) else "") or "childhood"
    if fb not in VALID_CHAT_STAGES:
        if fb in _CHAT_STAGE_SYNONYMS:
            fb = _CHAT_STAGE_SYNONYMS[fb]
        else:
            fb = CATEGORY_TO_CHAT_STAGE.get(fb, "childhood")
    _emit("event=normalize_chat_stage_fallback raw={} fallback={}", fb)
    return fb
def normalize_chapter_category(
    raw: str | None,
    fallback: str,
    *,
    log_context: dict[str, Any] | None = None,
) -> str:
    """Validate a chapter-category key (a key of ``CHAPTER_CATEGORIES``).

    ``raw`` is stripped and lower-cased; a leading backtick causes all
    surrounding backticks to be stripped, and one pair of matching single or
    double quotes is removed, before the key is checked.

    The fallback is canonicalized (strip + lower) once up front and used on
    every fallback path, so the return value is always a member of
    ``VALID_CHAPTER_CATEGORIES``. Previously the fallback was validated in
    normalized form but returned as given, and the empty-``raw`` path did not
    normalize it at all.

    Args:
        raw: Candidate category key; may be ``None``.
        fallback: Category to use when ``raw`` is missing or unknown. If it
            is not itself a valid category, ``"summary"`` is used.
        log_context: Optional fields bound onto the logger; when truthy, an
            info record is emitted if a non-empty ``raw`` falls back.

    Returns:
        A valid chapter-category key.
    """
    fb = fallback.strip().lower() if isinstance(fallback, str) else ""
    if fb not in VALID_CHAPTER_CATEGORIES:
        fb = "summary"

    if not raw or not isinstance(raw, str):
        return fb

    s = raw.strip().lower()
    # Models sometimes wrap the key in markdown backticks or quotes.
    if s.startswith("`"):
        s = s.strip("`").strip()
    if (s.startswith('"') and s.endswith('"')) or (
        s.startswith("'") and s.endswith("'")
    ):
        s = s[1:-1].strip()
    if s in VALID_CHAPTER_CATEGORIES:
        return s

    if log_context:
        logger.bind(**log_context).info(
            "event=normalize_chapter_category_fallback raw={} fallback={}",
            raw,
            fb,
        )
    return fb