diff --git a/api/app/agents/memoir/prompts.py b/api/app/agents/memoir/prompts.py index 102e50a..81e89c1 100644 --- a/api/app/agents/memoir/prompts.py +++ b/api/app/agents/memoir/prompts.py @@ -401,8 +401,6 @@ def get_story_route_prompt( 「故事」在此指:**可独立讲述的一段人生经历**——单一主题或同一事件链;不要假设本批里包含多个互不相关的故事(多段由系统其它步骤处理)。 -**new_story_title 与 reason 只能依据口述中已有信息概括,不得编造口述未出现的人、事、地、物。** - **路由边界(必须遵守)**:仅根据下方「本批口述合并文本」判断 new_story 与 append_story;不得将系统检索摘要、记忆摘录、图谱事实或其它非用户口述材料当作本批口述内容来匹配候选故事。 当前章节(写作容器): @@ -419,13 +417,11 @@ def get_story_route_prompt( {{ "decision": "new_story" | "append_story", "target_story_id": "", - "new_story_title": "<短标题,6-20 字;new_story 时必填,append 时可 null>", "reason": "<一句中文理由>" }} 规则: - 若无法自信匹配某一候选,选 new_story -- new_story_title 应概括本批新内容,不要与候选标题重复 """ @@ -444,8 +440,6 @@ def get_story_batch_plan_prompt( ## 「故事」定义(必须遵守) 一段「故事」= **可独立讲述的一段人生经历**:单一主题或同一事件链,能单独成篇。若话题切换、时间线跳到另一件事、人物/主线明显变化,应作为**新的故事**(new_story),而不是塞进同一段 append。 -**new_story_title 与 reason 只能依据各 segment 文本中已有信息,不得编造口述未出现的事实。** - ## 任务 将本批 segment **划分为连续若干块**(每块包含至少一个 segment,顺序不能打乱;每个 segment 必须恰好属于一块)。对每一块决定: - **append_story**:内容明显延续、补充**某一已有候选故事**的主题与时间线,且能对应到具体 candidate id @@ -468,7 +462,6 @@ def get_story_batch_plan_prompt( "segment_ids": ["<按顺序列出本块包含的 segment id>"], "decision": "new_story" | "append_story", "target_story_id": "", - "new_story_title": "<短标题,6-20 字;new_story 时必填,append 时可 null>", "reason": "<一句中文理由,可选>" }} ] @@ -477,7 +470,6 @@ def get_story_batch_plan_prompt( 规则: - `units` 中所有 `segment_ids` 拼接后,必须**不重不漏**地覆盖本批全部 id,且顺序与【本批口述片段】数组一致 - 若无法自信匹配某一候选,对该块选 new_story -- new_story_title 应概括该块内容,不要与候选标题重复 """ diff --git a/api/app/agents/memoir/story_route_agent.py b/api/app/agents/memoir/story_route_agent.py index 449d0cb..a7aadb2 100644 --- a/api/app/agents/memoir/story_route_agent.py +++ b/api/app/agents/memoir/story_route_agent.py @@ -116,7 +116,8 @@ def validate_story_batch_plan( valid_story_ids: set[str], ) -> tuple[bool, str | None]: """ - 校验:segment 全覆盖、顺序一致、append 目标合法、new_story 有标题。 + 校验:segment 全覆盖、顺序一致、append 目标合法。 + 标题由 NarrativeAgent 延迟生成,路由阶段不再要求 new_story_title。 返回 (ok, error_code)。 """ if not plan.units: @@ -135,10 +136,6 @@ def validate_story_batch_plan( tid = u.target_story_id if not tid or tid not in valid_story_ids: return False, "invalid_append_target" - else: - title = (u.new_story_title or "").strip() - if not title: - return False, "missing_new_title" return True, None @@ -196,10 +193,6 @@ class StoryRouteAgent: new_story_title=decision.new_story_title, reason="invalid_target", ) - if decision.decision == "new_story" and not ( - decision.new_story_title and decision.new_story_title.strip() - ): - decision.new_story_title = None return decision def plan_batch( diff --git a/api/app/agents/stage_constants.py b/api/app/agents/stage_constants.py index 3b9f50c..a3bebd6 100644 --- a/api/app/agents/stage_constants.py +++ b/api/app/agents/stage_constants.py @@ -61,3 +61,14 @@ STAGE_TO_ORDER = { "beliefs": 6, "summary": 7, } + +CATEGORY_TO_CHAT_STAGE: dict[str, str] = { + "childhood": "childhood", + "education": "education", + "career_early": "career", + "career_achievement": "career", + "career_challenge": "career", + "family": "family", + "beliefs": "belief", + "summary": "belief", +} diff --git a/api/app/core/config.py b/api/app/core/config.py index 452d7c4..34882c9 100644 --- a/api/app/core/config.py +++ b/api/app/core/config.py @@ -205,9 +205,8 @@ class Settings(BaseSettings): evidence_top_k_default: int = Field(default=10, ge=1, le=50) evidence_top_k_large_batch: int = Field(default=5, ge=1, le=50) evidence_large_batch_threshold: int = Field(default=3, ge=1, le=100) - # 叙事输出相对口述极端过短才回退(仅防极端压缩;0.3 = 模型输出不到口述 30% 才触发) - memoir_narrative_fallback_body_ratio: float = 0.3 - memoir_narrative_fallback_min_chars: int = 15 + # Story/Chapter 标题在正文达到此字数后才由 LLM 生成;之前用占位符 + story_title_min_body_chars: int = Field(default=60, ge=0, le=10_000) # 回忆录 Celery:累计 strip 后口述字数未达此值则暂缓提交(0=关闭,仅防抖后提交) memoir_segment_batch_min_chars: int = Field(default=50, ge=0, le=50_000) # 本批首条 segment 入队起最长等待(秒),超时则提交(即使字数不足) diff --git a/api/app/features/memoir/story_pipeline_sync.py b/api/app/features/memoir/story_pipeline_sync.py index bf457da..01a7c53 100644 --- a/api/app/features/memoir/story_pipeline_sync.py +++ b/api/app/features/memoir/story_pipeline_sync.py @@ -19,7 +19,11 @@ from app.agents.memoir.prompts import ( format_evidence_chunks_for_prompt, format_narrative_user_content, ) -from app.agents.stage_constants import STAGE_TO_ORDER +from app.agents.stage_constants import ( + CATEGORY_TO_CHAT_STAGE, + CHAPTER_CATEGORIES, + STAGE_TO_ORDER, +) from app.agents.memoir.story_route_agent import ( PLAN_BATCH_MAX_SEGMENTS, StoryBatchPlan, @@ -53,6 +57,38 @@ from app.features.story.sync_write import ( logger = get_logger(__name__) +def _placeholder_title(chapter_category: str) -> str: + return CHAPTER_CATEGORIES.get(chapter_category, chapter_category) + + +def _maybe_generate_title( + narrative_agent: "NarrativeAgent", + *, + chapter_category: str, + md: str, + slot_snippets: dict[str, str], + user_profile: str, + user_birth_year: int | None, + llm: Any, +) -> str: + """Generate a title only when body is long enough; otherwise return placeholder.""" + body_len = len((md or "").strip()) + if body_len < settings.story_title_min_body_chars: + return _placeholder_title(chapter_category) + content_excerpt = (md or "").strip()[:300] + merged_slots = dict(slot_snippets) + if content_excerpt and "content_excerpt" not in merged_slots: + merged_slots["content_excerpt"] = content_excerpt + return narrative_agent.generate_title( + stage=chapter_category, + emotion="neutral", + slots=merged_slots, + user_profile=user_profile, + birth_year=user_birth_year, + llm=llm, + ) + + def _route_segment_texts(category_segments: list) -> list[tuple[str, str]]: """批量路由 plan_batch:每段仅做规则归一,避免 N 次 LLM。""" out: list[tuple[str, str]] = [] @@ -122,28 +158,12 @@ def _gate_narrative_fidelity( return _fidelity_fallback_json(o, ex), "fidelity_failed" -def _should_fallback_to_transcript(md: str, oral: str) -> bool: - """模型输出相对口述极度过短时才回退(仅防极端压缩如「1999」)。""" - o = (oral or "").strip() - if not o: - return False - m = (md or "").strip() - if not m: - return True - if len(o) < 12: - return len(m) < len(o) - ratio = float(settings.memoir_narrative_fallback_body_ratio) - min_abs = int(settings.memoir_narrative_fallback_min_chars) - threshold = max(min_abs, int(len(o) * ratio)) - return len(m) < threshold - - def _coalesce_story_markdown( md: str, oral: str, existing_for_narrative: str, ) -> str: - """落库前对齐正文:空输出或过短回退时,续写场景保留「已有故事 + 本段口述」。""" + """落库前对齐正文:空输出时续写场景保留「已有故事 + 本段口述」。""" o = (oral or "").strip() ex = (existing_for_narrative or "").strip() m = (md or "").strip() @@ -153,10 +173,6 @@ def _coalesce_story_markdown( if o: return o return ex - if o and _should_fallback_to_transcript(m, o): - if ex: - return f"{ex}\n\n{o}" - return o return m @@ -181,8 +197,10 @@ def _apply_narrative_fallbacks( *, chapter_category: str, ) -> tuple[str, str]: - """返回 (文本, fallback_type);无改写时为 none。""" - # 整篇合并(JSON)输出异常缩水:回退为旧文 + 本段口述,避免覆盖丢失 + """返回 (文本, fallback_type);无改写时为 none。 + + 仅防 merge/append 场景下模型输出极端缩水(丢旧内容),不再按口述字数比例回退。 + """ if existing_for_narrative and _is_json_narrative(narrative_raw): merged_md = narrative_to_markdown(narrative_raw).strip() ex = (existing_for_narrative or "").strip() @@ -209,28 +227,6 @@ def _apply_narrative_fallbacks( "coalesce_to_old_plus_oral", ) - md_check = narrative_to_markdown(narrative_raw).strip() - oral = (combined_unit_text or "").strip() - ex_fb = (existing_for_narrative or "").strip() - if oral and _should_fallback_to_transcript(md_check, oral): - if ex_fb: - logger.warning( - "event=narrative_fallback reason=body_too_short_vs_oral_merge " - "chapter_category={} oral_len={} md_len={}", - chapter_category, - len(oral), - len(md_check), - ) - return f"{ex_fb}\n\n{oral}", "coalesce_to_old_plus_oral" - logger.warning( - "event=narrative_fallback reason=body_too_short_vs_oral " - "chapter_category={} oral_len={} md_len={}", - chapter_category, - len(oral), - len(md_check), - ) - return oral, "coalesce_to_oral" - return narrative_raw, "none" @@ -404,16 +400,15 @@ def _run_batch_plan_writes( sid_log = target_story_id is_append = True else: - story_title = (unit.new_story_title or "").strip() - if not story_title: - story_title = narrative_agent.generate_title( - stage=chapter_category, - emotion="neutral", - slots=slot_snippets, - user_profile=user_profile, - birth_year=user_birth_year, - llm=llm, - ) + story_title = _maybe_generate_title( + narrative_agent, + chapter_category=chapter_category, + md=md, + slot_snippets=slot_snippets, + user_profile=user_profile, + user_birth_year=user_birth_year, + llm=llm, + ) st = create_story_with_version_sync( session, user_id=user_id, @@ -519,7 +514,8 @@ def run_story_pipeline_for_category_batch( chapter = session.execute(stmt_chapter).unique().scalar_one_or_none() slot_snippets: dict[str, str] = {} - stage_slots = state.slots.get(chapter_category, {}) or {} + chat_stage = CATEGORY_TO_CHAT_STAGE.get(chapter_category, chapter_category) + stage_slots = state.slots.get(chat_stage, {}) or {} for key, value in stage_slots.items(): snip = getattr(value, "snippet", None) or ( value.get("snippet") if isinstance(value, dict) else None @@ -527,17 +523,7 @@ def run_story_pipeline_for_category_batch( if snip: slot_snippets[key] = snip - title = chapter.title if chapter else f"{chapter_category} 回忆" - - if not chapter: - title = narrative_agent.generate_title( - stage=chapter_category, - emotion="neutral", - slots=slot_snippets, - user_profile=user_profile, - birth_year=user_birth_year, - llm=llm, - ) + title = chapter.title if chapter else _placeholder_title(chapter_category) # 仅同 chapter_category(story.stage)的 Story 可作为 append 候选,避免跨章节链接导致多章内容相同 all_stories = list_active_stories_for_user_sync(session, user_id) @@ -684,16 +670,15 @@ def run_story_pipeline_for_category_batch( sid_log = target_story_id is_append = True else: - story_title = (route.new_story_title or "").strip() - if not story_title: - story_title = narrative_agent.generate_title( - stage=chapter_category, - emotion="neutral", - slots=slot_snippets, - user_profile=user_profile, - birth_year=user_birth_year, - llm=llm, - ) + story_title = _maybe_generate_title( + narrative_agent, + chapter_category=chapter_category, + md=md, + slot_snippets=slot_snippets, + user_profile=user_profile, + user_birth_year=user_birth_year, + llm=llm, + ) st = create_story_with_version_sync( session, user_id=user_id, diff --git a/api/app/features/memory/evidence_format.py b/api/app/features/memory/evidence_format.py index 8a66dcf..985d352 100644 --- a/api/app/features/memory/evidence_format.py +++ b/api/app/features/memory/evidence_format.py @@ -4,6 +4,7 @@ from __future__ import annotations +import json import re @@ -46,6 +47,25 @@ def dedupe_evidence_chunk_rows(chunks: list) -> list: return [x[1] for x in kept] +def _flatten_object_json(obj_raw: object) -> str: + """Extract readable text from fact object_json (may be dict, JSON string, or plain str).""" + if isinstance(obj_raw, dict): + return str(obj_raw.get("value", "")) or ", ".join( + f"{k}={v}" for k, v in obj_raw.items() if v + ) + if isinstance(obj_raw, str): + s = obj_raw.strip() + if s.startswith("{"): + try: + parsed = json.loads(s) + if isinstance(parsed, dict): + return str(parsed.get("value", s)) or s + except (json.JSONDecodeError, TypeError): + pass + return s + return str(obj_raw) if obj_raw else "" + + def format_evidence_chunks_for_prompt(evidence: dict) -> str: """将 retrieve_evidence / retrieve_evidence_sync 结果格式化为简短文本,供叙事与访谈 prompt 使用。 @@ -75,11 +95,15 @@ def format_evidence_chunks_for_prompt(evidence: dict) -> str: if isinstance(f, dict): subj = f.get("subject", "") pred = f.get("predicate", "") - obj = f.get("object_json", "") + obj_raw = f.get("object_json", "") + obj = _flatten_object_json(obj_raw) if subj or pred: - parts.append(f"{subj} {pred} {obj}") + if obj: + parts.append(f"{subj}:{pred}({obj})") + else: + parts.append(f"{subj}:{pred}") else: - parts.append(f"{getattr(f, 'subject', '')} {getattr(f, 'predicate', '')}") + parts.append(f"{getattr(f, 'subject', '')}:{getattr(f, 'predicate', '')}") for t in timeline[:5]: if isinstance(t, dict): title = (t.get("title") or "").strip() diff --git a/api/tests/test_experience_regressions.py b/api/tests/test_experience_regressions.py index c412c3d..4f186ce 100644 --- a/api/tests/test_experience_regressions.py +++ b/api/tests/test_experience_regressions.py @@ -186,12 +186,6 @@ class TestMemoirStyleRegressions: ) assert "文采服务于真实" in prompt or "虚构描写" in prompt - def test_fallback_ratio_is_lenient(self) -> None: - """fallback 阈值应该宽松——只有极端压缩才触发,正常书面化改写不触发。""" - oral = "我一九九九年出生在上海,后来搬到苏州。小学时爷爷常带我去河边散步。" - half_length_md = oral[: len(oral) // 2 + 5] - assert not sps._should_fallback_to_transcript(half_length_md, oral) - def test_merge_shrink_only_on_extreme_loss(self) -> None: """合并场景只有在极端缩水时才触发 fallback,不因正常重组而退回。""" existing = "这是一段已有的故事正文,讲述了童年在河边的回忆。" * 20 diff --git a/api/tests/test_narrative_pipeline.py b/api/tests/test_narrative_pipeline.py index ee9d303..badb4c9 100644 --- a/api/tests/test_narrative_pipeline.py +++ b/api/tests/test_narrative_pipeline.py @@ -1,6 +1,4 @@ -"""叙事分区、口述过短回退、配图字数门闸(纯函数/无 DB)。""" - -import pytest +"""叙事分区、merge_shrink 回退、配图字数门闸(纯函数/无 DB)。""" from app.agents.memoir.prompts import format_narrative_user_content from app.features.memoir import story_pipeline_sync as sps @@ -18,16 +16,6 @@ def test_format_narrative_user_content_with_evidence() -> None: assert "非本段口述" in out -def test_should_fallback_to_transcript_short_md( - monkeypatch: pytest.MonkeyPatch, -) -> None: - monkeypatch.setattr(sps.settings, "memoir_narrative_fallback_body_ratio", 0.5) - monkeypatch.setattr(sps.settings, "memoir_narrative_fallback_min_chars", 20) - oral = "我一九九九年出生在上海,后来全家搬到苏州生活了好几年。" - assert sps._should_fallback_to_transcript("1999", oral) is True - assert sps._should_fallback_to_transcript(oral, oral) is False - - def test_apply_narrative_fallbacks_merge_shrink_appends_oral() -> None: """整篇合并 JSON 输出过短:保留旧文并拼本段口述。""" long_existing = "x" * 500 @@ -42,20 +30,27 @@ def test_apply_narrative_fallbacks_merge_shrink_appends_oral() -> None: assert "新口述补充" in out -def test_apply_narrative_fallbacks_json_too_short_returns_oral( - monkeypatch: pytest.MonkeyPatch, -) -> None: - monkeypatch.setattr(sps.settings, "memoir_narrative_fallback_body_ratio", 0.5) - monkeypatch.setattr(sps.settings, "memoir_narrative_fallback_min_chars", 20) +def test_apply_narrative_fallbacks_short_output_no_longer_falls_back() -> None: + """短口述的正常改写不应被回退到口述原文。""" oral = "我1999年出生在上海,小学时爷爷常带我去河边散步。" - raw = '{"paragraphs": [{"content": "1999"}]}' - out, _ft = sps._apply_narrative_fallbacks( - raw, - oral, - "", - chapter_category="childhood", + raw = '{"paragraphs": [{"content": "1999年,我出生在上海。"}]}' + out, ft = sps._apply_narrative_fallbacks( + raw, oral, "", chapter_category="childhood" ) - assert out.strip() == oral + assert ft == "none" + assert "1999" in out + + +def test_coalesce_story_markdown_empty_md_falls_back_to_oral() -> None: + """模型返回空 paragraphs 时仍回退到口述原文。""" + md = sps._coalesce_story_markdown("", "口述原文", "") + assert md == "口述原文" + + +def test_coalesce_story_markdown_nonempty_md_kept() -> None: + """非空改写不再按字数比例回退。""" + md = sps._coalesce_story_markdown("改写后的短文本", "原始口述比较长的一段话", "") + assert md == "改写后的短文本" def test_memoir_image_settings_min_body_field() -> None: