feat(api): 访谈人格/回复长度策略、口述归一、背景语气与输入净稿全链路

Chat 访谈
- 新增 persona 系统(default / warm_listener / curious_guide)与 background_voice 语气层
- 回复长度由 compute_reply_plan 统一决策(brief / standard / expanded),融合信息密度启发式
- 输入净稿(input_normalize):编排层可选 rules/llm 归一用户口语后再喂模型与记忆检索
- 记忆证据注入:按用户话检索 memory evidence 并注入 prompt

Memoir 回忆录
- 口述归一(oral_normalize):segment 原文保留,story 管线取派生净稿作叙事输入
- segment 入队批次门闸:累计字数 + 最长等待秒数,减少零碎提交
- fidelity_check / prompts / narrative_agent 微调
- Alembic 0005:清理跨章节 story 外键

Infra
- Dockerfile 加入 ffmpeg
- pyproject.toml 新增依赖并同步 uv.lock
- .env.example / .env.production 补全新配置项

Tests
- 新增 test_background_voice、test_chat_input_normalize、test_experience_regressions
- 扩展 test_interview_prompts、test_interview_reply_length、test_story_route_oral_invariant

Made-with: Cursor
This commit is contained in:
Kevin
2026-03-31 23:55:26 +08:00
parent 42ae2a5e91
commit 69a673e6c6
44 changed files with 2998 additions and 259 deletions

View File

@@ -32,6 +32,10 @@ from app.features.memoir.cover_eligibility import chapter_needs_cover_enqueue
from app.features.memoir.memoir_images.settings import MemoirImageSettings
from app.features.memoir.models import Chapter
from app.features.memoir.narrative_to_markdown import narrative_to_markdown
from app.features.memoir.oral_normalize import (
apply_oral_normalization_rules,
normalize_oral_for_memoir,
)
from app.features.memoir.repo import (
mark_chapter_dirty_sync,
reorder_chapter_story_links_by_life_order_sync,
@@ -49,6 +53,23 @@ from app.features.story.sync_write import (
logger = get_logger(__name__)
def _route_segment_texts(category_segments: list) -> list[tuple[str, str]]:
    """Build the ``(segment_id, text)`` pairs handed to the batch route planner.

    Each segment's text goes through ``apply_oral_normalization_rules`` at
    most; no per-segment LLM call is made here, so planning a batch does not
    cost N LLM round trips.

    The rules are applied only when ``settings.memoir_oral_normalize_enabled``
    is truthy and ``settings.memoir_oral_normalize_mode`` (treated as
    ``"rules"`` when unset) is not ``"off"`` after trimming and lower-casing.
    Otherwise the segment text is passed through unchanged. A missing
    ``user_input_text`` is treated as an empty string.

    Args:
        category_segments: Segments exposing ``id`` and ``user_input_text``.

    Returns:
        One ``(str(segment.id), text)`` tuple per segment, in input order.
    """

    def _as_route_pair(segment) -> tuple[str, str]:
        # Start from the raw utterance; swap in the rule-normalized text only
        # when the feature is switched on and the mode is not "off".
        routed_text = segment.user_input_text or ""
        if settings.memoir_oral_normalize_enabled:
            mode = (settings.memoir_oral_normalize_mode or "rules").strip().lower()
            if mode != "off":
                routed_text = apply_oral_normalization_rules(routed_text)
        return str(segment.id), routed_text

    return [_as_route_pair(segment) for segment in category_segments]
def _fidelity_fallback_json(oral: str, existing_canonical: str | None) -> str:
"""忠实度未通过时的安全回退:续写场景保留旧文 + 本段口述,避免只剩一句。"""
o = (oral or "").strip()[:15000]
@@ -102,7 +123,7 @@ def _gate_narrative_fidelity(
def _should_fallback_to_transcript(md: str, oral: str) -> bool:
"""模型输出相对口述明显过短时回退为口述原文(防「1999」类压缩)。"""
"""模型输出相对口述极度过短时回退(仅防极端压缩如「1999」"""
o = (oral or "").strip()
if not o:
return False
@@ -165,7 +186,7 @@ def _apply_narrative_fallbacks(
if existing_for_narrative and _is_json_narrative(narrative_raw):
merged_md = narrative_to_markdown(narrative_raw).strip()
ex = (existing_for_narrative or "").strip()
if ex and len(ex) > 400 and len(merged_md) < len(ex) * 0.35:
if ex and len(ex) > 400 and len(merged_md) < len(ex) * 0.25:
logger.warning(
"event=narrative_fallback reason=merge_shrink action=append_oral "
"chapter_category={}",
@@ -176,7 +197,7 @@ def _apply_narrative_fallbacks(
if (
existing_for_narrative
and not _is_json_narrative(narrative_raw)
and len(narrative_raw) < len(existing_for_narrative) * 0.8
and len(narrative_raw) < len(existing_for_narrative) * 0.5
):
logger.warning(
"event=narrative_fallback reason=length_anomaly action=append_raw "
@@ -290,6 +311,7 @@ def _run_batch_plan_writes(
user_birth_year: int | None,
llm: Any,
narrative_agent: NarrativeAgent,
background_voice: str = "default",
) -> set[str]:
dispatch_ids: set[str] = set()
max_chars = int(settings.story_append_max_canonical_chars)
@@ -297,7 +319,16 @@ def _run_batch_plan_writes(
for unit in plan.units:
t0 = time.perf_counter()
unit_text = _ordered_text_for_segment_ids(category_segments, unit.segment_ids)
new_content_input = format_narrative_user_content(unit_text, evidence_text)
oral_unit = normalize_oral_for_memoir(unit_text, llm=llm)
ut_raw = (unit_text or "").strip()
ut_norm = (oral_unit or "").strip()
if ut_raw != ut_norm:
logger.info(
"event=oral_normalized context=batch_unit raw_len={} norm_len={}",
len(ut_raw),
len(ut_norm),
)
new_content_input = format_narrative_user_content(oral_unit, evidence_text)
target_story_id: str | None = None
existing_for_narrative = ""
@@ -330,6 +361,7 @@ def _run_batch_plan_writes(
user_profile=user_profile,
birth_year=user_birth_year,
llm=llm,
background_voice=background_voice,
)
json_invalid = False
s0 = (raw_gen or "").strip()
@@ -340,14 +372,14 @@ def _run_batch_plan_writes(
json_invalid = True
narrative_raw, fb_gate = _gate_narrative_fidelity(
unit_text,
oral_unit,
raw_gen,
llm,
existing_canonical=existing_for_narrative or None,
)
narrative_raw, fb_apply = _apply_narrative_fallbacks(
narrative_raw,
unit_text,
oral_unit,
existing_for_narrative,
chapter_category=chapter_category,
)
@@ -357,7 +389,7 @@ def _run_batch_plan_writes(
md = _coalesce_story_markdown(
narrative_to_markdown(narrative_raw).strip(),
unit_text.strip(),
oral_unit.strip(),
existing_for_narrative or "",
)
@@ -399,7 +431,7 @@ def _run_batch_plan_writes(
"event=story_generated route_type=batch decision_source={} route_decision={} "
"unit_segments={} used_evidence={} narrative_json_valid={} fidelity_passed={} "
"fallback_type={} oral_len={} md_len={} chapter_category={} is_append={} "
"story_id={} seconds={:.3f}",
"story_id={} seconds={:.3f} oral_normalize_changed={}",
decision_source,
unit.decision,
len(unit.segment_ids),
@@ -407,12 +439,13 @@ def _run_batch_plan_writes(
_is_json_narrative(raw_gen),
fb_gate == "none",
fallback_type,
len(unit_text.strip()),
len(ut_norm),
len(md.strip()),
chapter_category,
is_append,
sid_log,
elapsed,
ut_raw != ut_norm,
)
return dispatch_ids
@@ -427,6 +460,7 @@ def run_story_pipeline_for_category_batch(
user_profile: str,
user_birth_year: int | None,
llm: Any,
background_voice: str = "default",
) -> tuple[Chapter | None, bool, set[str]]:
"""
返回 (chapter, needs_cover_enqueue, story_ids_to_dispatch_after_commit)。
@@ -456,7 +490,16 @@ def run_story_pipeline_for_category_batch(
}
evidence_text = format_evidence_chunks_for_prompt(evidence)
new_content_input = format_narrative_user_content(combined_text, evidence_text)
oral_for_memoir = normalize_oral_for_memoir(combined_text, llm=llm)
ct_raw = (combined_text or "").strip()
om_norm = (oral_for_memoir or "").strip()
if ct_raw != om_norm:
logger.info(
"event=oral_normalized context=category_batch raw_len={} norm_len={}",
len(ct_raw),
len(om_norm),
)
new_content_input = format_narrative_user_content(oral_for_memoir, evidence_text)
stmt_chapter = (
select(Chapter)
@@ -493,15 +536,14 @@ def run_story_pipeline_for_category_batch(
llm=llm,
)
candidates = list_active_stories_for_user_sync(session, user_id)
# 仅同 chapter_category(story.stage)的 Story 可作为 append 候选,避免跨章节链接导致多章内容相同
all_stories = list_active_stories_for_user_sync(session, user_id)
candidates = [s for s in all_stories if s.stage == chapter_category]
valid_ids = {str(s.id) for s in candidates}
story_meta = _story_meta_for_route(session, candidates)
batch_for_route = (
f"{combined_text}\n\n{evidence_text}"
if evidence_text.strip()
else combined_text
)
# Story route 仅依据本批用户口述;evidence 只进入叙事/合并,不参与 new/append 判定。
route_transcript = oral_for_memoir
calculated_order_index = STAGE_TO_ORDER.get(chapter_category, 999)
@@ -512,7 +554,7 @@ def run_story_pipeline_for_category_batch(
)
plan: StoryBatchPlan | None = None
if use_batch_plan:
segs = [(seg.id, seg.user_input_text or "") for seg in category_segments]
segs = _route_segment_texts(category_segments)
plan = route_agent.plan_batch(
chapter_category=chapter_category,
chapter_title=title,
@@ -546,12 +588,13 @@ def run_story_pipeline_for_category_batch(
user_birth_year=user_birth_year,
llm=llm,
narrative_agent=narrative_agent,
background_voice=background_voice,
)
else:
route = route_agent.decide(
chapter_category=chapter_category,
chapter_title=title,
batch_transcript=batch_for_route,
batch_transcript=route_transcript,
candidate_stories=candidates,
llm=llm,
valid_story_ids=valid_ids,
@@ -592,6 +635,7 @@ def run_story_pipeline_for_category_batch(
user_profile=user_profile,
birth_year=user_birth_year,
llm=llm,
background_voice=background_voice,
)
json_invalid = False
s0 = (raw_gen or "").strip()
@@ -602,7 +646,7 @@ def run_story_pipeline_for_category_batch(
json_invalid = True
narrative_raw, fb_gate = _gate_narrative_fidelity(
combined_text,
oral_for_memoir,
raw_gen,
llm,
existing_canonical=existing_for_narrative or None,
@@ -610,7 +654,7 @@ def run_story_pipeline_for_category_batch(
narrative_raw, fb_apply = _apply_narrative_fallbacks(
narrative_raw,
combined_text,
oral_for_memoir,
existing_for_narrative,
chapter_category=chapter_category,
)
@@ -620,7 +664,7 @@ def run_story_pipeline_for_category_batch(
md = _coalesce_story_markdown(
narrative_to_markdown(narrative_raw).strip(),
combined_text.strip(),
oral_for_memoir.strip(),
existing_for_narrative or "",
)
@@ -664,7 +708,7 @@ def run_story_pipeline_for_category_batch(
"event=story_generated route_type=single decision_source={} route_decision={} "
"unit_segments={} used_evidence={} narrative_json_valid={} fidelity_passed={} "
"fallback_type={} oral_len={} md_len={} chapter_category={} is_append={} "
"story_id={} seconds={:.3f}",
"story_id={} seconds={:.3f} oral_normalize_changed={}",
decision_source,
route.decision,
len(category_segments),
@@ -672,12 +716,13 @@ def run_story_pipeline_for_category_batch(
_is_json_narrative(raw_gen),
fb_gate == "none",
fallback_type,
len(combined_text.strip()),
len(om_norm),
len(md.strip()),
chapter_category,
is_append,
sid_log,
elapsed,
ct_raw != om_norm,
)
reorder_chapter_story_links_by_life_order_sync(session, str(chapter.id))