Files
life-echo/api/app/features/memory/evidence_format.py
Kevin 69a673e6c6 feat(api): 访谈人格/回复长度策略、口述归一、背景语气与输入净稿全链路
Chat 访谈
- 新增 persona 系统(default / warm_listener / curious_guide)与 background_voice 语气层
- 回复长度由 compute_reply_plan 统一决策(brief / standard / expanded),融合信息密度启发式
- 输入净稿(input_normalize):编排层可选 rules/llm 归一用户口语后再喂模型与记忆检索
- 记忆证据注入:按用户话检索 memory evidence 并注入 prompt

Memoir 回忆录
- 口述归一(oral_normalize):segment 原文保留,story 管线取派生净稿作叙事输入
- segment 入队批次门闸:累计字数 + 最长等待秒数,减少零碎提交
- fidelity_check / prompts / narrative_agent 微调
- Alembic 0005:清理跨章节 story 外键

Infra
- Dockerfile 加入 ffmpeg
- pyproject.toml 新增依赖并同步 uv.lock
- .env.example / .env.production 补全新配置项

Tests
- 新增 test_background_voice、test_chat_input_normalize、test_experience_regressions
- 扩展 test_interview_prompts、test_interview_reply_length、test_story_route_oral_invariant

Made-with: Cursor
2026-03-31 23:55:26 +08:00

100 lines
3.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
将 MemoryService.retrieve / evidence bundle 格式化为 prompt 用短文本(叙事与访谈共用)。
"""
from __future__ import annotations
import re
def _normalize_evidence_line(s: str) -> str:
    """Return *s* in a canonical form used for duplicate comparison.

    The text is trimmed, lower-cased, and every run of whitespace is
    collapsed into a single space. A falsy input (``None`` or ``""``)
    yields the empty string.
    """
    trimmed = (s or "").strip()
    lowered = trimmed.lower()
    return re.sub(r"\s+", " ", lowered)
def dedupe_evidence_chunk_rows(chunks: list) -> list:
    """Drop chunk rows whose text is already covered by another row.

    Each row is either a dict with a ``"content"`` key or an object with a
    ``content`` attribute. Rows whose content is empty or whitespace-only
    are discarded. Among the rest, a row is treated as a duplicate when its
    normalized text is a substring of (or equal to) the normalized text of
    a row that has already been kept; rows are examined longest-first, with
    ties broken by original position, so the longest variant survives.

    The sort is O(n log n); the containment scan compares every row against
    each row kept so far.

    Returns the surviving rows in their original relative order.
    """
    candidates: list[tuple[int, str, object]] = []
    for position, row in enumerate(chunks):
        if isinstance(row, dict):
            raw = row.get("content", "")
        else:
            raw = getattr(row, "content", "")
        stripped = (raw or "").strip()
        if stripped:
            candidates.append((position, stripped, row))

    # Nothing to compare against: zero or one usable row.
    if len(candidates) < 2:
        return [row for _, _, row in candidates]

    # Normalize each text once, then visit longest first (ties: earliest row).
    decorated = [
        (_normalize_evidence_line(text), position, row)
        for position, text, row in candidates
    ]
    ordered = sorted(decorated, key=lambda item: (-len(item[0]), item[1]))

    seen_texts: list[str] = []
    survivors: dict[int, object] = {}
    for normalized, position, row in ordered:
        # Every previously kept text is at least as long, so substring
        # containment is the only check needed.
        if any(normalized in longer for longer in seen_texts):
            continue
        seen_texts.append(normalized)
        survivors[position] = row

    # Restore the original relative order of the kept rows.
    return [survivors[position] for position in sorted(survivors)]
def _evidence_text(value: object) -> str:
    """Render one evidence field as stripped text; ``None`` becomes ``""``."""
    return "" if value is None else str(value).strip()


def format_evidence_chunks_for_prompt(evidence: dict) -> str:
    """Format a memory-evidence bundle as short text for a prompt.

    Intended for the result of retrieve_evidence / retrieve_evidence_sync and
    shared by the narrative and interview prompts. Sections are emitted in
    this order, each entry separated by a blank line:

    * ``relevant_chunks``    - first 10 rows, de-duplicated, content only
    * ``relevant_summaries`` - first 3 dicts, prefixed with a summary label
    * ``relevant_facts``     - first 5 entries as "subject predicate object"
    * ``timeline_hints``     - first 5 dicts as "year title description"
    * ``relevant_stories``   - first 3 dicts as "title summary"

    Missing, ``None`` or blank fields are omitted instead of being rendered
    as the text "None" or as stray spaces.

    Args:
        evidence: Mapping holding any of the keys above; missing keys and
            falsy values are treated as empty. A falsy ``evidence`` yields "".

    Returns:
        The formatted text, or an empty string when nothing is usable.
    """
    if not evidence:
        return ""

    raw_chunks = evidence.get("relevant_chunks") or []
    # Only the first 10 rows are considered; skip the dedupe pass when empty.
    chunks = dedupe_evidence_chunk_rows(raw_chunks[:10]) if raw_chunks else []
    summaries = evidence.get("relevant_summaries") or []
    facts = evidence.get("relevant_facts") or []
    timeline = evidence.get("timeline_hints") or []
    stories = evidence.get("relevant_stories") or []

    parts: list[str] = []

    for chunk in chunks:
        if isinstance(chunk, dict):
            content = chunk.get("content", "")
        else:
            content = getattr(chunk, "content", "")
        if content:
            parts.append(content.strip())

    for summary in summaries[:3]:
        if not isinstance(summary, dict):
            continue
        text = (summary.get("content") or "").strip()
        summary_type = (summary.get("summary_type") or "").strip()
        if text:
            label = f"[摘要:{summary_type}]" if summary_type else "[摘要]"
            parts.append(f"{label} {text}")

    for fact in facts[:5]:
        if isinstance(fact, dict):
            subj = _evidence_text(fact.get("subject"))
            pred = _evidence_text(fact.get("predicate"))
            obj = _evidence_text(fact.get("object_json"))
        else:
            # NOTE(review): attribute-style facts have never included the
            # object; kept as-is - confirm whether object_json should be added.
            subj = _evidence_text(getattr(fact, "subject", None))
            pred = _evidence_text(getattr(fact, "predicate", None))
            obj = ""
        # A fact needs a subject or predicate; an object alone is not emitted.
        if subj or pred:
            parts.append(" ".join(x for x in (subj, pred, obj) if x))

    for hint in timeline[:5]:
        if not isinstance(hint, dict):
            continue
        title = (hint.get("title") or "").strip()
        year = hint.get("event_year")
        desc = (hint.get("description") or "").strip()
        year_text = str(year) if year is not None else ""
        line = " ".join(x for x in (year_text, title, desc) if x)
        if line:
            parts.append(line)

    for story in stories[:3]:
        if not isinstance(story, dict):
            continue
        title = (story.get("title") or "").strip()
        summ = (story.get("summary") or "").strip()
        if title or summ:
            parts.append(" ".join(x for x in (title, summ) if x))

    return "\n\n".join(parts)