Files
life-echo/api/app/features/memory/evidence_format.py
2026-04-30 16:22:55 +08:00

200 lines
7.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
将 MemoryService.retrieve / evidence bundle 格式化为 prompt 用短文本(叙事与访谈共用)。
"""
from __future__ import annotations
import json
import re
def _normalize_evidence_line(s: str) -> str:
return re.sub(r"\s+", " ", (s or "").strip().lower())
def dedupe_evidence_chunk_rows(chunks: list) -> list:
    """Remove chunks whose text is already covered by another chunk.

    Each chunk may be a dict with a ``"content"`` key or an object with a
    ``content`` attribute. Chunks whose content is empty after stripping are
    discarded. The remaining candidates are visited from the longest
    normalized text to the shortest (ties broken by original position); a
    candidate is dropped when its normalized text is a substring of one that
    has already been kept.

    Args:
        chunks: Retrieved chunk rows, in retrieval order.

    Returns:
        The surviving chunk objects, in their original relative order.
    """
    candidates: list[tuple[int, str, object]] = []
    for position, chunk in enumerate(chunks):
        if isinstance(chunk, dict):
            content = chunk.get("content", "")
        else:
            content = getattr(chunk, "content", "")
        stripped = (content or "").strip()
        if stripped:
            candidates.append(
                (position, _normalize_evidence_line(stripped), chunk)
            )

    # Zero or one usable chunk: nothing to compare against.
    if len(candidates) <= 1:
        return [chunk for _, _, chunk in candidates]

    # Longest first, so anything that could contain a candidate has already
    # been examined by the time the candidate is reached.
    by_length = sorted(candidates, key=lambda row: (-len(row[1]), row[0]))

    seen_norms: list[str] = []
    survivors: list[tuple[int, object]] = []
    for position, norm, chunk in by_length:
        if any(norm in longer for longer in seen_norms):
            continue
        seen_norms.append(norm)
        survivors.append((position, chunk))

    # Restore the caller's ordering among the survivors.
    survivors.sort(key=lambda pair: pair[0])
    return [chunk for _, chunk in survivors]
def _flatten_object_json(obj_raw: object) -> str:
"""Extract readable text from fact object_json (may be dict, JSON string, or plain str)."""
if isinstance(obj_raw, dict):
return str(obj_raw.get("value", "")) or ", ".join(
f"{k}={v}" for k, v in obj_raw.items() if v
)
if isinstance(obj_raw, str):
s = obj_raw.strip()
if s.startswith("{"):
try:
parsed = json.loads(s)
if isinstance(parsed, dict):
return str(parsed.get("value", s)) or s
except (json.JSONDecodeError, TypeError):
pass
return s
return str(obj_raw) if obj_raw else ""
def format_user_memory_for_chat_display(
    text: str,
    *,
    verbatim: bool = False,
) -> str:
    """Attribute a memory text to the user without rewriting its content.

    Args:
        text: The memory text; ``None`` or blank input yields ``""``.
        verbatim: When true, present the text as a direct quotation of the
            user, wrapped in corner brackets. Otherwise present it as a
            statement about the user.

    Returns:
        The stripped text with an attribution prefix, or ``""`` if there is
        nothing to show.
    """
    t = (text or "").strip()
    if not t:
        return ""
    if verbatim:
        # Close the quotation so the end of the user's words is unambiguous.
        return f"用户曾说:「{t}」"
    return f"关于用户:{t}"
def format_evidence_chunks_for_chat_prompt(evidence: dict) -> str:
    """Format a retrieval bundle as numbered, attributed lines for chat prompts.

    Reads ``relevant_chunks`` (first 10, de-duplicated), ``relevant_summaries``
    (first 3), ``relevant_facts`` (first 5) and ``relevant_stories`` (first 3)
    from *evidence*. Every emitted entry gets a running ``[M<n>]`` label and
    is passed through ``format_user_memory_for_chat_display`` so it is
    attributed to the user; chunks are quoted verbatim. A fixed header
    instructs the model to cite entries as things the user said.

    Args:
        evidence: Mapping with the optional keys listed above. Chunks may be
            dicts or objects exposing ``content``; summaries, facts and
            stories are only read when they are dicts.

    Returns:
        The header followed by one line per entry, or ``""`` when no entry
        has any content.
    """
    chunks = evidence.get("relevant_chunks") or []
    chunks = dedupe_evidence_chunk_rows(chunks[:10])
    summaries = evidence.get("relevant_summaries") or []
    facts = evidence.get("relevant_facts") or []
    stories = evidence.get("relevant_stories") or []

    header = (
        "【相关记忆摘录·聊天专用】\n"
        # The two bold spans need a separator; back-to-back they form "****".
        "以下编号条目均来自**用户过往口述或系统摘要**,**不是**助手本人经历。\n"
        "承接时**必须**用「你之前提过…」「你说过…」「你刚讲到…」等**归因式**引用;\n"
        "**禁止**改写成「我当时…」「我小时候…」「我演过…」等助手第一人称亲历口吻;"
        "**禁止**把条目当作你与用户的共同回忆或无归因复述。\n"
    )

    lines: list[str] = []
    n = 0  # running entry number shared by all four sections

    for c in chunks:
        content = (
            c.get("content", "") if isinstance(c, dict) else getattr(c, "content", "")
        )
        raw = (content or "").strip()
        if not raw:
            continue
        n += 1
        cid = ""
        if isinstance(c, dict) and c.get("id"):
            # Only a short prefix of the id is shown to keep the line compact.
            cid = str(c.get("id", ""))[:12]
        label = f"[M{n}]" + (f"(id…{cid})" if cid else "")
        safe = format_user_memory_for_chat_display(raw, verbatim=True)
        lines.append(f"{label} {safe}")

    for s in summaries[:3]:
        if not isinstance(s, dict):
            continue
        st = (s.get("content") or "").strip()
        stype = (s.get("summary_type") or "").strip()
        if not st:
            continue
        n += 1
        prefix = f"[摘要:{stype}]" if stype else "[摘要]"
        safe = format_user_memory_for_chat_display(f"{prefix} {st}")
        lines.append(f"[M{n}] {safe}")

    for f in facts[:5]:
        if not isinstance(f, dict):
            continue
        # `or ""` so a key that is present but None is not rendered as "None".
        subj = f.get("subject") or ""
        pred = f.get("predicate") or ""
        obj = _flatten_object_json(f.get("object_json", ""))
        if not (subj or pred):
            continue
        n += 1
        fact_line = f"{subj}{pred}{obj}" if obj else f"{subj}{pred}"
        safe = format_user_memory_for_chat_display(fact_line)
        lines.append(f"[M{n}] {safe}")

    for story in stories[:3]:
        if not isinstance(story, dict):
            continue
        title = (story.get("title") or "").strip()
        summ = (story.get("summary") or "").strip()
        if not (title or summ):
            continue
        n += 1
        safe = format_user_memory_for_chat_display(
            " ".join(x for x in (title, summ) if x)
        )
        lines.append(f"[M{n}] {safe}")

    if not lines:
        return ""
    return header + "\n".join(lines)
def format_evidence_chunks_for_prompt(evidence: dict) -> str:
    """Format a retrieval bundle as short plain text for prompts.

    Reads ``relevant_chunks`` (first 10, de-duplicated), ``relevant_summaries``
    (first 3), ``relevant_facts`` (first 5) and ``relevant_stories`` (first 3)
    from *evidence* and emits one paragraph per non-empty entry.

    Args:
        evidence: Mapping with the optional keys listed above. Chunks and
            facts may be dicts or attribute-style objects; summaries and
            stories are only read when they are dicts.

    Returns:
        The paragraphs joined by blank lines, or ``""`` when there is nothing
        to show.
    """
    chunks = evidence.get("relevant_chunks") or []
    chunks = dedupe_evidence_chunk_rows(chunks[:10])
    summaries = evidence.get("relevant_summaries") or []
    facts = evidence.get("relevant_facts") or []
    stories = evidence.get("relevant_stories") or []

    parts: list[str] = []

    for c in chunks:
        content = (
            c.get("content", "") if isinstance(c, dict) else getattr(c, "content", "")
        )
        if content:
            parts.append(content.strip())

    for s in summaries[:3]:
        if not isinstance(s, dict):
            continue
        st = (s.get("content") or "").strip()
        stype = (s.get("summary_type") or "").strip()
        if st:
            label = f"[摘要:{stype}]" if stype else "[摘要]"
            parts.append(f"{label} {st}")

    for f in facts[:5]:
        # `or ""` so a field that is present but None is not rendered as the
        # literal "None".
        if isinstance(f, dict):
            subj = f.get("subject") or ""
            pred = f.get("predicate") or ""
            obj = _flatten_object_json(f.get("object_json", ""))
        else:
            subj = getattr(f, "subject", "") or ""
            pred = getattr(f, "predicate", "") or ""
            # Attribute-style facts contribute subject and predicate only.
            obj = ""
        # Skip facts with neither field so no empty paragraph is emitted.
        if subj or pred:
            parts.append(f"{subj}{pred}{obj}")

    for story in stories[:3]:
        if not isinstance(story, dict):
            continue
        title = (story.get("title") or "").strip()
        summ = (story.get("summary") or "").strip()
        if title or summ:
            parts.append(" ".join(x for x in (title, summ) if x))

    return "\n\n".join(parts) if parts else ""