Route all memory ingest/retrieve/enrichment/compaction through async MemoryService. Remove legacy sync memory implementations (ingest/retrieve/compaction); Celery and memoir Phase2 call asyncio.run into MemoryService-backed helpers. Memoir Phase1 batch ingest uses MemoryService.ingest_transcripts_batch; drop chapters. evidence_bundle_json mirror (Alembic 0015). Evaluation uses snapshot/link-only bundles; raise EvidenceClosureMissing instead of partial/fallback lineage tiers. Split memoir state into NarrativeCoverageState and InterviewControlState; delete the _interview_meta_store adapter layer. Remove rolling-query and recent-fact fallback settings from config and evidence assembly. Update judges, docs, tests, and PlaygroundPage alignment. Made-with: Cursor
272 lines
8.3 KiB
Python
272 lines
8.3 KiB
Python
"""将证据闭包格式化为评审可读文本,并记录截断/丢弃区块(可审计)。"""
|
||
|
||
from __future__ import annotations
|
||
|
||
from app.core.config import settings
|
||
from app.features.conversation.models import Segment
|
||
from app.features.evaluation.eval_trace_schemas import (
|
||
ChapterEvidenceBundle,
|
||
EvidenceFormatMeta,
|
||
FormattedMemoirEvidence,
|
||
StoryEvidenceBundle,
|
||
)
|
||
from app.features.memory.models import (
|
||
MemoryChunk,
|
||
MemoryFact,
|
||
MemorySummary,
|
||
TimelineEvent,
|
||
)
|
||
|
||
|
||
def _memoir_evidence_char_cap() -> int:
    """Return the character budget used when formatting memoir evidence.

    The budget is read from ``settings.eval_judge_memoir_evidence_max_chars``,
    coerced to ``int``, and never allowed to fall below 1000 characters.
    """
    configured = int(settings.eval_judge_memoir_evidence_max_chars)
    return configured if configured > 1000 else 1000
|
||
|
||
|
||
def _approx_tokens(chars: int) -> int:
    """Estimate a token count from a character count.

    Uses a flat ratio of four characters per token (floor division) and
    clamps the result so it is never negative.
    """
    estimate = chars // 4
    return estimate if estimate > 0 else 0
|
||
|
||
|
||
def _segment_message_id_header(seg: Segment) -> str:
    """Build the message-id fragment appended to a segment heading.

    Ids are taken from the first entry of ``seg.lineage_json["turns"]`` when
    that structure is present and well-formed (dict -> non-empty list ->
    dict). If no user message id is found there, ``seg.user_message_id`` is
    used as a fallback. The assistant message id has no such fallback.

    Returns:
        ``"user_msg=<id> · assistant_msg=<id>"`` with whichever parts are
        available, or an empty string when neither id is known.
    """
    user_id: str | None = None
    assistant_id: str | None = None

    lineage = getattr(seg, "lineage_json", None)
    turns = lineage.get("turns") if isinstance(lineage, dict) else None
    first_turn = turns[0] if isinstance(turns, list) and turns else None
    if isinstance(first_turn, dict):
        # Blank / whitespace-only ids are treated the same as missing ones.
        user_id = str(first_turn.get("user_message_id") or "").strip() or None
        assistant_id = (
            str(first_turn.get("assistant_message_id") or "").strip() or None
        )

    if user_id is None:
        fallback = getattr(seg, "user_message_id", None)
        if fallback:
            user_id = str(fallback)

    labelled = (("user_msg", user_id), ("assistant_msg", assistant_id))
    return " · ".join(f"{label}={value}" for label, value in labelled if value)
|
||
|
||
|
||
def build_segment_transcript(
    segments: list[Segment],
    ai_by_segment: dict[str, str],
) -> str:
    """Render segments as numbered interview blocks (user line + AI reply).

    Args:
        segments: Segments to render, in output order; numbering starts at 1.
        ai_by_segment: AI reply text keyed by ``str(segment.id)``. When a
            segment has no (non-empty) entry here, ``segment.agent_response``
            is used instead.

    Returns:
        One block per segment, separated by blank lines. Each block is a
        ``### Segment …`` heading followed by a user line and an AI line;
        placeholder text is emitted when either side is empty.
    """
    rendered: list[str] = []
    for position, segment in enumerate(segments, start=1):
        segment_id = str(segment.id)
        user_text = (segment.user_input_text or "").strip()
        logged_reply = ai_by_segment.get(segment_id)
        ai_text = (logged_reply or segment.agent_response or "").strip()
        id_suffix = _segment_message_id_header(segment)

        heading = (
            f"### Segment {position} · id={segment_id}"
            f" · conversation={segment.conversation_id}"
        )
        if id_suffix:
            heading = f"{heading} · {id_suffix}"

        if user_text:
            user_line = f"用户: {user_text}"
        else:
            user_line = "用户: (空)"
        if ai_text:
            ai_line = f"AI: {ai_text}"
        else:
            ai_line = "AI: (无日志/无 agent_response)"

        rendered.append("\n".join((heading, user_line, ai_line)))

    return "\n\n".join(rendered)
|
||
|
||
|
||
def build_structured_evidence_text(
    *,
    chunks: list[MemoryChunk],
    facts: list[MemoryFact],
    events: list[TimelineEvent],
    summaries: list[MemorySummary],
    max_chars: int | None = None,
) -> tuple[str, bool, list[str]]:
    """Format structured memory evidence into titled text sections.

    Sections are produced in a fixed order (chunks, facts, timeline,
    summaries) and only for non-empty inputs. Individual items are clipped
    (chunk content 1200 chars, timeline description 400, summary content
    2000) with a trailing ``…``. A section is included only if it fits
    entirely in the remaining budget; otherwise the whole section is dropped
    and recorded, and later (smaller) sections may still be included.

    Args:
        chunks: Memory chunks; ``id`` and ``content`` are read.
        facts: Memory facts; ``id``, ``fact_type``, ``subject`` and
            ``predicate`` are read.
        events: Timeline events; ``id``, ``title``, ``event_year``,
            ``event_date`` and ``description`` are read.
        summaries: Memory summaries; ``id``, ``summary_type`` and ``content``
            are read.
        max_chars: Character budget; when ``None`` the module-level cap from
            ``_memoir_evidence_char_cap()`` is used.

    Returns:
        ``(text, truncated, dropped_section_tags)`` where ``truncated`` is
        true iff at least one section was dropped and the tags are the
        dropped section titles without their ``【】`` brackets.
    """
    budget = _memoir_evidence_char_cap() if max_chars is None else max_chars

    def _clean_clip(value: str | None, limit: int) -> str:
        # Normalise None/whitespace, then hard-clip with an ellipsis marker.
        text = (value or "").strip()
        return text[:limit] + "…" if len(text) > limit else text

    # (title, lines) pairs in output order; empty inputs contribute nothing.
    sections: list[tuple[str, list[str]]] = []

    if chunks:
        chunk_lines = [
            f"- chunk `{chunk.id}`: {_clean_clip(chunk.content, 1200)}"
            for chunk in chunks
        ]
        sections.append(("【记忆片段 chunks】", chunk_lines))

    if facts:
        fact_lines = []
        for fact in facts:
            subject = (fact.subject or "").strip()
            predicate = (fact.predicate or "").strip()
            rendered = f"- fact `{fact.id}` ({fact.fact_type}): {subject} · {predicate}"
            # Trims a dangling " · " when the predicate is empty.
            fact_lines.append(rendered.strip(" ·"))
        sections.append(("【记忆事实 facts】", fact_lines))

    if events:
        event_lines = []
        for event in events:
            when = event.event_year or event.event_date or ""
            event_lines.append(f"- timeline `{event.id}`: {event.title} ({when})")
            if event.description:
                detail = _clean_clip(event.description, 400)
                event_lines.append(f" {detail}")
        sections.append(("【时间线 timeline】", event_lines))

    if summaries:
        summary_lines = [
            f"- summary `{item.id}` ({item.summary_type}): "
            f"{_clean_clip(item.content, 2000)}"
            for item in summaries
        ]
        sections.append(("【摘要 summaries】", summary_lines))

    kept: list[str] = []
    dropped: list[str] = []
    consumed = 0
    for title, lines in sections:
        body = "\n".join(lines)
        block = f"{title}\n{body}".strip()
        if not block:
            continue
        # Each section is charged two extra characters for the "\n\n" joiner.
        cost = len(block) + 2
        if consumed + cost > budget:
            dropped.append(title.strip("【】").split("·")[0].strip())
            continue
        kept.append(block)
        consumed += cost

    return "\n\n".join(kept).strip(), bool(dropped), dropped
|
||
|
||
|
||
def evidence_summary_line(
    *,
    segment_n: int,
    conv_n: int,
    chunk_n: int,
    fact_n: int,
    tl_n: int,
    sum_n: int,
    notes: list[str],
) -> str:
    """Produce a one-line ``key=value`` summary of evidence counts.

    Args:
        segment_n: Number of segments.
        conv_n: Number of conversations.
        chunk_n: Number of memory chunks.
        fact_n: Number of memory facts.
        tl_n: Number of timeline events.
        sum_n: Number of summaries.
        notes: Free-form notes; only the first three are included, and the
            ``notes=`` field is omitted entirely when this is empty.

    Returns:
        The fields joined with ``"; "`` in a fixed order.
    """
    counts = {
        "segments": segment_n,
        "conversations": conv_n,
        "chunks": chunk_n,
        "facts": fact_n,
        "timeline": tl_n,
        "summaries": sum_n,
    }
    fields = [f"{label}={count}" for label, count in counts.items()]
    if notes:
        fields.append("notes=" + "; ".join(notes[:3]))
    return "; ".join(fields)
|
||
|
||
|
||
def _format_bundle_for_judge(
    bundle: ChapterEvidenceBundle | StoryEvidenceBundle,
    *,
    transcript: str,
    chunks: list[MemoryChunk],
    facts: list[MemoryFact],
    events: list[TimelineEvent],
    summaries: list[MemorySummary],
) -> FormattedMemoirEvidence:
    """Shared formatter behind the chapter and story judge entry points.

    The transcript and the structured evidence are each limited to the cap
    from ``_memoir_evidence_char_cap()`` independently. A transcript longer
    than the cap is cut at the cap and a truncation marker is appended (so
    the stored transcript is slightly longer than the cap), and the tag
    ``source_transcript_tail`` is recorded. Sections dropped by
    ``build_structured_evidence_text`` are recorded alongside it.

    Args:
        bundle: Evidence bundle; only its id lists (``segment_ids``,
            ``conversation_ids``, ``memory_chunk_ids``, ``memory_fact_ids``,
            ``timeline_event_ids``, ``summary_ids``) and ``notes`` are read,
            to build the summary line.
        transcript: Raw interview transcript text.
        chunks: Memory chunks to render.
        facts: Memory facts to render.
        events: Timeline events to render.
        summaries: Memory summaries to render.

    Returns:
        The formatted transcript and structured evidence together with
        truncation metadata and a one-line count summary.
    """
    ev_cap = _memoir_evidence_char_cap()
    dropped: list[str] = []
    truncated = False

    t_in = transcript.strip()
    if len(t_in) > ev_cap:
        truncated = True
        dropped.append("source_transcript_tail")
        t_in = t_in[:ev_cap] + "\n\n…(原始对话证据已截断)"

    struct, s_trunc, s_drop = build_structured_evidence_text(
        chunks=chunks,
        facts=facts,
        events=events,
        summaries=summaries,
        max_chars=ev_cap,
    )
    if s_trunc:
        truncated = True
        dropped.extend(s_drop)

    meta = EvidenceFormatMeta(
        truncated=truncated,
        # De-duplicated and sorted so the audit record is deterministic.
        dropped_sections=sorted(set(dropped)),
        included_token_estimate=_approx_tokens(len(t_in) + len(struct)),
        transcript_chars_included=len(t_in),
        structured_evidence_chars_included=len(struct),
    )
    # Counts come from the bundle's id lists, not from the objects actually
    # rendered above.
    summary = evidence_summary_line(
        segment_n=len(bundle.segment_ids),
        conv_n=len(bundle.conversation_ids),
        chunk_n=len(bundle.memory_chunk_ids),
        fact_n=len(bundle.memory_fact_ids),
        tl_n=len(bundle.timeline_event_ids),
        sum_n=len(bundle.summary_ids),
        notes=bundle.notes,
    )
    return FormattedMemoirEvidence(
        source_transcript=t_in,
        structured_evidence=struct,
        format_meta=meta,
        evidence_summary=summary,
    )


def format_chapter_for_judge(
    bundle: ChapterEvidenceBundle,
    *,
    transcript: str,
    chunks: list[MemoryChunk],
    facts: list[MemoryFact],
    events: list[TimelineEvent],
    summaries: list[MemorySummary],
) -> FormattedMemoirEvidence:
    """Format a chapter's evidence bundle for the judge.

    Thin wrapper over ``_format_bundle_for_judge``; see that function for the
    truncation rules and the shape of the returned value.
    """
    return _format_bundle_for_judge(
        bundle,
        transcript=transcript,
        chunks=chunks,
        facts=facts,
        events=events,
        summaries=summaries,
    )


def format_story_for_judge(
    bundle: StoryEvidenceBundle,
    *,
    transcript: str,
    chunks: list[MemoryChunk],
    facts: list[MemoryFact],
    events: list[TimelineEvent],
    summaries: list[MemorySummary],
) -> FormattedMemoirEvidence:
    """Format a story's evidence bundle for the judge.

    Thin wrapper over ``_format_bundle_for_judge``; see that function for the
    truncation rules and the shape of the returned value.
    """
    return _format_bundle_for_judge(
        bundle,
        transcript=transcript,
        chunks=chunks,
        facts=facts,
        events=events,
        summaries=summaries,
    )
|