Files
life-echo/api/app/features/evaluation/eval_trace_format.py
Sully 53e0065e3e refactor(api): TOML 配置 SSOT、统一错误契约、Auth/事务加固与可观测性 (#33)
配置 SSOT(TOML + .env)
统一错误契约
Auth 与事务边界
Redis / Celery 可靠性:业务 Redis(DB/0)与 Celery broker/backend(DB/1)显式拆分;连接池、sync client
可观测性(OpenTelemetry + LGTM)
2026-05-22 13:44:50 +08:00

273 lines
8.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""将证据闭包格式化为评审可读文本,并记录截断/丢弃区块(可审计)。"""
from __future__ import annotations
from app.features.conversation.models import Segment
from app.features.evaluation.constants import eval_cfg
from app.features.evaluation.eval_trace_schemas import (
ChapterEvidenceBundle,
EvidenceFormatMeta,
FormattedMemoirEvidence,
StoryEvidenceBundle,
)
from app.features.memoir.constants import memoir
from app.features.memory.models import (
MemoryChunk,
MemoryFact,
MemorySummary,
TimelineEvent,
)
def _memoir_evidence_char_cap() -> int:
    """Return the character budget applied to memoir evidence text.

    The value is read from ``eval_cfg.judge_memoir_evidence_max_chars``,
    coerced to ``int``, and never allowed to fall below 1000.
    """
    minimum_cap = 1000
    configured_cap = int(eval_cfg.judge_memoir_evidence_max_chars)
    return configured_cap if configured_cap > minimum_cap else minimum_cap
def _approx_tokens(chars: int) -> int:
    """Estimate a token count as one token per four characters.

    The result is clamped so it is never negative.
    """
    quarter = chars // 4
    if quarter < 0:
        return 0
    return quarter
def _segment_message_id_header(seg: Segment) -> str:
    """Build the message-id suffix shown in a segment header.

    The ids come from the first entry of ``seg.lineage_json["turns"]`` when
    that entry is a dict. If no user message id is found there, the
    segment's own ``user_message_id`` attribute is used instead.

    Returns:
        ``"user_msg=<id> · assistant_msg=<id>"`` with whichever parts are
        available, or an empty string when neither id is known.
    """

    def _clean(value: object) -> str | None:
        # Blank / missing ids collapse to None so they are skipped below.
        text = str(value or "").strip()
        return text or None

    user_msg_id: str | None = None
    assistant_msg_id: str | None = None

    lineage = getattr(seg, "lineage_json", None)
    turns = lineage.get("turns") if isinstance(lineage, dict) else None
    if isinstance(turns, list) and turns and isinstance(turns[0], dict):
        first_turn = turns[0]
        user_msg_id = _clean(first_turn.get("user_message_id"))
        assistant_msg_id = _clean(first_turn.get("assistant_message_id"))

    if user_msg_id is None:
        # Fall back to the attribute stored directly on the segment.
        fallback = getattr(seg, "user_message_id", None)
        if fallback:
            user_msg_id = str(fallback)

    labelled = (("user_msg", user_msg_id), ("assistant_msg", assistant_msg_id))
    return " · ".join(f"{label}={value}" for label, value in labelled if value)
def build_segment_transcript(
    segments: list[Segment],
    ai_by_segment: dict[str, str],
) -> str:
    """Render one interview block per segment (user utterance + AI reply).

    Args:
        segments: Segments to render, in output order. Numbering in the
            headers starts at 1.
        ai_by_segment: AI replies keyed by ``str(segment.id)``. When a
            segment has no (non-empty) entry, ``segment.agent_response`` is
            used instead.

    Returns:
        The blocks joined by blank lines. Each block is a ``### Segment``
        header followed by a ``用户:`` line and an ``AI:`` line; missing text
        is replaced by a parenthesised placeholder.
    """
    blocks: list[str] = []
    for i, seg in enumerate(segments, start=1):
        uid = str(seg.id)
        user_txt = (seg.user_input_text or "").strip()
        ai_txt = (ai_by_segment.get(uid) or seg.agent_response or "").strip()
        id_extra = _segment_message_id_header(seg)
        head = f"### Segment {i} · id={uid} · conversation={seg.conversation_id}" + (
            f" · {id_extra}" if id_extra else ""
        )
        body_u = f"用户: {user_txt}" if user_txt else "用户: (空)"
        # Placeholder parenthesis is closed so it matches the user-side
        # placeholder above (the original literal left it unbalanced).
        body_a = f"AI: {ai_txt}" if ai_txt else "AI: (无日志/无 agent_response)"
        blocks.append(f"{head}\n{body_u}\n{body_a}")
    return "\n\n".join(blocks)
def _clip_text(text: str | None, limit: int) -> str:
    """Strip *text* and cut it to *limit* characters, appending "…" when cut."""
    cleaned = (text or "").strip()
    if len(cleaned) > limit:
        # Mark the cut explicitly so a reader can tell the item was clipped.
        return cleaned[:limit] + "…"
    return cleaned


def build_structured_evidence_text(
    *,
    chunks: list[MemoryChunk],
    facts: list[MemoryFact],
    events: list[TimelineEvent],
    summaries: list[MemorySummary],
    max_chars: int | None = None,
) -> tuple[str, bool, list[str]]:
    """Build the structured memory-evidence text block.

    Sections are emitted in a fixed order (chunks, facts, timeline,
    summaries). A section that would push the running total past the
    character cap is dropped as a whole and its tag is recorded; later,
    smaller sections may still be included.

    Args:
        chunks: Memory chunks; each ``content`` is clipped to 1200 characters.
        facts: Memory facts, rendered as ``subject · predicate``.
        events: Timeline events; each ``description`` is clipped to 400
            characters.
        summaries: Memory summaries; each ``content`` is clipped to 2000
            characters.
        max_chars: Character cap for the whole text. When ``None`` the value
            of ``_memoir_evidence_char_cap()`` is used.

    Returns:
        ``(text, truncated, dropped_section_tags)``. ``truncated`` is ``True``
        only when at least one whole section was dropped; per-item clipping
        is signalled in the text itself by a trailing "…".
    """
    cap = max_chars if max_chars is not None else _memoir_evidence_char_cap()
    parts: list[str] = []
    dropped: list[str] = []
    used = 0
    truncated = False

    def _add_section(title: str, body: str) -> None:
        nonlocal used, truncated
        block = f"{title}\n{body}".strip()
        if not block:
            return
        # "+ 2" budgets for the blank-line separator between sections.
        if used + len(block) + 2 > cap:
            truncated = True
            # Tag is the title without its 【】 brackets.
            dropped.append(title.strip("【】").split("·")[0].strip())
            return
        parts.append(block)
        used += len(block) + 2

    if chunks:
        lines = [f"- chunk `{c.id}`: {_clip_text(c.content, 1200)}" for c in chunks]
        _add_section("【记忆片段 chunks】", "\n".join(lines))

    if facts:
        lines = []
        for fact in facts:
            subj = (fact.subject or "").strip()
            pred = (fact.predicate or "").strip()
            # Trailing strip removes the dangling separator when subject
            # and/or predicate is empty.
            lines.append(
                f"- fact `{fact.id}` ({fact.fact_type}): {subj} · {pred}".strip(" ·")
            )
        _add_section("【记忆事实 facts】", "\n".join(lines))

    if events:
        lines = []
        for e in events:
            lines.append(
                f"- timeline `{e.id}`: {e.title} ({e.event_year or e.event_date or ''})"
            )
            if e.description:
                lines.append(f" {_clip_text(e.description, 400)}")
        _add_section("【时间线 timeline】", "\n".join(lines))

    if summaries:
        lines = [
            f"- summary `{s.id}` ({s.summary_type}): {_clip_text(s.content, 2000)}"
            for s in summaries
        ]
        _add_section("【摘要 summaries】", "\n".join(lines))

    return "\n\n".join(parts).strip(), truncated, dropped
def evidence_summary_line(
    *,
    segment_n: int,
    conv_n: int,
    chunk_n: int,
    fact_n: int,
    tl_n: int,
    sum_n: int,
    notes: list[str],
) -> str:
    """Render a one-line ``key=value; …`` summary of evidence counts.

    At most the first three entries of *notes* are appended as a trailing
    ``notes=…`` field; the field is omitted when *notes* is empty.
    """
    counts = (
        ("segments", segment_n),
        ("conversations", conv_n),
        ("chunks", chunk_n),
        ("facts", fact_n),
        ("timeline", tl_n),
        ("summaries", sum_n),
    )
    fields = [f"{label}={value}" for label, value in counts]
    if notes:
        shown_notes = "; ".join(notes[:3])
        fields.append("notes=" + shown_notes)
    return "; ".join(fields)
def _format_bundle_for_judge(
    bundle: ChapterEvidenceBundle | StoryEvidenceBundle,
    *,
    transcript: str,
    chunks: list[MemoryChunk],
    facts: list[MemoryFact],
    events: list[TimelineEvent],
    summaries: list[MemorySummary],
) -> FormattedMemoirEvidence:
    """Shared implementation behind the chapter and story formatters.

    The transcript and the structured evidence are each limited to
    ``_memoir_evidence_char_cap()`` characters independently. Anything cut
    or dropped is recorded in the returned ``format_meta``.
    """
    ev_cap = _memoir_evidence_char_cap()
    dropped: list[str] = []
    truncated = False

    t_in = transcript.strip()
    if len(t_in) > ev_cap:
        # Keep the head of the transcript and append a visible cut marker.
        truncated = True
        dropped.append("source_transcript_tail")
        t_in = t_in[:ev_cap] + "\n\n…(原始对话证据已截断)"

    struct, s_trunc, s_drop = build_structured_evidence_text(
        chunks=chunks,
        facts=facts,
        events=events,
        summaries=summaries,
        max_chars=ev_cap,
    )
    if s_trunc:
        truncated = True
        dropped.extend(s_drop)

    meta = EvidenceFormatMeta(
        truncated=truncated,
        # De-duplicated and sorted so the recorded list is deterministic.
        dropped_sections=sorted(set(dropped)),
        included_token_estimate=_approx_tokens(len(t_in) + len(struct)),
        transcript_chars_included=len(t_in),
        structured_evidence_chars_included=len(struct),
    )
    summary = evidence_summary_line(
        segment_n=len(bundle.segment_ids),
        conv_n=len(bundle.conversation_ids),
        chunk_n=len(bundle.memory_chunk_ids),
        fact_n=len(bundle.memory_fact_ids),
        tl_n=len(bundle.timeline_event_ids),
        sum_n=len(bundle.summary_ids),
        notes=bundle.notes,
    )
    return FormattedMemoirEvidence(
        source_transcript=t_in,
        structured_evidence=struct,
        format_meta=meta,
        evidence_summary=summary,
    )


def format_chapter_for_judge(
    bundle: ChapterEvidenceBundle,
    *,
    transcript: str,
    chunks: list[MemoryChunk],
    facts: list[MemoryFact],
    events: list[TimelineEvent],
    summaries: list[MemorySummary],
) -> FormattedMemoirEvidence:
    """Format a chapter's evidence bundle into judge-readable text.

    Args:
        bundle: Chapter evidence bundle; only its id lists and ``notes`` are
            read, to build the summary line.
        transcript: Source interview transcript text.
        chunks: Memory chunks to render as structured evidence.
        facts: Memory facts to render as structured evidence.
        events: Timeline events to render as structured evidence.
        summaries: Memory summaries to render as structured evidence.

    Returns:
        The capped transcript, the structured evidence text, formatting
        metadata (truncation flag, dropped sections, size estimates) and a
        one-line evidence summary.
    """
    return _format_bundle_for_judge(
        bundle,
        transcript=transcript,
        chunks=chunks,
        facts=facts,
        events=events,
        summaries=summaries,
    )


def format_story_for_judge(
    bundle: StoryEvidenceBundle,
    *,
    transcript: str,
    chunks: list[MemoryChunk],
    facts: list[MemoryFact],
    events: list[TimelineEvent],
    summaries: list[MemorySummary],
) -> FormattedMemoirEvidence:
    """Format a story's evidence bundle into judge-readable text.

    Args:
        bundle: Story evidence bundle; only its id lists and ``notes`` are
            read, to build the summary line.
        transcript: Source interview transcript text.
        chunks: Memory chunks to render as structured evidence.
        facts: Memory facts to render as structured evidence.
        events: Timeline events to render as structured evidence.
        summaries: Memory summaries to render as structured evidence.

    Returns:
        The capped transcript, the structured evidence text, formatting
        metadata (truncation flag, dropped sections, size estimates) and a
        one-line evidence summary.
    """
    return _format_bundle_for_judge(
        bundle,
        transcript=transcript,
        chunks=chunks,
        facts=facts,
        events=events,
        summaries=summaries,
    )