life-echo/api/app/features/evaluation/eval_trace_format.py

"""将证据闭包格式化为评审可读文本，并记录截断/丢弃区块（可审计）。"""

from __future__ import annotations

from app.features.conversation.models import Segment
from app.features.evaluation.eval_trace_schemas import (
    ChapterEvidenceBundle,
    EvidenceFormatMeta,
    FormattedMemoirEvidence,
    StoryEvidenceBundle,
)
from app.features.memory.models import (
    MemoryChunk,
    MemoryFact,
    MemorySummary,
    TimelineEvent,
)

# Aligned with judge_service._MEMOIR_EVIDENCE_MAX: the interview transcript and
# the structured evidence each get their own budget so the total length cannot
# run away. Both caps are character counts (they are compared against len()).
_MEMOIR_TRANSCRIPT_CAP = 12_000
_MEMOIR_STRUCTURED_CAP = 12_000


def _approx_tokens(chars: int) -> int:
    return max(0, chars // 4)


def _segment_message_id_header(seg: Segment) -> str:
    um: str | None = None
    am: str | None = None
    lj = getattr(seg, "lineage_json", None)
    if isinstance(lj, dict):
        turns = lj.get("turns")
        if isinstance(turns, list) and turns:
            t0 = turns[0]
            if isinstance(t0, dict):
                um = str(t0.get("user_message_id") or "").strip() or None
                am = str(t0.get("assistant_message_id") or "").strip() or None
    if um is None:
        raw_um = getattr(seg, "user_message_id", None)
        if raw_um:
            um = str(raw_um)
    parts: list[str] = []
    if um:
        parts.append(f"user_msg={um}")
    if am:
        parts.append(f"assistant_msg={am}")
    return " · ".join(parts) if parts else ""


def build_segment_transcript(
    segments: list[Segment],
    ai_by_segment: dict[str, str],
) -> str:
    """Render one local interview block (user line + AI reply) per segment.

    Each block starts with a ``### Segment N`` heading carrying the segment id,
    its conversation id and, when available, the message ids. The AI text comes
    from ``ai_by_segment`` (keyed by the stringified segment id) and falls back
    to ``seg.agent_response``. Blocks are separated by a blank line.
    """
    rendered: list[str] = []
    for ordinal, segment in enumerate(segments, start=1):
        segment_id = str(segment.id)
        user_text = (segment.user_input_text or "").strip()
        ai_text = (
            ai_by_segment.get(segment_id) or segment.agent_response or ""
        ).strip()
        message_ids = _segment_message_id_header(segment)

        heading = (
            f"### Segment {ordinal} · id={segment_id}"
            f" · conversation={segment.conversation_id}"
        )
        if message_ids:
            heading = f"{heading} · {message_ids}"

        # Explicit placeholders keep empty turns visible to the reviewer.
        if user_text:
            user_line = f"用户: {user_text}"
        else:
            user_line = "用户: （空）"
        if ai_text:
            ai_line = f"AI: {ai_text}"
        else:
            ai_line = "AI: （无日志/无 agent_response）"

        rendered.append("\n".join((heading, user_line, ai_line)))
    return "\n\n".join(rendered)


def build_structured_evidence_text(
    *,
    chunks: list[MemoryChunk],
    facts: list[MemoryFact],
    events: list[TimelineEvent],
    summaries: list[MemorySummary],
    max_chars: int = _MEMOIR_STRUCTURED_CAP,
) -> tuple[str, bool, list[str]]:
    """Render structured memory evidence as reviewer-readable text within a budget.

    Sections are emitted in a fixed order: chunks, facts, timeline, summaries.
    A section is all-or-nothing: if adding it would exceed ``max_chars`` the
    whole section is skipped and its tag is recorded, while later (smaller)
    sections may still be included.

    Args:
        chunks: Memory chunks; each content is clipped to 1200 characters.
        facts: Memory facts, rendered as ``subject · predicate``.
        events: Timeline events; each description is clipped to 400 characters.
        summaries: Memory summaries; each content is clipped to 2000 characters.
        max_chars: Character budget for the combined text.

    Returns:
        ``(text, truncated, dropped_section_tags)`` where ``truncated`` is True
        when at least one section was dropped and ``dropped_section_tags`` lists
        the titles of the dropped sections without their brackets.
    """
    parts: list[str] = []
    dropped: list[str] = []
    used = 0
    truncated = False

    def _clip(text: str | None, limit: int) -> str:
        # Strip, then cut to ``limit`` characters with an ellipsis marker.
        cleaned = (text or "").strip()
        if len(cleaned) > limit:
            return cleaned[:limit] + "…"
        return cleaned

    def _add_section(title: str, body: str) -> None:
        nonlocal used, truncated
        block = f"{title}\n{body}".strip()
        if not block:
            return
        # "+ 2" reserves room for the blank-line separator used by the final join.
        if used + len(block) + 2 > max_chars:
            truncated = True
            dropped.append(title.strip("【】").split("·")[0].strip())
            return
        parts.append(block)
        used += len(block) + 2

    if chunks:
        lines = [f"- chunk `{c.id}`: {_clip(c.content, 1200)}" for c in chunks]
        _add_section("【记忆片段 chunks】", "\n".join(lines))

    if facts:
        lines = []
        for fact in facts:
            # Join only the non-empty parts so a missing subject or predicate
            # never leaves a dangling " · " separator in the line.
            detail = " · ".join(
                piece
                for piece in (
                    (fact.subject or "").strip(),
                    (fact.predicate or "").strip(),
                )
                if piece
            )
            lines.append(f"- fact `{fact.id}` ({fact.fact_type}): {detail}".rstrip())
        _add_section("【记忆事实 facts】", "\n".join(lines))

    if events:
        lines = []
        for event in events:
            heading = f"- timeline `{event.id}`: {event.title}"
            when = event.event_year or event.event_date or ""
            if when:
                # Only show the parenthesised date when there is one.
                heading = f"{heading} ({when})"
            lines.append(heading)
            desc = _clip(event.description, 400)
            if desc:
                # Whitespace-only descriptions are skipped instead of emitting
                # an empty indented line.
                lines.append(f"  {desc}")
        _add_section("【时间线 timeline】", "\n".join(lines))

    if summaries:
        lines = [
            f"- summary `{s.id}` ({s.summary_type}): {_clip(s.content, 2000)}"
            for s in summaries
        ]
        _add_section("【摘要 summaries】", "\n".join(lines))

    return "\n\n".join(parts).strip(), truncated, dropped


def evidence_summary_line(
    *,
    lineage_tier: str,
    segment_n: int,
    conv_n: int,
    chunk_n: int,
    fact_n: int,
    tl_n: int,
    sum_n: int,
    notes: list[str],
) -> str:
    """Return a one-line ``key=value; …`` summary of the evidence counts.

    The lineage tier comes first, followed by the six counters in a fixed
    order. When ``notes`` is non-empty, at most its first three entries are
    appended as a trailing ``notes=…`` field.
    """
    counters = {
        "segments": segment_n,
        "conversations": conv_n,
        "chunks": chunk_n,
        "facts": fact_n,
        "timeline": tl_n,
        "summaries": sum_n,
    }
    fields = [f"tier={lineage_tier}"]
    fields.extend(f"{label}={count}" for label, count in counters.items())
    if notes:
        fields.append("notes=" + "; ".join(notes[:3]))
    return "; ".join(fields)


def _format_evidence_for_judge(
    bundle: ChapterEvidenceBundle | StoryEvidenceBundle,
    *,
    transcript: str,
    chunks: list[MemoryChunk],
    facts: list[MemoryFact],
    events: list[TimelineEvent],
    summaries: list[MemorySummary],
) -> FormattedMemoirEvidence:
    """Shared implementation behind the chapter and story formatters.

    The transcript is stripped and, if longer than ``_MEMOIR_TRANSCRIPT_CAP``
    characters, cut at the cap with a truncation marker appended. Structured
    evidence is rendered by ``build_structured_evidence_text`` under
    ``_MEMOIR_STRUCTURED_CAP``. Every truncation or dropped section is recorded
    in the returned ``format_meta`` so the formatting stays auditable.
    """
    dropped: list[str] = []
    truncated = False

    t_in = transcript.strip()
    if len(t_in) > _MEMOIR_TRANSCRIPT_CAP:
        truncated = True
        dropped.append("source_transcript_tail")
        t_in = t_in[:_MEMOIR_TRANSCRIPT_CAP] + "\n\n…（原始对话证据已截断）"

    struct, s_trunc, s_drop = build_structured_evidence_text(
        chunks=chunks,
        facts=facts,
        events=events,
        summaries=summaries,
        max_chars=_MEMOIR_STRUCTURED_CAP,
    )
    if s_trunc:
        truncated = True
        dropped.extend(s_drop)

    # Character counts are taken after truncation, so the transcript figure
    # includes the appended truncation marker.
    meta = EvidenceFormatMeta(
        truncated=truncated,
        dropped_sections=sorted(set(dropped)),
        included_token_estimate=_approx_tokens(len(t_in) + len(struct)),
        transcript_chars_included=len(t_in),
        structured_evidence_chars_included=len(struct),
    )
    # The summary reports the id counts held by the bundle, not the number of
    # objects that made it into the rendered text.
    summary = evidence_summary_line(
        lineage_tier=bundle.lineage_tier,
        segment_n=len(bundle.segment_ids),
        conv_n=len(bundle.conversation_ids),
        chunk_n=len(bundle.memory_chunk_ids),
        fact_n=len(bundle.memory_fact_ids),
        tl_n=len(bundle.timeline_event_ids),
        sum_n=len(bundle.summary_ids),
        notes=bundle.notes,
    )
    return FormattedMemoirEvidence(
        source_transcript=t_in,
        structured_evidence=struct,
        format_meta=meta,
        evidence_summary=summary,
    )


def format_chapter_for_judge(
    bundle: ChapterEvidenceBundle,
    *,
    transcript: str,
    chunks: list[MemoryChunk],
    facts: list[MemoryFact],
    events: list[TimelineEvent],
    summaries: list[MemorySummary],
) -> FormattedMemoirEvidence:
    """Format a chapter's evidence closure into judge-readable text plus audit metadata."""
    return _format_evidence_for_judge(
        bundle,
        transcript=transcript,
        chunks=chunks,
        facts=facts,
        events=events,
        summaries=summaries,
    )


def format_story_for_judge(
    bundle: StoryEvidenceBundle,
    *,
    transcript: str,
    chunks: list[MemoryChunk],
    facts: list[MemoryFact],
    events: list[TimelineEvent],
    summaries: list[MemorySummary],
) -> FormattedMemoirEvidence:
    """Format a story's evidence closure into judge-readable text plus audit metadata."""
    return _format_evidence_for_judge(
        bundle,
        transcript=transcript,
        chunks=chunks,
        facts=facts,
        events=events,
        summaries=summaries,
    )