Files
life-echo/api/app/features/evaluation/eval_trace_service.py
yangshilin e1341c6d18 feat:
1. 建立问题库大纲,对应每个人生阶段槽位
2. 鼓励使用更生活化的交流语言共情与总结
3. 降低评审模型可能发生截断的概率
4. 成稿质量维度强化情感表达和上下文连贯性
2026-04-09 15:32:35 +08:00

555 lines
21 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""组装 Chapter/Story 评测证据闭包并格式化为评审输入。"""
from __future__ import annotations
from typing import Literal
from sqlalchemy.ext.asyncio import AsyncSession
from app.features.conversation import repo as conversation_repo
from app.features.conversation.lineage_schemas import aggregate_lineage_from_segments
from app.features.evaluation.eval_trace_format import (
build_segment_transcript,
format_chapter_for_judge,
format_story_for_judge,
)
from app.features.evaluation.eval_trace_repo import (
fetch_ai_messages_for_segments,
fetch_memory_closure_for_conversations,
fetch_segments_for_user,
get_chapter_for_eval_trace,
get_story_for_eval_trace,
list_chapter_ids_for_story,
load_chunks_by_ids,
load_facts_by_ids,
load_summaries_by_ids,
load_timeline_by_ids,
normalize_source_segment_ids,
story_link_ids_by_type,
)
from app.features.evaluation.eval_trace_schemas import (
ChapterEvidenceBundle,
FormattedMemoirEvidence,
StoryEvidenceBundle,
)
from app.features.memoir.chapter_evidence_snapshot import (
EVIDENCE_SNAPSHOT_SCHEMA_VERSION,
)
from app.features.memoir.models import Chapter
from app.features.story.models import Story, StoryVersion
# Upper bound on how many conversations the legacy fallback transcript is built from.
_MAX_EVIDENCE_CONVERSATIONS = 8
# Character budget for the fallback transcript; longer text is cut at this
# length and a truncation marker is appended.
_MAX_EVIDENCE_TRANSCRIPT_CHARS = 16_000
def _segments_in_order(segments: list, segment_ids: list[str]) -> list:
    """Return ``segments`` sorted to follow the order of ``segment_ids``.

    Args:
        segments: Objects exposing an ``id`` attribute (compared via ``str()``).
        segment_ids: The desired id order. When an id occurs more than once,
            its last position is used.

    Returns:
        A new list. Segments whose id is not listed in ``segment_ids`` are
        placed after every listed one, keeping their relative input order
        (the sort is stable).
    """
    rank = {str(sid): position for position, sid in enumerate(segment_ids)}
    # One past the largest possible index, so unknown ids always sort last no
    # matter how long ``segment_ids`` is (a fixed sentinel such as 9999 would
    # be overtaken by longer id lists).
    unknown_rank = len(segment_ids)
    return sorted(segments, key=lambda seg: rank.get(str(seg.id), unknown_rank))
def _trim_fallback_transcript(text: str) -> str:
    """Strip ``text`` and cap it at ``_MAX_EVIDENCE_TRANSCRIPT_CHARS`` characters.

    A falsy ``text`` yields an empty string. When the stripped text exceeds
    the budget, only the leading characters are kept and a truncation marker
    is appended after a blank line.
    """
    cleaned = "" if not text else text.strip()
    if len(cleaned) > _MAX_EVIDENCE_TRANSCRIPT_CHARS:
        head = cleaned[:_MAX_EVIDENCE_TRANSCRIPT_CHARS]
        return head + "\n\n…(访谈证据已截断)"
    return cleaned
async def fallback_user_transcript_evidence(db: AsyncSession, user_id: str) -> str:
    """Legacy fallback: full text of a bounded number of the user's conversations.

    Intended only as fallback evidence; callers are expected to label the
    resulting evidence with ``tier=fallback``.

    At most ``_MAX_EVIDENCE_CONVERSATIONS`` conversations are taken from the
    front of the repository result and emitted in reverse of that order
    (presumably the repo returns newest first, making the output
    chronological -- confirm against ``get_user_conversations``). Messages
    with blank content are skipped, and a conversation with no remaining
    messages is omitted. The joined text is passed through
    ``_trim_fallback_transcript``.

    Returns:
        The assembled transcript, or ``""`` when the user has no conversations.
    """
    conversations = await conversation_repo.get_user_conversations(user_id, db)
    if not conversations:
        return ""

    sections: list[str] = []
    selected = conversations[:_MAX_EVIDENCE_CONVERSATIONS]
    for conversation in reversed(selected):
        conversation_id = str(conversation.id)
        messages = await conversation_repo.get_conversation_messages(
            conversation_id, db
        )
        lines: list[str] = []
        for message in messages:
            content = (message.content or "").strip()
            if content:
                # Only the "human" role is labelled as the user; every other
                # role is rendered as AI.
                is_human = str(message.role or "").lower() == "human"
                speaker = "用户" if is_human else "AI"
                lines.append(f"{speaker}: {content}")
        if lines:
            joined = "\n\n".join(lines)
            sections.append(f"## 会话 {conversation_id}\n{joined}")

    return _trim_fallback_transcript("\n\n".join(sections))
class EvalTraceService:
    """Build chapter/story evidence bundles and format them as judge input.

    All database access goes through the ``AsyncSession`` given at
    construction; queries are awaited one after another on that session.
    """

    # Snapshot fields whose presence means the snapshot carries usable evidence.
    _CLOSURE_FIELDS = (
        "segment_ids",
        "memory_chunk_ids",
        "memory_fact_ids",
        "timeline_event_ids",
        "summary_ids",
    )

    def __init__(self, db: AsyncSession) -> None:
        self._db = db

    # ------------------------------------------------------------------ helpers

    @staticmethod
    def _clean_ids(values: list | None) -> list[str]:
        """Stringify ids, dropping entries whose string form is blank."""
        return [str(x) for x in (values or []) if str(x).strip()]

    async def _story_dialogue_lineage(
        self,
        st: Story,
        segments: list,
        segment_ids_ordered: list[str],
    ) -> dict | None:
        """Resolve the dialogue lineage for a story.

        Preference order:
          1. ``lineage_json`` of the story's current version, when it is a
             dict with a truthy ``turns`` entry;
          2. a lineage aggregated from ``segments`` (ordered by
             ``segment_ids_ordered``), when both are non-empty;
          3. ``None``.
        """
        version_id = getattr(st, "current_version_id", None)
        if version_id:
            ver = await self._db.get(StoryVersion, version_id)
            stored = getattr(ver, "lineage_json", None) if ver else None
            if isinstance(stored, dict) and stored.get("turns"):
                return stored

        if segments and segment_ids_ordered:
            ordered = _segments_in_order(segments, segment_ids_ordered)
            conv_ids = sorted(
                {str(s.conversation_id) for s in segments if s.conversation_id}
            )
            return aggregate_lineage_from_segments(
                ordered, conversation_id_fallback=conv_ids[0] if conv_ids else None
            )
        return None

    def _chapter_closure_tier(
        self,
        *,
        segment_ids_resolved: list[str],
        chunk_ids: list[str],
        fact_ids: list[str],
        tl_ids: list[str],
        sum_ids: list[str],
    ) -> Literal["strict", "partial", "fallback"]:
        """Classify how complete an evidence closure is.

        Returns:
            ``"strict"`` when both segments and at least one kind of memory
            evidence are present, ``"partial"`` when only one of the two is
            present, and ``"fallback"`` when neither is.
        """
        has_seg = bool(segment_ids_resolved)
        has_mem = bool(chunk_ids or fact_ids or tl_ids or sum_ids)
        if has_seg and has_mem:
            return "strict"
        if has_seg or has_mem:
            return "partial"
        return "fallback"

    def _chapter_bundle_from_snapshot(
        self,
        *,
        user_id: str,
        chapter_id: str,
        live_segment_ids: list[str],
        fields: dict,
        source_note: str,
    ) -> ChapterEvidenceBundle:
        """Build a chapter bundle from already-validated snapshot fields.

        Args:
            user_id: Owner recorded on the resulting bundle.
            chapter_id: Chapter id recorded on the resulting bundle.
            live_segment_ids: The chapter's current source segment ids; when
                they differ (as a set) from the snapshot's, a note is added.
            fields: Mapping with the snapshot's id lists, ``notes`` and
                ``message_lineage_json``; missing or falsy entries are treated
                as empty.
            source_note: Note identifying where the snapshot came from.
        """
        segment_ids = self._clean_ids(fields.get("segment_ids"))
        conv_ids = sorted(set(self._clean_ids(fields.get("conversation_ids"))))
        chunk_ids = self._clean_ids(fields.get("memory_chunk_ids"))
        fact_ids = self._clean_ids(fields.get("memory_fact_ids"))
        tl_ids = self._clean_ids(fields.get("timeline_event_ids"))
        sum_ids = self._clean_ids(fields.get("summary_ids"))

        notes: list[str] = [str(x) for x in (fields.get("notes") or []) if x]
        notes.append(source_note)

        tier = self._chapter_closure_tier(
            segment_ids_resolved=segment_ids,
            chunk_ids=chunk_ids,
            fact_ids=fact_ids,
            tl_ids=tl_ids,
            sum_ids=sum_ids,
        )
        if live_segment_ids and set(live_segment_ids) != set(segment_ids):
            notes.append(
                "live_source_segments_differ_from_snapshot_reconcile_in_pipeline"
            )

        lineage = fields.get("message_lineage_json")
        return ChapterEvidenceBundle(
            user_id=user_id,
            chapter_id=chapter_id,
            segment_ids=segment_ids,
            conversation_ids=conv_ids,
            memory_chunk_ids=chunk_ids,
            memory_fact_ids=fact_ids,
            timeline_event_ids=tl_ids,
            summary_ids=sum_ids,
            lineage_tier=tier,
            notes=notes,
            dialogue_lineage=lineage if isinstance(lineage, dict) else None,
        )

    async def _collect_chapter_segment_ids(
        self, user_id: str, chapter_ids: list[str]
    ) -> list[str]:
        """Gather source segment ids of the given chapters, de-duplicated in order.

        Chapters that cannot be loaded for ``user_id`` are skipped.
        """
        seg_ids: list[str] = []
        for cid in chapter_ids:
            ch = await get_chapter_for_eval_trace(
                self._db, user_id=user_id, chapter_id=cid
            )
            if not ch:
                continue
            seg_ids.extend(normalize_source_segment_ids(ch.source_segments))
        # dict preserves insertion order, giving an order-preserving dedupe.
        return list(dict.fromkeys(seg_ids))

    async def _gather_judge_evidence(
        self, bundle: ChapterEvidenceBundle | StoryEvidenceBundle
    ) -> tuple[ChapterEvidenceBundle | StoryEvidenceBundle, dict]:
        """Load the evidence a bundle points at, as formatter keyword arguments.

        For ``lineage_tier == "fallback"`` the legacy recent-conversations
        transcript is used with no structured evidence, and a copy of the
        bundle carrying an extra note is returned. Otherwise the segment
        transcript and the referenced chunks/facts/events/summaries are
        loaded and the bundle is returned unchanged.

        Returns:
            ``(bundle, kwargs)`` where ``kwargs`` has the keys ``transcript``,
            ``chunks``, ``facts``, ``events`` and ``summaries``.
        """
        if bundle.lineage_tier == "fallback":
            transcript = await fallback_user_transcript_evidence(
                self._db, bundle.user_id
            )
            noted = bundle.model_copy(
                update={
                    "notes": [
                        *bundle.notes,
                        "used_legacy_recent_conversations_transcript",
                    ]
                }
            )
            return noted, {
                "transcript": transcript,
                "chunks": [],
                "facts": [],
                "events": [],
                "summaries": [],
            }

        uid = bundle.user_id
        segs = await fetch_segments_for_user(
            self._db, user_id=uid, segment_ids=bundle.segment_ids
        )
        ai_map = await fetch_ai_messages_for_segments(
            self._db, user_id=uid, segment_ids=[s.id for s in segs]
        )
        transcript = build_segment_transcript(segs, ai_map)
        chunks = await load_chunks_by_ids(
            self._db, user_id=uid, chunk_ids=bundle.memory_chunk_ids
        )
        facts = await load_facts_by_ids(
            self._db, user_id=uid, fact_ids=bundle.memory_fact_ids
        )
        events = await load_timeline_by_ids(
            self._db, user_id=uid, event_ids=bundle.timeline_event_ids
        )
        summaries = await load_summaries_by_ids(
            self._db, user_id=uid, summary_ids=bundle.summary_ids
        )
        return bundle, {
            "transcript": transcript,
            "chunks": chunks,
            "facts": facts,
            "events": events,
            "summaries": summaries,
        }

    # ----------------------------------------------------------------- chapters

    async def build_chapter_bundle(
        self, user_id: str, chapter: Chapter
    ) -> ChapterEvidenceBundle:
        """Assemble the evidence bundle for ``chapter``.

        Sources are tried in this order, and the first usable one wins:
          1. ``chapter.current_evidence_snapshot`` (snapshot table row);
          2. ``chapter.evidence_bundle_json`` (JSON column);
          3. the chapter's live ``source_segments`` plus the memory closure of
             the conversations those segments belong to.

        A snapshot is usable only when its schema version equals
        ``EVIDENCE_SNAPSHOT_SCHEMA_VERSION``, its chapter id matches, its user
        id matches (the JSON column may omit the user id), and it holds at
        least one segment or memory id. When the chapter has no source
        segments either, a ``"fallback"`` bundle without evidence ids is
        returned.
        """
        live_segment_ids = normalize_source_segment_ids(
            getattr(chapter, "source_segments", None)
        )
        chapter_id = str(chapter.id)

        # 1) Snapshot table row.
        row = getattr(chapter, "current_evidence_snapshot", None)
        row_has_closure = bool(row) and any(
            getattr(row, name) for name in self._CLOSURE_FIELDS
        )
        if (
            row is not None
            and str(row.user_id) == str(user_id)
            and str(row.chapter_id) == chapter_id
            and int(row.schema_version or 0) == EVIDENCE_SNAPSHOT_SCHEMA_VERSION
            and row_has_closure
        ):
            row_fields = {
                name: getattr(row, name)
                for name in (*self._CLOSURE_FIELDS, "conversation_ids", "notes")
            }
            row_fields["message_lineage_json"] = getattr(
                row, "message_lineage_json", None
            )
            return self._chapter_bundle_from_snapshot(
                user_id=user_id,
                chapter_id=chapter_id,
                live_segment_ids=live_segment_ids,
                fields=row_fields,
                source_note="evidence_from_chapter_evidence_snapshot_table",
            )

        # 2) Snapshot stored in the chapter's JSON column.
        snap = getattr(chapter, "evidence_bundle_json", None)
        if isinstance(snap, dict):
            snap_uid = str(snap.get("user_id") or "")
            if (
                int(snap.get("schema_version") or 0)
                == EVIDENCE_SNAPSHOT_SCHEMA_VERSION
                and str(snap.get("chapter_id") or "") == chapter_id
                # A snapshot without a user id is accepted as-is.
                and (not snap_uid or snap_uid == str(user_id))
                and any(snap.get(name) for name in self._CLOSURE_FIELDS)
            ):
                return self._chapter_bundle_from_snapshot(
                    user_id=user_id,
                    chapter_id=chapter_id,
                    live_segment_ids=live_segment_ids,
                    fields=snap,
                    source_note="evidence_from_chapter_evidence_bundle_json_column",
                )

        # 3) Derive from the chapter's live source segments.
        notes: list[str] = []
        segment_ids = live_segment_ids
        if not segment_ids:
            notes.append("no_source_segments")
            notes.append("fallback_lineage_transcript_pending")
            return ChapterEvidenceBundle(
                user_id=user_id,
                chapter_id=chapter_id,
                segment_ids=[],
                conversation_ids=[],
                lineage_tier="fallback",
                notes=notes,
                dialogue_lineage=None,
            )

        segments = await fetch_segments_for_user(
            self._db, user_id=user_id, segment_ids=segment_ids
        )
        # When nothing could be fetched, keep the requested ids on the bundle.
        resolved_seg_ids = [s.id for s in segments] or segment_ids
        if len(segments) < len(segment_ids):
            notes.append("some_segments_missing_or_foreign_user")
        conv_ids = sorted(
            {str(s.conversation_id) for s in segments if s.conversation_id}
        )
        (
            chunk_ids,
            fact_ids,
            tl_ids,
            sum_ids,
        ) = await fetch_memory_closure_for_conversations(
            self._db, user_id=user_id, conversation_ids=conv_ids
        )
        tier = self._chapter_closure_tier(
            segment_ids_resolved=resolved_seg_ids,
            chunk_ids=chunk_ids,
            fact_ids=fact_ids,
            tl_ids=tl_ids,
            sum_ids=sum_ids,
        )
        if tier == "partial":
            notes.append(
                "chapter_source_segments_union_semantics=partial_lineage_until_snapshot"
            )
        elif tier == "strict":
            notes.append("chapter_lineage_strict_segments_plus_memory_closure")

        segs_ord = _segments_in_order(segments, resolved_seg_ids)
        dlg_live = aggregate_lineage_from_segments(
            segs_ord, conversation_id_fallback=conv_ids[0] if conv_ids else None
        )
        return ChapterEvidenceBundle(
            user_id=user_id,
            chapter_id=chapter_id,
            segment_ids=resolved_seg_ids,
            conversation_ids=conv_ids,
            memory_chunk_ids=chunk_ids,
            memory_fact_ids=fact_ids,
            timeline_event_ids=tl_ids,
            summary_ids=sum_ids,
            lineage_tier=tier,
            notes=notes,
            dialogue_lineage=dlg_live,
        )

    async def format_chapter_bundle(
        self, bundle: ChapterEvidenceBundle
    ) -> tuple[FormattedMemoirEvidence, ChapterEvidenceBundle]:
        """Load the evidence referenced by ``bundle`` and format it for the judge.

        For a ``"fallback"`` bundle the legacy recent-conversations transcript
        is loaded here and a note recording that is added to the returned
        bundle copy; otherwise the segment transcript and structured evidence
        are loaded and the bundle is returned unchanged.

        Returns:
            ``(formatted_evidence, bundle)``.
        """
        bundle, evidence = await self._gather_judge_evidence(bundle)
        return format_chapter_for_judge(bundle, **evidence), bundle

    # ------------------------------------------------------------------ stories

    async def build_story_bundle(
        self, user_id: str, story_id: str
    ) -> StoryEvidenceBundle:
        """Assemble the evidence bundle for a story.

        * Story not found: ``"fallback"`` bundle noted ``story_not_found``.
        * Story has evidence links: the links supply the structured evidence;
          transcript context comes from the source segments of the story's
          chapters. Tier is ``"strict"`` when at least one such segment
          resolves, else ``"partial"``.
        * No links but chapters have source segments: ``"partial"`` bundle
          derived from those segments and their conversations' memory closure.
        * Otherwise: ``"fallback"`` bundle.
        """
        st = await get_story_for_eval_trace(
            self._db, user_id=user_id, story_id=story_id
        )
        if not st:
            return StoryEvidenceBundle(
                user_id=user_id,
                story_id=story_id,
                lineage_tier="fallback",
                notes=["story_not_found"],
                dialogue_lineage=None,
            )

        links = list(st.evidence_links or [])
        lc, lf, lt, ls = story_link_ids_by_type(links)
        notes: list[str] = []
        chapter_ids = await list_chapter_ids_for_story(
            self._db, user_id=user_id, story_id=str(st.id)
        )
        dedup_seg = await self._collect_chapter_segment_ids(user_id, chapter_ids)

        if lc or lf or lt or ls:
            # Structured evidence follows the links; the conversation-level
            # transcript is narrowed to the attached chapters' source segments.
            segments = await fetch_segments_for_user(
                self._db, user_id=user_id, segment_ids=dedup_seg
            )
            conv_ids = sorted(
                {str(s.conversation_id) for s in segments if s.conversation_id}
            )
            if dedup_seg and not segments:
                notes.append("chapter_segment_ids_unresolved")
            if conv_ids:
                notes.append("transcript_from_chapter_source_segments")
            else:
                notes.append("no_chapter_segments_for_transcript_context")

            story_tier: Literal["strict", "partial", "fallback"] = "strict"
            if not segments:
                notes.append("structured_evidence_without_bound_transcript")
                story_tier = "partial"

            dlg = await self._story_dialogue_lineage(st, segments, dedup_seg)
            return StoryEvidenceBundle(
                user_id=user_id,
                story_id=str(st.id),
                segment_ids=[s.id for s in segments] or dedup_seg,
                conversation_ids=conv_ids,
                memory_chunk_ids=lc,
                memory_fact_ids=lf,
                timeline_event_ids=lt,
                summary_ids=ls,
                lineage_tier=story_tier,
                notes=notes,
                augmented_with_chapter_context=bool(chapter_ids),
                story_link_evidence_count=len(links),
                fallback_chapter_ids=chapter_ids,
                dialogue_lineage=dlg,
            )

        # No story evidence links: derive a partial lineage from the chapters'
        # source segments, and fall back entirely when there are none.
        if dedup_seg:
            segments = await fetch_segments_for_user(
                self._db, user_id=user_id, segment_ids=dedup_seg
            )
            conv_ids = sorted(
                {str(s.conversation_id) for s in segments if s.conversation_id}
            )
            (
                chunk_ids,
                fact_ids,
                tl_ids,
                sum_ids,
            ) = await fetch_memory_closure_for_conversations(
                self._db, user_id=user_id, conversation_ids=conv_ids
            )
            notes.append("fallback_lineage_no_story_evidence_links")
            notes.append("augmented_with_chapter_context")
            dlg2 = await self._story_dialogue_lineage(st, segments, dedup_seg)
            return StoryEvidenceBundle(
                user_id=user_id,
                story_id=str(st.id),
                segment_ids=[s.id for s in segments] or dedup_seg,
                conversation_ids=conv_ids,
                memory_chunk_ids=chunk_ids,
                memory_fact_ids=fact_ids,
                timeline_event_ids=tl_ids,
                summary_ids=sum_ids,
                lineage_tier="partial",
                notes=notes,
                augmented_with_chapter_context=True,
                story_link_evidence_count=0,
                fallback_chapter_ids=chapter_ids,
                dialogue_lineage=dlg2,
            )

        notes.append("no_story_evidence_links_and_no_chapter_segments")
        notes.append("fallback_lineage_transcript_pending")
        dlg3 = await self._story_dialogue_lineage(st, [], [])
        return StoryEvidenceBundle(
            user_id=user_id,
            story_id=str(st.id),
            lineage_tier="fallback",
            notes=notes,
            story_link_evidence_count=0,
            fallback_chapter_ids=chapter_ids,
            dialogue_lineage=dlg3,
        )

    async def format_story_bundle(
        self, bundle: StoryEvidenceBundle
    ) -> tuple[FormattedMemoirEvidence, StoryEvidenceBundle]:
        """Load the evidence referenced by ``bundle`` and format it for the judge.

        Evidence is loaded exactly as for chapters: a ``"fallback"`` bundle
        gets the legacy recent-conversations transcript and an extra note on
        the returned bundle copy; any other bundle gets its segment transcript
        and structured evidence and is returned unchanged.

        Returns:
            ``(formatted_evidence, bundle)``.
        """
        bundle, evidence = await self._gather_judge_evidence(bundle)
        return format_story_for_judge(bundle, **evidence), bundle