feat:

1. 建立问题库大纲，对应每个人生阶段槽位 2. 鼓励使用更生活化的交流语言共情与总结 3. 降低评审模型可能发生截断的概率 4. 成稿质量维度强化情感表达和上下文连贯性
2026-04-09 15:32:35 +08:00
parent 064ad2161d
commit e1341c6d18
49 changed files with 938 additions and 271 deletions
--- a/api/app/features/conversation/lineage_schemas.py
+++ b/api/app/features/conversation/lineage_schemas.py
@@ -141,9 +141,9 @@ def aggregate_lineage_from_segments(
    """
    if not segments:
        return None
-    conv0 = conversation_id_fallback or getattr(
-        segments[0], "conversation_id", None
-    ) or ""
+    conv0 = (
+        conversation_id_fallback or getattr(segments[0], "conversation_id", None) or ""
+    )
    if not conv0:
        lj0 = getattr(segments[0], "lineage_json", None)
        if isinstance(lj0, dict) and lj0.get("conversation_id"):
--- a/api/app/features/conversation/models.py
+++ b/api/app/features/conversation/models.py
@@ -62,7 +62,9 @@ class Segment(Base):
    tts_audio_urls = Column(JSON, nullable=True)
    # 用户轮次 durable message id（与 lineage_json 同步；便于查询）
    user_message_id = Column(
-        String, ForeignKey("conversation_messages.id", ondelete="SET NULL"), nullable=True
+        String,
+        ForeignKey("conversation_messages.id", ondelete="SET NULL"),
+        nullable=True,
    )
    # DialogueLineage JSON（schema 见 conversation.lineage_schemas）
    lineage_json = Column(JSON, nullable=True)
--- a/api/app/features/conversation/ws/pipeline.py
+++ b/api/app/features/conversation/ws/pipeline.py
@@ -523,9 +523,7 @@ async def process_audio_segment(
            if _is_transcribe_failure(transcript_text):
                detail = (transcript_text or "").strip()
                if not detail:
-                    user_msg = (
-                        f"分段 {segment_index} 未识别到语音内容，请重试或检查麦克风与网络"
-                    )
+                    user_msg = f"分段 {segment_index} 未识别到语音内容，请重试或检查麦克风与网络"
                else:
                    user_msg = f"分段 {segment_index} 语音识别失败，请稍后再试"
                await manager.send_message(
@@ -698,9 +696,7 @@ async def process_user_message(
            audio_duration_seconds=audio_dur,
            tts_audio_urls=None,
            segment_id=segment.id,
-            memory_retrieval_trace=getattr(
-                turn, "memory_retrieval_trace", None
-            ),
+            memory_retrieval_trace=getattr(turn, "memory_retrieval_trace", None),
        )
        if not turn_ids:
            logger.warning(
--- a/api/app/features/evaluation/conversation_compare_summary.py
+++ b/api/app/features/evaluation/conversation_compare_summary.py
@@ -5,6 +5,7 @@ from __future__ import annotations
 from typing import Any

 from app.features.evaluation.judge_schemas import ConversationJudgeOutput
+from app.features.evaluation.judge_service import trim_compare_transcript_pair

 _GROUP_KEYS: tuple[tuple[str, str], ...] = (
    ("emotion_score", "情绪与陪伴"),
@@ -44,6 +45,32 @@ def _has_repeat_issue(judge: ConversationJudgeOutput | None) -> bool:
    )


+def _evidence_quality(truncation: dict[str, Any]) -> dict[str, Any]:
+    """Summarize how much of the full dialogue the scores and A/B text cover."""
+    # A side counts as "whole" when its holistic-judging truncation flag is unset.
+    baseline_whole = not truncation["baseline_truncated_for_conversation"]
+    replay_whole = not truncation["replay_truncated_for_conversation"]
+    # The A/B comparison is whole only if neither side was cut for the compare prompt.
+    compare_whole = (
+        not truncation["baseline_truncated_for_compare"]
+        and not truncation["replay_truncated_for_compare"]
+    )
+    # "full" scope requires all three; anything else is reported as "partial".
+    everything_seen = baseline_whole and replay_whole and compare_whole
+    return {
+        "scope": "full" if everything_seen else "partial",
+        "baseline_holistic_covers_full_text": baseline_whole,
+        "replay_holistic_covers_full_text": replay_whole,
+        "ab_compare_covers_full_transcripts": compare_whole,
+        "note_zh": (
+            "评分与 A/B 对比均基于当前注入的全文（在模型上下文内未再裁对话正文）。"
+            if everything_seen
+            else "存在整段或对比环节截断：分数与流式结论仅反映已提交片段；"
+            "评审侧已注入截断边界说明，长程细项应保守。发布决策请结合逐轮分、人工抽查或更高上下文预算。"
+        ),
+    }
+
+
 def build_conversation_compare_summary(
    *,
    baseline_judge: ConversationJudgeOutput | None,
@@ -51,29 +78,42 @@ def build_conversation_compare_summary(
    baseline_transcript: str,
    replay_transcript: str,
    conv_cap: int,
-    compare_cap_each: int,
+    compare_cap_total: int,
+    compare_per_side_cap: int | None = None,
    fixture_filename: str | None = None,
 ) -> dict[str, Any]:
+    _, _, baseline_cmp_trunc, replay_cmp_trunc = trim_compare_transcript_pair(
+        baseline_transcript,
+        replay_transcript,
+        total_max_chars=int(compare_cap_total),
+        per_side_max_chars=compare_per_side_cap,
+    )
+    if compare_per_side_cap and compare_per_side_cap > 0:
+        each_hint = int(compare_per_side_cap)
+    else:
+        each_hint = max(1, int(compare_cap_total) // 2)
+
    truncation = {
        "baseline_chars": len((baseline_transcript or "").strip()),
        "replay_chars": len((replay_transcript or "").strip()),
        "conversation_cap_chars": int(conv_cap),
-        "compare_cap_each_chars": int(compare_cap_each),
+        "compare_cap_total_chars": int(compare_cap_total),
+        "compare_cap_each_chars": each_hint,
        "baseline_truncated_for_conversation": len((baseline_transcript or "").strip())
        > int(conv_cap),
        "replay_truncated_for_conversation": len((replay_transcript or "").strip())
        > int(conv_cap),
-        "baseline_truncated_for_compare": len((baseline_transcript or "").strip())
-        > int(compare_cap_each),
-        "replay_truncated_for_compare": len((replay_transcript or "").strip())
-        > int(compare_cap_each),
+        "baseline_truncated_for_compare": baseline_cmp_trunc,
+        "replay_truncated_for_compare": replay_cmp_trunc,
    }
+    evidence_quality = _evidence_quality(truncation)

    if not replay_judge:
        return {
            "fixture_filename": fixture_filename,
            "mode": "single",
            "truncation": truncation,
+            "evidence_quality": evidence_quality,
            "gate": {
                "status": "insufficient_data",
                "reasons": ["缺少回放整体评分，无法判断是否追平或超过 A。"],
@@ -86,9 +126,12 @@ def build_conversation_compare_summary(
            "mode": "single",
            "replay_total": _round(replay_judge.total_score),
            "truncation": truncation,
+            "evidence_quality": evidence_quality,
            "gate": {
                "status": "single_side_only",
-                "reasons": ["当前只有新对话单侧评分，可用于优化，但不能判定是否超过 A。"],
+                "reasons": [
+                    "当前只有新对话单侧评分，可用于优化，但不能判定是否超过 A。"
+                ],
            },
        }

@@ -150,8 +193,20 @@ def build_conversation_compare_summary(
        reasons.append(f"关键回落维度：{'、'.join(key_regressions[:4])}。")
    if key_gains:
        reasons.append(f"关键提升维度：{'、'.join(key_gains[:4])}。")
-    if truncation["baseline_truncated_for_compare"] or truncation["replay_truncated_for_compare"]:
-        reasons.append("A/B 对比稿使用了截断 transcript，长对话结论需结合逐轮评分复核。")
+    if (
+        truncation["baseline_truncated_for_compare"]
+        or truncation["replay_truncated_for_compare"]
+    ):
+        reasons.append(
+            "A/B 对比稿使用了截断 transcript，长对话结论需结合逐轮评分复核。"
+        )
+    if (
+        truncation["baseline_truncated_for_conversation"]
+        or truncation["replay_truncated_for_conversation"]
+    ):
+        reasons.append(
+            "整段评分可能仅见 transcript 前缀；长程维度已在评审边界下保守处理，请结合逐轮分或全文重跑交叉验证。"
+        )

    return {
        "fixture_filename": fixture_filename,
@@ -165,6 +220,7 @@ def build_conversation_compare_summary(
        "key_gains": key_gains,
        "repeat_issue_detected": has_repeat_regression,
        "truncation": truncation,
+        "evidence_quality": evidence_quality,
        "gate": {
            "status": status,
            "parity_passed": parity_passed,
@@ -173,4 +229,3 @@ def build_conversation_compare_summary(
            "golden_set_note": "建议在固定黄金样本集上复跑该口径，再决定是否发布。",
        },
    }
-
--- a/api/app/features/evaluation/eval_trace_format.py
+++ b/api/app/features/evaluation/eval_trace_format.py
@@ -59,9 +59,8 @@ def build_segment_transcript(
        user_txt = (seg.user_input_text or "").strip()
        ai_txt = (ai_by_segment.get(uid) or seg.agent_response or "").strip()
        id_extra = _segment_message_id_header(seg)
-        head = (
-            f"### Segment {i} · id={uid} · conversation={seg.conversation_id}"
-            + (f" · {id_extra}" if id_extra else "")
+        head = f"### Segment {i} · id={uid} · conversation={seg.conversation_id}" + (
+            f" · {id_extra}" if id_extra else ""
        )
        body_u = f"用户: {user_txt}" if user_txt else "用户: （空）"
        body_a = f"AI: {ai_txt}" if ai_txt else "AI: （无日志/无 agent_response）"
--- a/api/app/features/evaluation/eval_trace_repo.py
+++ b/api/app/features/evaluation/eval_trace_repo.py
@@ -296,9 +296,9 @@ async def load_summaries_by_ids(
    return list(result.scalars().all())


-def story_link_ids_by_type(links: list[StoryEvidenceLink]) -> tuple[
-    list[str], list[str], list[str], list[str]
-]:
+def story_link_ids_by_type(
+    links: list[StoryEvidenceLink],
+) -> tuple[list[str], list[str], list[str], list[str]]:
    chunks: list[str] = []
    facts: list[str] = []
    timelines: list[str] = []
--- a/api/app/features/evaluation/eval_trace_service.py
+++ b/api/app/features/evaluation/eval_trace_service.py
@@ -121,7 +121,9 @@ class EvalTraceService:
            return "partial"
        return "fallback"

-    async def build_chapter_bundle(self, user_id: str, chapter: Chapter) -> ChapterEvidenceBundle:
+    async def build_chapter_bundle(
+        self, user_id: str, chapter: Chapter
+    ) -> ChapterEvidenceBundle:
        notes: list[str] = []
        live_segment_ids = normalize_source_segment_ids(
            getattr(chapter, "source_segments", None)
@@ -130,7 +132,15 @@ class EvalTraceService:
        row = getattr(chapter, "current_evidence_snapshot", None)
        row_has_closure = bool(
            (row and (row.segment_ids or []))
-            or (row and (row.memory_chunk_ids or row.memory_fact_ids or row.timeline_event_ids or row.summary_ids))
+            or (
+                row
+                and (
+                    row.memory_chunk_ids
+                    or row.memory_fact_ids
+                    or row.timeline_event_ids
+                    or row.summary_ids
+                )
+            )
        )
        if (
            row is not None
@@ -139,19 +149,13 @@ class EvalTraceService:
            and int(row.schema_version or 0) == EVIDENCE_SNAPSHOT_SCHEMA_VERSION
            and row_has_closure
        ):
-            segment_ids = [
-                str(x) for x in (row.segment_ids or []) if str(x).strip()
-            ]
+            segment_ids = [str(x) for x in (row.segment_ids or []) if str(x).strip()]
            conv_ids = sorted(
                {str(x) for x in (row.conversation_ids or []) if str(x).strip()}
            )
-            chunk_ids = [
-                str(x) for x in (row.memory_chunk_ids or []) if str(x).strip()
-            ]
+            chunk_ids = [str(x) for x in (row.memory_chunk_ids or []) if str(x).strip()]
            fact_ids = [str(x) for x in (row.memory_fact_ids or []) if str(x).strip()]
-            tl_ids = [
-                str(x) for x in (row.timeline_event_ids or []) if str(x).strip()
-            ]
+            tl_ids = [str(x) for x in (row.timeline_event_ids or []) if str(x).strip()]
            sum_ids = [str(x) for x in (row.summary_ids or []) if str(x).strip()]
            notes.extend([str(x) for x in (row.notes or []) if x])
            notes.append("evidence_from_chapter_evidence_snapshot_table")
@@ -163,7 +167,9 @@ class EvalTraceService:
                sum_ids=sum_ids,
            )
            if live_segment_ids and set(live_segment_ids) != set(segment_ids):
-                notes.append("live_source_segments_differ_from_snapshot_reconcile_in_pipeline")
+                notes.append(
+                    "live_source_segments_differ_from_snapshot_reconcile_in_pipeline"
+                )
            dlg = getattr(row, "message_lineage_json", None)
            return ChapterEvidenceBundle(
                user_id=user_id,
@@ -202,14 +208,24 @@ class EvalTraceService:
        )

        if use_snap and isinstance(snap, dict):
-            segment_ids = [str(x) for x in (snap.get("segment_ids") or []) if str(x).strip()]
+            segment_ids = [
+                str(x) for x in (snap.get("segment_ids") or []) if str(x).strip()
+            ]
            conv_ids = sorted(
                {str(x) for x in (snap.get("conversation_ids") or []) if str(x).strip()}
            )
-            chunk_ids = [str(x) for x in (snap.get("memory_chunk_ids") or []) if str(x).strip()]
-            fact_ids = [str(x) for x in (snap.get("memory_fact_ids") or []) if str(x).strip()]
-            tl_ids = [str(x) for x in (snap.get("timeline_event_ids") or []) if str(x).strip()]
-            sum_ids = [str(x) for x in (snap.get("summary_ids") or []) if str(x).strip()]
+            chunk_ids = [
+                str(x) for x in (snap.get("memory_chunk_ids") or []) if str(x).strip()
+            ]
+            fact_ids = [
+                str(x) for x in (snap.get("memory_fact_ids") or []) if str(x).strip()
+            ]
+            tl_ids = [
+                str(x) for x in (snap.get("timeline_event_ids") or []) if str(x).strip()
+            ]
+            sum_ids = [
+                str(x) for x in (snap.get("summary_ids") or []) if str(x).strip()
+            ]
            notes.extend([str(x) for x in (snap.get("notes") or []) if x])
            notes.append("evidence_from_chapter_evidence_bundle_json_column")
            tier = self._chapter_closure_tier(
@@ -220,8 +236,12 @@ class EvalTraceService:
                sum_ids=sum_ids,
            )
            if live_segment_ids and set(live_segment_ids) != set(segment_ids):
-                notes.append("live_source_segments_differ_from_snapshot_reconcile_in_pipeline")
-            snap_dlg = snap.get("message_lineage_json") if isinstance(snap, dict) else None
+                notes.append(
+                    "live_source_segments_differ_from_snapshot_reconcile_in_pipeline"
+                )
+            snap_dlg = (
+                snap.get("message_lineage_json") if isinstance(snap, dict) else None
+            )
            return ChapterEvidenceBundle(
                user_id=user_id,
                chapter_id=str(chapter.id),
@@ -256,8 +276,15 @@ class EvalTraceService:
        resolved_seg_ids = [s.id for s in segments] or segment_ids
        if len(segments) < len(segment_ids):
            notes.append("some_segments_missing_or_foreign_user")
-        conv_ids = sorted({str(s.conversation_id) for s in segments if s.conversation_id})
-        chunk_ids, fact_ids, tl_ids, sum_ids = await fetch_memory_closure_for_conversations(
+        conv_ids = sorted(
+            {str(s.conversation_id) for s in segments if s.conversation_id}
+        )
+        (
+            chunk_ids,
+            fact_ids,
+            tl_ids,
+            sum_ids,
+        ) = await fetch_memory_closure_for_conversations(
            self._db, user_id=user_id, conversation_ids=conv_ids
        )
        tier = self._chapter_closure_tier(
@@ -339,8 +366,12 @@ class EvalTraceService:
        )
        return formatted, bundle

-    async def build_story_bundle(self, user_id: str, story_id: str) -> StoryEvidenceBundle:
-        st = await get_story_for_eval_trace(self._db, user_id=user_id, story_id=story_id)
+    async def build_story_bundle(
+        self, user_id: str, story_id: str
+    ) -> StoryEvidenceBundle:
+        st = await get_story_for_eval_trace(
+            self._db, user_id=user_id, story_id=story_id
+        )
        if not st:
            return StoryEvidenceBundle(
                user_id=user_id,
@@ -378,7 +409,9 @@ class EvalTraceService:
            segments = await fetch_segments_for_user(
                self._db, user_id=user_id, segment_ids=dedup_seg
            )
-            conv_ids = sorted({str(s.conversation_id) for s in segments if s.conversation_id})
+            conv_ids = sorted(
+                {str(s.conversation_id) for s in segments if s.conversation_id}
+            )
            if dedup_seg and not segments:
                notes.append("chapter_segment_ids_unresolved")
            if conv_ids:
@@ -428,11 +461,16 @@ class EvalTraceService:
            segments = await fetch_segments_for_user(
                self._db, user_id=user_id, segment_ids=dedup_seg
            )
-            conv_ids = sorted({str(s.conversation_id) for s in segments if s.conversation_id})
-            chunk_ids, fact_ids, tl_ids, sum_ids = (
-                await fetch_memory_closure_for_conversations(
-                    self._db, user_id=user_id, conversation_ids=conv_ids
-                )
+            conv_ids = sorted(
+                {str(s.conversation_id) for s in segments if s.conversation_id}
+            )
+            (
+                chunk_ids,
+                fact_ids,
+                tl_ids,
+                sum_ids,
+            ) = await fetch_memory_closure_for_conversations(
+                self._db, user_id=user_id, conversation_ids=conv_ids
            )
            notes.append("fallback_lineage_no_story_evidence_links")
            notes.append("augmented_with_chapter_context")
--- a/api/app/features/evaluation/judge_manual_service.py
+++ b/api/app/features/evaluation/judge_manual_service.py
@@ -27,7 +27,7 @@ from app.features.evaluation.eval_trace_service import EvalTraceService
 from app.features.evaluation.judge_schemas import ConversationJudgeOutput
 from app.features.evaluation.judge_service import (
    EvalJudgeService,
-    eval_judge_compare_transcript_each_max_chars_for_context,
+    eval_judge_compare_bundle_caps,
    eval_judge_conversation_transcript_max_chars_for_context,
 )
 from app.features.evaluation.schemas import MemoirSectionBaselineOut
@@ -234,6 +234,7 @@ class EvalJudgeManualService:
                f"replay_glm5_failed: {replay_result.error or 'unknown error'}"
            )

+        _cmp_total, _cmp_per_side = eval_judge_compare_bundle_caps(judge._ctx_tokens)
        bundle: dict[str, Any] = {
            "version": 1,
            "judged_at": datetime.now(timezone.utc).isoformat(),
@@ -250,9 +251,8 @@ class EvalJudgeManualService:
                conv_cap=eval_judge_conversation_transcript_max_chars_for_context(
                    judge._ctx_tokens
                ),
-                compare_cap_each=eval_judge_compare_transcript_each_max_chars_for_context(
-                    judge._ctx_tokens
-                ),
+                compare_cap_total=_cmp_total,
+                compare_per_side_cap=_cmp_per_side,
                fixture_filename=fn,
            ),
            "compare_markdown": "",
@@ -363,6 +363,7 @@ class EvalJudgeManualService:

        acc["options"]["judge_model"] = resolved_model
        acc["fixture_filename"] = fn
+        _sse_cmp_total, _sse_cmp_per = eval_judge_compare_bundle_caps(judge._ctx_tokens)
        persist = True
        try:
            yield {
@@ -435,9 +436,7 @@ class EvalJudgeManualService:
                full_transcript=replay_transcript
            )
            replay_judge = replay_result.output
-            acc["replay_judge"] = (
-                replay_judge.model_dump() if replay_judge else None
-            )
+            acc["replay_judge"] = replay_judge.model_dump() if replay_judge else None
            acc["compare_summary"] = build_conversation_compare_summary(
                baseline_judge=baseline_judge,
                replay_judge=replay_judge,
@@ -446,9 +445,8 @@ class EvalJudgeManualService:
                conv_cap=eval_judge_conversation_transcript_max_chars_for_context(
                    judge._ctx_tokens
                ),
-                compare_cap_each=eval_judge_compare_transcript_each_max_chars_for_context(
-                    judge._ctx_tokens
-                ),
+                compare_cap_total=_sse_cmp_total,
+                compare_per_side_cap=_sse_cmp_per,
                fixture_filename=fn,
            )
            yield {
@@ -532,7 +530,9 @@ class EvalJudgeManualService:

        fn = (fixture_filename or "").strip() or None
        if not fn:
-            raise EvaluationBadRequestError("请选择基线 MD（fixture_filename）后再重试基准分")
+            raise EvaluationBadRequestError(
+                "请选择基线 MD（fixture_filename）后再重试基准分"
+            )

        try:
            turns, _ = read_user_export_fixture(fn)
@@ -568,6 +568,7 @@ class EvalJudgeManualService:
        judge, resolved_model = _make_eval_judge(judge_provider, judge_model)
        if not judge:
            raise EvaluationBadRequestError(_JUDGE_CONFIG_HINT)
+        _rt_cmp_total, _rt_cmp_per = eval_judge_compare_bundle_caps(judge._ctx_tokens)
        baseline_result = await judge.judge_conversation_result(
            full_transcript=baseline_transcript
        )
@@ -590,9 +591,8 @@ class EvalJudgeManualService:
                    conv_cap=eval_judge_conversation_transcript_max_chars_for_context(
                        judge._ctx_tokens
                    ),
-                    compare_cap_each=eval_judge_compare_transcript_each_max_chars_for_context(
-                        judge._ctx_tokens
-                    ),
+                    compare_cap_total=_rt_cmp_total,
+                    compare_per_side_cap=_rt_cmp_per,
                    fixture_filename=fn,
                ),
                "compare_markdown": "",
@@ -619,10 +619,7 @@ class EvalJudgeManualService:
                sse_event="baseline_turn_judge",
            ):
                idx = row.get("turn_index")
-                if (
-                    isinstance(idx, (int, float))
-                    and row.get("judge") is not None
-                ):
+                if isinstance(idx, (int, float)) and row.get("judge") is not None:
                    acc["baseline_turn_judges"][str(int(idx))] = row["judge"]

        acc["compare_markdown"] = ""
@@ -634,9 +631,8 @@ class EvalJudgeManualService:
            conv_cap=eval_judge_conversation_transcript_max_chars_for_context(
                judge._ctx_tokens
            ),
-            compare_cap_each=eval_judge_compare_transcript_each_max_chars_for_context(
-                judge._ctx_tokens
-            ),
+            compare_cap_total=_rt_cmp_total,
+            compare_per_side_cap=_rt_cmp_per,
            fixture_filename=fn,
        )
        async for piece in judge.stream_conversation_compare(
@@ -682,7 +678,10 @@ class EvalJudgeManualService:
        trace_svc = EvalTraceService(self._db)

        def _chapter_evidence_notes(
-            lineage_tier: str, evidence_summary: str, truncated: bool, dropped: list[str]
+            lineage_tier: str,
+            evidence_summary: str,
+            truncated: bool,
+            dropped: list[str],
        ) -> str:
            drops = ",".join(dropped[:12]) if dropped else ""
            return (
--- a/api/app/features/evaluation/judge_schemas.py
+++ b/api/app/features/evaluation/judge_schemas.py
@@ -118,7 +118,9 @@ class TurnJudgeOutput(BaseModel):

    @model_validator(mode="after")
    def _cap_meta_fields_and_sync_totals(self) -> Self:
-        def _cap_str_list(xs: list[str], *, max_items: int, max_chars: int) -> list[str]:
+        def _cap_str_list(
+            xs: list[str], *, max_items: int, max_chars: int
+        ) -> list[str]:
            out: list[str] = []
            for x in xs[:max_items]:
                s = str(x).strip()
@@ -257,7 +259,9 @@ class MemoirJudgeOutput(BaseModel):

    @model_validator(mode="after")
    def _cap_meta_fields_and_sync_totals(self) -> Self:
-        def _cap_str_list(xs: list[str], *, max_items: int, max_chars: int) -> list[str]:
+        def _cap_str_list(
+            xs: list[str], *, max_items: int, max_chars: int
+        ) -> list[str]:
            out: list[str] = []
            for x in xs[:max_items]:
                s = str(x).strip()
--- a/api/app/features/evaluation/judge_service.py
+++ b/api/app/features/evaluation/judge_service.py
@@ -90,26 +90,127 @@ def eval_judge_turn_prior_transcript_max_chars_for_context(


 def eval_judge_compare_transcript_each_max_chars() -> int:
-    """A/B 两段 transcript 同 prompt 时，每条 transcript 的上限（默认 GLM 上下文）。"""
-    if settings.eval_judge_max_compare_transcript_chars_each > 0:
-        return settings.eval_judge_max_compare_transcript_chars_each
-    pool = (
-        _eval_judge_prompt_char_pool()
-        - settings.eval_judge_compare_prompt_overhead_chars
+    """单侧对称参考上限（默认与 settings.eval_judge_context_window_tokens 一致）。"""
+    return eval_judge_compare_transcript_each_max_chars_for_context(
+        settings.eval_judge_context_window_tokens
    )
-    return max(1, pool // 2)
+
+
+def eval_judge_compare_transcript_pair_total_budget_for_context(
+    context_window_tokens: int,
+) -> int:
+    """Return the total char budget for both A/B transcripts in one compare prompt."""
+    if (fixed_each := settings.eval_judge_max_compare_transcript_chars_each) > 0:
+        return max(1, 2 * int(fixed_each))  # explicit per-side cap: pair gets twice that
+    prompt_pool = _eval_judge_prompt_char_pool_for_context(context_window_tokens)
+    return max(1, prompt_pool - int(settings.eval_judge_compare_prompt_overhead_chars))


 def eval_judge_compare_transcript_each_max_chars_for_context(
    context_window_tokens: int,
 ) -> int:
+    """单侧对称上限的参考值（auto 模式下约为合计预算的一半；供兼容与展示）。"""
    if settings.eval_judge_max_compare_transcript_chars_each > 0:
-        return settings.eval_judge_max_compare_transcript_chars_each
-    pool = (
-        _eval_judge_prompt_char_pool_for_context(context_window_tokens)
-        - settings.eval_judge_compare_prompt_overhead_chars
+        return int(settings.eval_judge_max_compare_transcript_chars_each)
+    total = eval_judge_compare_transcript_pair_total_budget_for_context(
+        context_window_tokens
    )
-    return max(1, pool // 2)
+    return max(1, total // 2)
+
+
+def eval_judge_compare_bundle_caps(
+    context_window_tokens: int,
+) -> tuple[int, int | None]:
+    """Return ``(total_cap, per_side_cap)``; the second item is None in auto mode."""
+    side_cap = int(settings.eval_judge_max_compare_transcript_chars_each or 0)
+    if side_cap <= 0:  # no explicit per-side cap: fall back to the shared pair budget
+        pair_budget = eval_judge_compare_transcript_pair_total_budget_for_context
+        return pair_budget(context_window_tokens), None
+    # An explicit per-side cap is configured: the pair budget is twice that cap.
+    return max(1, 2 * side_cap), side_cap
+
+
+def trim_compare_transcript_pair(
+    baseline: str,
+    replay: str,
+    *,
+    total_max_chars: int,
+    per_side_max_chars: int | None = None,
+) -> tuple[str, str, bool, bool]:
+    """Trim the A/B transcripts so both fit into one compare prompt.

+    Both inputs are stripped first. With a positive ``per_side_max_chars`` each
+    side is cut to that hard cap independently and ``total_max_chars`` is
+    ignored. Otherwise the pair shares ``total_max_chars`` (floored at 1):
+    nothing is cut when the combined length fits; when it does not, characters
+    are removed from the tail of whichever side is currently longer (baseline
+    first on a tie) until the combined length equals the budget.

+    Args:
+        baseline: Transcript A; ``None`` or empty is treated as "".
+        replay: Transcript B; ``None`` or empty is treated as "".
+        total_max_chars: Combined character budget for both transcripts.
+        per_side_max_chars: Optional hard cap applied to each side separately.

+    Returns:
+        ``(baseline_text, replay_text, baseline_truncated, replay_truncated)``.
+    """
+    b = (baseline or "").strip()
+    r = (replay or "").strip()
+    if per_side_max_chars is not None and int(per_side_max_chars) > 0:
+        cap = int(per_side_max_chars)
+        return b[:cap], r[:cap], len(b) > cap, len(r) > cap

+    cap_total = max(1, int(total_max_chars))
+    if len(b) + len(r) <= cap_total:
+        return b, r, False, False

+    # Closed form of "drop one trailing character from the currently longer
+    # side (baseline on a tie) until the pair fits": the baseline keeps the
+    # larger of an even half (rounded down) and whatever the replay leaves
+    # unused, but never more than it has; the replay gets the remainder.
+    # Slicing once avoids re-copying both strings per dropped character.
+    keep_b = min(len(b), max(cap_total // 2, cap_total - len(r)))
+    keep_r = cap_total - keep_b
+    return b[:keep_b], r[:keep_r], len(b) > keep_b, len(r) > keep_r
+
+
+_CONV_JUDGE_TRANSCRIPT_TRUNCATION_TAIL = (  # appended to a cut whole-dialogue excerpt; {n} = chars kept
+    "\n\n【评审边界——输入已为截断稿】\n"
+    "以上仅为全文前 {n} 个字符，其后未提供给模型。"
+    "对依赖长程多轮轨迹的细项（尤其 context_memory、interview_structure、跨轮重复盘问）"
+    "必须保守给分（倾向区间中低），并在 insufficient_evidence 写明「输入为截断稿，长程证据不足」；"
+    "不得臆断未展示轮次中的行为；confidence 须显著降低；禁止因未见问题而默认高分或推断后半段无缺陷。\n"
+)

+_TURN_PRIOR_TRUNCATION_TAIL = (  # appended to a cut "up to previous turn" excerpt; {n} = chars kept
+    "\n\n【评审边界——上文节选已截断】\n"
+    "「截至上一轮」节选可能仅为更长对话的前 {n} 字；跨轮重复、长程结构若无法从节选核实，"
+    "须在 insufficient_evidence 说明，并对相关细项保守给分。\n"
+)

+_COMPARE_STREAM_PAIR_TRUNCATION_NOTE = (  # inserted into the A/B prompt when either side was trimmed
+    "\n【评审边界】以下 A/B transcript 至少一侧为截断稿，请仅就**已展示片段**比较；"
+    "不得断言未展示轮次的优劣；涉及跨轮重复盘问等须明确证据范围或说不足以判断。\n"
+)
+
+
+def conversation_judge_transcript_excerpt(full_transcript: str, cap: int) -> str:
+    """Return the stripped transcript; past ``cap`` chars, cut it and add a boundary note."""
+    text = (full_transcript or "").strip()
+    limit = max(0, int(cap))
+    if len(text) > limit:  # too long: keep the prefix and tell the judge it was cut
+        return text[:limit] + _CONV_JUDGE_TRANSCRIPT_TRUNCATION_TAIL.format(n=limit)
+    return text
+
+
+def turn_judge_prior_excerpt(prior_transcript: str, cap: int) -> str:
+    """Return the stripped prior-turns excerpt; past ``cap`` chars, cut it and add a note."""
+    text = (prior_transcript or "").strip()
+    limit = max(0, int(cap))
+    if len(text) > limit:  # too long: keep the prefix and flag the cut for the judge
+        return text[:limit] + _TURN_PRIOR_TRUNCATION_TAIL.format(n=limit)
+    return text


@dataclass(slots=True)
@@ -163,9 +264,7 @@ def _build_memoir_judge_prompt(
            ]
        )
    if struct:
-        sections.extend(
-            ["【结构化记忆证据】", struct[:_MEMOIR_EVIDENCE_MAX], ""]
-        )
+        sections.extend(["【结构化记忆证据】", struct[:_MEMOIR_EVIDENCE_MAX], ""])
    else:
        sections.extend(
            [
@@ -198,14 +297,7 @@ class EvalJudgeService:
        )

    def _turn_prior_cap(self) -> int:
-        return eval_judge_turn_prior_transcript_max_chars_for_context(
-            self._ctx_tokens
-        )
-
-    def _compare_each_cap(self) -> int:
-        return eval_judge_compare_transcript_each_max_chars_for_context(
-            self._ctx_tokens
-        )
+        return eval_judge_turn_prior_transcript_max_chars_for_context(self._ctx_tokens)

    async def judge_turn(
        self,
@@ -223,7 +315,7 @@ class EvalJudgeService:
 【本轮位置】完整对话中当前轮次为 Turn {t + 1}（与下方节选及全量 transcript 的 `[Turn ...]` 编号一致）。evidence_refs.turn_index 请使用该编号。

 【截至上一轮的对话节选】（含 `[Turn k]` 标签）
-{prior_transcript[: self._turn_prior_cap()]}
+{turn_judge_prior_excerpt(prior_transcript, self._turn_prior_cap())}

 【本轮用户】
 {user_utterance[:4000]}
@@ -254,7 +346,7 @@ class EvalJudgeService:
        prompt = f"""{CONV_JUDGE_INSTRUCTIONS}

 【完整对话】（每轮以 `[Turn k]` 开头）
-{full_transcript[: self._conv_transcript_cap()]}
+{conversation_judge_transcript_excerpt(full_transcript, self._conv_transcript_cap())}
 """
        try:
            out = await allm_json_call(
@@ -288,10 +380,15 @@ class EvalJudgeService:
        if not self._llm:
            yield "[错误] 未配置评审模型 API Key（智谱：eval_judge_api_key / zhipu_api_key；DeepSeek：deepseek_api_key）"
            return
-        cap_each = self._compare_each_cap()
+        cap_total, per_side = eval_judge_compare_bundle_caps(self._ctx_tokens)
        cap_single = self._conv_transcript_cap()
-        b_tr = (baseline_transcript or "").strip()[:cap_each]
-        r_tr = (replay_transcript or "").strip()[:cap_each]
+        b_tr, r_tr, b_cmp_trunc, r_cmp_trunc = trim_compare_transcript_pair(
+            baseline_transcript or "",
+            replay_transcript or "",
+            total_max_chars=cap_total,
+            per_side_max_chars=per_side,
+        )
+        compare_pair_truncated = b_cmp_trunc or r_cmp_trunc
        b_json = (
            baseline_judge.model_dump_json(ensure_ascii=False)
            if baseline_judge
@@ -301,14 +398,17 @@ class EvalJudgeService:
            replay_judge.model_dump_json(ensure_ascii=False) if replay_judge else "null"
        )
        if baseline_judge and replay_judge:
-            prompt = f"""你是访谈对话评测专家。下面给出两份完整对话 transcript 及各自的整体打分（JSON）。请用中文直接写正文（不要用 JSON、不要用 Markdown 代码块）：
+            trunc_line = (
+                _COMPARE_STREAM_PAIR_TRUNCATION_NOTE if compare_pair_truncated else ""
+            )
+            prompt = f"""你是访谈对话评测专家。下面给出两份对话 transcript 及各自的整体打分（JSON）。请用中文直接写正文（不要用 JSON、不要用 Markdown 代码块）：

 【A：导出基准对话】（历史快照：用户与当时导出的线上 AI，多轮合并为一篇）
 {b_tr}

 【B：本次回放/新测对话】（用户句与基准对齐，AI 为当前后端重新生成）
 {r_tr}
-
+{trunc_line}
 【A 的整体评分 JSON】
 {b_json}

@@ -322,7 +422,9 @@ class EvalJudgeService:

 笔调简洁、偏执行清单。"""
        elif replay_judge:
-            r_one = (replay_transcript or "").strip()[:cap_single]
+            r_one = conversation_judge_transcript_excerpt(
+                replay_transcript or "", cap_single
+            )
            prompt = f"""{COMPARE_CONV_STREAM_HINT}

 【回放/新测 transcript】
--- a/api/app/features/evaluation/memoir_readiness_service.py
+++ b/api/app/features/evaluation/memoir_readiness_service.py
@@ -45,7 +45,8 @@ class MemoirReadinessService:
        missing = [i for i in ids if i not in found_ids]
        if missing:
            raise EvaluationBadRequestError(
-                "segment not in conversation: " + ", ".join(missing[:5])
+                "segment not in conversation: "
+                + ", ".join(missing[:5])
                + ("…" if len(missing) > 5 else "")
            )

--- a/api/app/features/evaluation/replay_service.py
+++ b/api/app/features/evaluation/replay_service.py
@@ -165,11 +165,7 @@ class ReplayConversationService:
            )
            count += 1

-        if (
-            flush_memoir_after
-            and conv.user_id
-            and (not skip_memoir)
-        ):
+        if flush_memoir_after and conv.user_id and (not skip_memoir):
            await background_runner.flush_pending(conv.user_id)

        logger.info(
--- a/api/app/features/evaluation/router.py
+++ b/api/app/features/evaluation/router.py
@@ -173,9 +173,7 @@ async def get_playground_conversation_judge(
 async def memoir_phase1_ready(
    conversation_id: str,
    _auth: InternalEvalAuth,
-    svc: Annotated[
-        MemoirReadinessService, Depends(get_memoir_readiness_service)
-    ],
+    svc: Annotated[MemoirReadinessService, Depends(get_memoir_readiness_service)],
    segment_ids: Annotated[
        list[str],
        Query(
@@ -202,9 +200,7 @@ async def memoir_phase1_ready(
 async def memoir_submit_phase1(
    conversation_id: str,
    _auth: InternalEvalAuth,
-    svc: Annotated[
-        MemoirReadinessService, Depends(get_memoir_readiness_service)
-    ],
+    svc: Annotated[MemoirReadinessService, Depends(get_memoir_readiness_service)],
 ):
    try:
        return await svc.submit_memoir_phase1_for_conversation(
--- a/api/app/features/evaluation/rubrics/conversation_v1.py
+++ b/api/app/features/evaluation/rubrics/conversation_v1.py
@@ -21,8 +21,9 @@ _TURN_SCOPE = """

 _CONV_SCOPE = """
 ## 整段对话评审范围
- 在**完整 transcript**上，对 AI **多轮轨迹**做一次 holistic 评分（仍为同一 15 细项）。
- **聚合规则**：以「整段中**典型表现** + **最严重且反复出现的缺陷**」综合定档；若某维度在多轮中明显滑落，该维不得按最好一轮给满分。
+- 在输入所给的 transcript（含 `[Turn k]`）上，对 AI **多轮轨迹**做一次 holistic 评分（仍为同一 15 细项）。
+- 若正文后出现系统注入的「【评审边界——输入已为截断稿】」说明，则**只据此片段**评分：长程细项须保守，`confidence` 降低，并在 `insufficient_evidence` 声明证据范围；**禁止**臆断未展示轮次、**禁止**因未见缺陷而默认高分。
+- **聚合规则**：以**已展示轮次中**典型表现 + 最严重且反复的缺陷综合定档；若某维度在多轮中明显滑落，该维不得按最好一轮给满分。
 - 维度边界：`context_memory` 负责**重复盘问、前后矛盾追问、忽略已答信息**；`emotion_carry` 负责**情绪是否被接住**（不与采访腔混扣）；`rhythm_control` 负责**采访腔、总结腔、机械流程感**（本轮已承接情绪但仍像审讯，在此项体现）。

 """
--- a/api/app/features/evaluation/transcript_for_judge.py
+++ b/api/app/features/evaluation/transcript_for_judge.py
@@ -31,7 +31,9 @@ def format_export_turns_with_labels(turns: list[tuple[str, str]]) -> str:
    return "\n\n".join(parts)


-def pair_session_messages_to_turns(messages: list[_MessageLike] | list[Any]) -> list[tuple[str, str]]:
+def pair_session_messages_to_turns(
+    messages: list[_MessageLike] | list[Any],
+) -> list[tuple[str, str]]:
    """将对话消息序列为 (user, assistant) 轮次列表，语义与 `format_session_messages_with_turn_labels` 一致。

    末尾仅有 human、无紧随 assistant 时，补一轮 (user, "") 供 UI 与评审对齐。
@@ -56,7 +58,9 @@ def pair_session_messages_to_turns(messages: list[_MessageLike] | list[Any]) ->
    return out


-def format_session_messages_with_turn_labels(messages: list[_MessageLike] | list[Any]) -> str:
+def format_session_messages_with_turn_labels(
+    messages: list[_MessageLike] | list[Any],
+) -> str:
    """会话消息序列：按出现顺序将相邻 human→assistant 合并为一轮。"""
    blocks: list[str] = []
    turn_idx = 0
--- a/api/app/features/memoir/chapter_evidence_snapshot.py
+++ b/api/app/features/memoir/chapter_evidence_snapshot.py
@@ -195,7 +195,9 @@ def refresh_chapter_evidence_snapshot_sync(session: Session, chapter_id: str) ->
        chapter_id=str(ch.id),
        user_id=str(ch.user_id),
        version_no=next_v,
-        schema_version=int(payload.get("schema_version") or EVIDENCE_SNAPSHOT_SCHEMA_VERSION),
+        schema_version=int(
+            payload.get("schema_version") or EVIDENCE_SNAPSHOT_SCHEMA_VERSION
+        ),
        segment_ids=list(payload.get("segment_ids") or []),
        conversation_ids=list(payload.get("conversation_ids") or []),
        story_ids=list(payload.get("story_ids") or []),
@@ -209,7 +211,9 @@ def refresh_chapter_evidence_snapshot_sync(session: Session, chapter_id: str) ->
    )
    session.add(snap)
    session.flush()
-    _replace_chapter_evidence_links_sync(session, chapter_id=str(ch.id), payload=payload)
+    _replace_chapter_evidence_links_sync(
+        session, chapter_id=str(ch.id), payload=payload
+    )
    ch.current_evidence_snapshot_id = snap.id
    ch.evidence_bundle_json = payload
    if payload.get("message_lineage_json") is not None:
--- a/api/app/features/memoir/story_pipeline_sync.py
+++ b/api/app/features/memoir/story_pipeline_sync.py
@@ -98,7 +98,9 @@ def _dialogue_lineage_dict_for_segment_ids(
    )


-def _evidence_link_ids(evidence: dict) -> tuple[list[str], list[str], list[str], list[str]]:
+def _evidence_link_ids(
+    evidence: dict,
+) -> tuple[list[str], list[str], list[str], list[str]]:
    """从 retrieve_evidence_sync 结果提取稳定 ID 列表。"""
    chunks: list[str] = []
    for c in evidence.get("relevant_chunks") or []:
@@ -661,9 +663,7 @@ def _resolve_append_target(
        and len(oral_norm)
        <= int(settings.memoir_story_route_append_guardrail_oral_chars)
    ):
-        tid_g = default_append_target_story_id(
-            candidate_stories, story_meta, settings
-        )
+        tid_g = default_append_target_story_id(candidate_stories, story_meta, settings)
        if tid_g:
            st = session.get(Story, tid_g)
            if st and st.user_id == user_id:
@@ -880,17 +880,19 @@ def _run_batch_plan_writes(
        unit_text = _ordered_text_for_segment_ids(category_segments, unit.segment_ids)
        oral_unit = normalize_oral_for_memoir(unit_text, llm=llm)

-        target_story_id, existing_for_narrative, decision_source = _resolve_append_target(
-            session,
-            route_decision=unit.decision,
-            route_target_story_id=unit.target_story_id,
-            user_id=user_id,
-            chapter_category=chapter_category,
-            oral_norm=(oral_unit or "").strip(),
-            candidate_stories=candidate_stories,
-            story_meta=story_meta,
-            decision_source="batch_plan",
-            memoir_correlation_id=memoir_correlation_id,
+        target_story_id, existing_for_narrative, decision_source = (
+            _resolve_append_target(
+                session,
+                route_decision=unit.decision,
+                route_target_story_id=unit.target_story_id,
+                user_id=user_id,
+                chapter_category=chapter_category,
+                oral_norm=(oral_unit or "").strip(),
+                candidate_stories=candidate_stories,
+                story_meta=story_meta,
+                decision_source="batch_plan",
+                memoir_correlation_id=memoir_correlation_id,
+            )
        )

        sid, _ = _execute_narrative_unit(
@@ -1104,17 +1106,19 @@ def run_story_pipeline_for_category_batch(
        )

        decision_source = "fallback_no_llm" if not llm else "single_decide"
-        target_story_id, existing_for_narrative, decision_source = _resolve_append_target(
-            session,
-            route_decision=route.decision,
-            route_target_story_id=route.target_story_id,
-            user_id=user_id,
-            chapter_category=chapter_category,
-            oral_norm=om_norm,
-            candidate_stories=candidates,
-            story_meta=story_meta,
-            decision_source=decision_source,
-            memoir_correlation_id=memoir_correlation_id,
+        target_story_id, existing_for_narrative, decision_source = (
+            _resolve_append_target(
+                session,
+                route_decision=route.decision,
+                route_target_story_id=route.target_story_id,
+                user_id=user_id,
+                chapter_category=chapter_category,
+                oral_norm=om_norm,
+                candidate_stories=candidates,
+                story_meta=story_meta,
+                decision_source=decision_source,
+                memoir_correlation_id=memoir_correlation_id,
+            )
        )

        sid, _ = _execute_narrative_unit(
--- a/api/app/features/memory/chunker.py
+++ b/api/app/features/memory/chunker.py
@@ -1,7 +1,6 @@
 """Transcript chunker — split raw text into retrieval-ready chunks."""


-
 def chunk_transcript(
    text: str, *, max_chars: int = 800, overlap_chars: int = 100
 ) -> list[str]:
--- a/api/app/features/memory/service.py
+++ b/api/app/features/memory/service.py
@@ -21,7 +21,9 @@ from app.features.memory.repo import (
    set_memory_fact_status,
    update_chunk_embedding,
 )
-from app.features.conversation.lineage_schemas import primary_user_message_id_from_lineage
+from app.features.conversation.lineage_schemas import (
+    primary_user_message_id_from_lineage,
+)
 from app.features.memory.schemas import EvidenceBundle
 from app.ports.embedding import EmbeddingProvider

@@ -55,9 +57,7 @@ class MemoryService:
            raise ValueError("transcript cannot be empty")

        primary_mid = (
-            primary_user_message_id_from_lineage(lineage_json)
-            if lineage_json
-            else None
+            primary_user_message_id_from_lineage(lineage_json) if lineage_json else None
        )
        source = await create_source(
            self._db,
--- a/api/app/features/tasks/deps.py
+++ b/api/app/features/tasks/deps.py
@@ -1,6 +1,5 @@
 """Tasks feature 依赖：提供 get_tasks_service。"""

-
 from app.features.tasks.service import TasksService