feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh wraps it with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching :8001 when :8000 is already up; documented in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks, execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id for Memoir; Memoir chapter baseline↔DB row compare with title heuristics; Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI; react-markdown; development proxy and fixture updates.
2026-04-07 17:15:01 +08:00
parent a50b72e7b5
commit 99543d04c6
47 changed files with 4968 additions and 1279 deletions
--- a/api/app/features/evaluation/execution_service.py
+++ b/api/app/features/evaluation/execution_service.py
@@ -20,6 +20,10 @@ from app.features.evaluation.gate_report_service import gate_result_to_details
 from app.features.evaluation.gating_service import compute_gate
 from app.features.evaluation.judge_service import EvalJudgeService
 from app.features.evaluation.models import EvalCase, EvalRun, EvalVersion
+from app.features.evaluation.transcript_for_judge import (
+    assistant_text_for_eval_display,
+    format_eval_turn_block,
+)

 logger = get_logger(__name__)

@@ -39,13 +43,23 @@ def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) ->

 def _composite(
    conv: float | None, mem: float | None, weights: dict[str, Any] | None
-) -> float:
+) -> float | None:
+    """合成总分；缺失的一侧不计为 0，避免把评审失败误标为极差。
+
+    仅一侧有分：返回该侧原始分（不乘权重），表示当前 run 仅完成了部分评审维度。
+    """
    w = weights or {}
    wc = float(w.get("conversation", 0.5))
    wm = float(w.get("memoir", 0.5))
-    c = float(conv or 0)
-    m = float(mem or 0)
-    return wc * c + wm * m
+    has_c = conv is not None
+    has_m = mem is not None
+    if not has_c and not has_m:
+        return None
+    if has_c and has_m:
+        return float(wc) * float(conv) + float(wm) * float(mem)
+    if has_c:
+        return float(conv)
+    return float(mem)


 def _utterances_for_case(case: EvalCase) -> list[str]:
@@ -53,11 +67,6 @@ def _utterances_for_case(case: EvalCase) -> list[str]:
    return [str(u).strip() for u in raw if str(u).strip()]


-def _assistant_text_for_eval_display(raw: str) -> str:
-    """评审与 transcript 展示：避免字面量 [SPLIT] 干扰 judge 阅读。"""
-    return (raw or "").replace("[SPLIT]", "\n")
-
-
 def _trim_evidence_text(text: str, max_chars: int = _MAX_EVIDENCE_TRANSCRIPT_CHARS) -> str:
    s = (text or "").strip()
    if len(s) <= max_chars:
@@ -72,7 +81,7 @@ def _dialogue_transcript_from_pairs(pairs: list[tuple[str, str]]) -> str:
        if not body:
            continue
        label = "用户" if role == "human" else "AI"
-        out = _assistant_text_for_eval_display(body) if role != "human" else body
+        out = assistant_text_for_eval_display(body) if role != "human" else body
        parts.append(f"{label}: {out}")
    return "\n\n".join(parts)

@@ -179,18 +188,22 @@ async def execute_eval_run(
        if i >= len(replies):
            break
        transcript_parts.append(
-            f"用户: {u}\nAI: {_assistant_text_for_eval_display(replies[i])}"
+            format_eval_turn_block(i, u, assistant_text_for_eval_display(replies[i]))
        )
-    prior = ""
+    prior_blocks: list[str] = []
    for idx, u in enumerate(utterances):
        if idx >= len(replies):
            break
-        reply = _assistant_text_for_eval_display(replies[idx])
+        reply = assistant_text_for_eval_display(replies[idx])
        lat = latencies[idx] if idx < len(latencies) else None
+        prior = "\n\n".join(prior_blocks)
+        if len(prior) > 8000:
+            prior = prior[-8000:]
        tj = await judge.judge_turn(
            prior_transcript=prior,
            user_utterance=u,
            assistant_reply=reply,
+            turn_index_0=idx,
        )
        scores = tj.model_dump() if tj else None
        rationale = tj.rationale if tj else None
@@ -205,7 +218,7 @@ async def execute_eval_run(
            judge_rationale=rationale,
        )
        await db.commit()
-        prior = (prior + f"\n用户: {u}\nAI: {reply}")[-8000:]
+        prior_blocks.append(format_eval_turn_block(idx, u, reply))

    full_transcript = "\n\n".join(transcript_parts)
    conv_out = await judge.judge_conversation(full_transcript=full_transcript)
@@ -261,6 +274,7 @@ async def execute_eval_run(
                    reference_memoir_markdown=reference_memoir,
                    evidence_notes=(
                        "这是用户现有章节的严格评审；真实性、覆盖率、可追溯性必须对照原始访谈证据。"
+                        " 评审范围：单章节节选；跨全书连贯性仅在与证据一致时评估，否则保守打分并在 insufficient_evidence 说明。"
                    ),
                )
                chapter_entries.append(
@@ -287,6 +301,7 @@ async def execute_eval_run(
                    reference_memoir_markdown=reference_memoir,
                    evidence_notes=(
                        "这是用户现有故事的严格评审；真实性、覆盖率、可追溯性必须对照原始访谈证据。"
+                        " 评审范围：单故事节选；跨篇章关联若证据不足须保守并在 insufficient_evidence 说明。"
                    ),
                )
                story_entries.append(
@@ -300,18 +315,31 @@ async def execute_eval_run(
        except Exception as e:
            logger.warning("eval story judges skipped: {}", e)

-    mem_parts: list[float] = []
+    synth_scores: list[float] = []
    if mem_out is not None:
-        mem_parts.append(float(mem_out.total_score))
+        synth_scores.append(float(mem_out.total_score))
+
+    library_scores: list[float] = []
    for row in chapter_entries:
        j = row.get("judge")
        if isinstance(j, dict) and j.get("total_score") is not None:
-            mem_parts.append(float(j["total_score"]))
+            library_scores.append(float(j["total_score"]))
    for row in story_entries:
        j = row.get("judge")
        if isinstance(j, dict) and j.get("total_score") is not None:
-            mem_parts.append(float(j["total_score"]))
-    mem_total = sum(mem_parts) / len(mem_parts) if mem_parts else None
+            library_scores.append(float(j["total_score"]))
+
+    def _mean(xs: list[float]) -> float:
+        return sum(xs) / len(xs) if xs else 0.0
+
+    if synth_scores and library_scores:
+        mem_total = 0.5 * _mean(synth_scores) + 0.5 * _mean(library_scores)
+    elif synth_scores:
+        mem_total = _mean(synth_scores)
+    elif library_scores:
+        mem_total = _mean(library_scores)
+    else:
+        mem_total = None

    exp = await eval_repo.get_experiment(db, str(run.experiment_id))
    weights = (
@@ -326,6 +354,17 @@ async def execute_eval_run(
        "memoir_judge": mem_out.model_dump() if mem_out else None,
        "chapters": chapter_entries,
        "stories": story_entries,
+        "judge_meta": {
+            "conversation_judge_ok": conv_out is not None,
+            "memoir_synthetic_ok": mem_out is not None,
+            "memoir_synth_scores_n": len(synth_scores),
+            "memoir_library_scores_n": len(library_scores),
+            "memoir_aggregate_rule": (
+                "synth_plus_library_weighted_mean"
+                if synth_scores and library_scores
+                else ("synthetic_only" if synth_scores else "library_only")
+            ),
+        },
    }
    await eval_repo.update_run(
        db,