feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001 when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks, execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id for Memoir; Memoir chapter baseline↔DB row compare with title heuristics; Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI; react-markdown; development proxy and fixture updates.
2026-04-07 17:15:01 +08:00
parent a50b72e7b5
commit 99543d04c6
47 changed files with 4968 additions and 1279 deletions
--- a/api/app/features/evaluation/judge_manual_service.py
+++ b/api/app/features/evaluation/judge_manual_service.py
@@ -15,8 +15,14 @@ from app.features.evaluation.errors import (
    EvaluationBadRequestError,
    EvaluationNotFoundError,
 )
-from app.features.evaluation.execution_service import _assistant_text_for_eval_display
 from app.features.evaluation.judge_service import EvalJudgeService
+from app.features.evaluation.transcript_for_judge import (
+    assistant_text_for_eval_display,
+    format_eval_turn_block,
+    format_export_turns_with_labels,
+    format_session_messages_with_turn_labels,
+    pair_session_messages_to_turns,
+)
 from app.features.evaluation.schemas import MemoirSectionBaselineOut
 from app.features.evaluation.session_catalog_service import SessionCatalogService
 from app.features.evaluation.user_export_fixtures import read_user_export_fixture
@@ -30,6 +36,36 @@ _MAX_EVAL_CHAPTERS = 30
 _MAX_EVAL_STORIES = 40
 _MAX_EVIDENCE_CONVERSATIONS = 8
 _MAX_EVIDENCE_TRANSCRIPT_CHARS = 16_000
+_PRIOR_TRANSCRIPT_MAX_CHARS = 8000
+
+
+async def _iter_turn_judgments_for_turns(
+    judge: EvalJudgeService,
+    turns: list[tuple[str, str]],  # (user utterance, assistant reply) pairs, judged in list order
+    *,
+    sse_event: str,  # copied verbatim into the "event" key of every yielded dict
+) -> AsyncIterator[dict[str, Any]]:
+    """Judge each turn in order, yielding one event dict per turn; per the original note, mirrors the per-turn prior truncation and block accumulation of `execute_eval_run`."""
+    prior_blocks: list[str] = []  # formatted blocks of the turns already judged
+    for idx, (u_raw, ai_raw) in enumerate(turns):
+        u = (u_raw or "").strip()
+        reply = assistant_text_for_eval_display(str(ai_raw))  # NOTE(review): str(None) would become "None", unlike the `or ""` guard on the user side - confirm ai_raw is never None
+        prior = "\n\n".join(prior_blocks)
+        if len(prior) > _PRIOR_TRANSCRIPT_MAX_CHARS:
+            prior = prior[-_PRIOR_TRANSCRIPT_MAX_CHARS:]  # keep only the trailing characters; the cut may land mid-block
+        tj = await judge.judge_turn(
+            prior_transcript=prior,
+            user_utterance=u,
+            assistant_reply=reply,
+            turn_index_0=idx,
+        )
+        yield {
+            "event": sse_event,
+            "turn_index": idx,
+            "ok": tj is not None,
+            "judge": tj.model_dump() if tj else None,  # None when judge_turn returned nothing usable
+        }
+        prior_blocks.append(format_eval_turn_block(idx, u, reply))  # appended after judging, so a turn is never part of its own prior


 def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) -> str:
@@ -39,18 +75,6 @@ def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) ->
    return f"{s[:max_chars]}\n\n…（已截断供评审）"


-def _transcript_from_export_turns(turns: list[tuple[str, str]]) -> str:
-    parts: list[str] = []
-    for u, ai in turns:
-        u = (u or "").strip()
-        ai = (ai or "").strip()
-        if u:
-            parts.append(f"用户: {u}")
-        if ai:
-            parts.append(f"AI: {_assistant_text_for_eval_display(ai)}")
-    return "\n\n".join(parts)
-
-
 def _trim_evidence_text(text: str, max_chars: int = _MAX_EVIDENCE_TRANSCRIPT_CHARS) -> str:
    s = (text or "").strip()
    if len(s) <= max_chars:
@@ -62,16 +86,7 @@ async def _conversation_transcript_for_manual(
    db: AsyncSession, conversation_id: str
 ) -> str:
    rows = await conversation_repo.get_conversation_messages(conversation_id, db)
-    parts: list[str] = []
-    for row in rows:
-        role = (row.role or "").lower()
-        body = (row.content or "").strip()
-        if not body:
-            continue
-        label = "用户" if role == "human" else "AI"
-        out = _assistant_text_for_eval_display(body) if role != "human" else body
-        parts.append(f"{label}: {out}")
-    return "\n\n".join(parts)
+    return format_session_messages_with_turn_labels(rows)


 async def _user_transcript_evidence(db: AsyncSession, user_id: str) -> str:
@@ -125,14 +140,9 @@ class EvalJudgeManualService:
        if not dialogue:
            raise EvaluationNotFoundError("conversation not found")

-        parts: list[str] = []
-        for m in dialogue.messages:
-            r = (m.role or "").lower()
-            label = "用户" if r == "human" else "AI"
-            raw = m.content or ""
-            out = _assistant_text_for_eval_display(raw) if r != "human" else raw
-            parts.append(f"{label}: {out}")
-        replay_transcript = "\n\n".join(parts)
+        replay_transcript = format_session_messages_with_turn_labels(
+            list(dialogue.messages)
+        )
        if not replay_transcript.strip():
            raise EvaluationBadRequestError("no messages to judge")

@@ -141,7 +151,7 @@ class EvalJudgeManualService:
        if fn:
            try:
                turns, _ = read_user_export_fixture(fn)
-                baseline_transcript = _transcript_from_export_turns(turns)
+                baseline_transcript = format_export_turns_with_labels(turns)
            except ValueError as e:
                raise EvaluationBadRequestError(str(e)) from e
            except FileNotFoundError as e:
@@ -152,18 +162,28 @@ class EvalJudgeManualService:
        judge = EvalJudgeService(judge_llm)
        baseline_judge_dict: dict[str, Any] | None = None
        if baseline_transcript.strip():
-            bj = await judge.judge_conversation(full_transcript=baseline_transcript)
+            baseline_result = await judge.judge_conversation_result(
+                full_transcript=baseline_transcript
+            )
+            bj = baseline_result.output
            if bj:
                baseline_judge_dict = bj.model_dump()
            else:
-                errors.append("baseline_glm_failed")
+                errors.append(
+                    f"baseline_glm_failed: {baseline_result.error or 'unknown error'}"
+                )
        elif fn:
            errors.append("baseline_transcript_empty")

-        rj = await judge.judge_conversation(full_transcript=replay_transcript)
+        replay_result = await judge.judge_conversation_result(
+            full_transcript=replay_transcript
+        )
+        rj = replay_result.output
        replay_judge_dict = rj.model_dump() if rj else None
        if not rj:
-            errors.append("replay_glm_failed")
+            errors.append(
+                f"replay_glm_failed: {replay_result.error or 'unknown error'}"
+            )

        return {
            "conversation_id": cid,
@@ -179,8 +199,11 @@ class EvalJudgeManualService:
        self,
        conversation_id: str,
        fixture_filename: str | None,
+        *,
+        include_turn_judges: bool = False,
+        include_baseline_turn_judges: bool = False,
    ) -> AsyncIterator[dict[str, Any]]:
-        """供 SSE：先整体基准分、再整体回放分，再流式对比与建议。"""
+        """供 SSE：先整体基准分、再整体回放分，可选逐轮分，再流式对比与建议。"""
        cid = (conversation_id or "").strip()
        if not cid:
            yield {
@@ -200,24 +223,21 @@ class EvalJudgeManualService:
            }
            return

-        parts: list[str] = []
-        for m in dialogue.messages:
-            r = (m.role or "").lower()
-            label = "用户" if r == "human" else "AI"
-            raw = m.content or ""
-            out = _assistant_text_for_eval_display(raw) if r != "human" else raw
-            parts.append(f"{label}: {out}")
-        replay_transcript = "\n\n".join(parts)
+        replay_transcript = format_session_messages_with_turn_labels(
+            list(dialogue.messages)
+        )
        if not replay_transcript.strip():
            yield {"event": "error", "phase": "load", "message": "no messages to judge"}
            return

        fn = (fixture_filename or "").strip() or None
        baseline_transcript = ""
+        export_turns: list[tuple[str, str]] | None = None
        if fn:
            try:
                turns, _ = read_user_export_fixture(fn)
-                baseline_transcript = _transcript_from_export_turns(turns)
+                export_turns = list(turns)
+                baseline_transcript = format_export_turns_with_labels(turns)
            except ValueError as e:
                yield {"event": "error", "phase": "fixture", "message": str(e)}
                return
@@ -249,9 +269,10 @@ class EvalJudgeManualService:

        baseline_judge = None
        if baseline_transcript.strip():
-            baseline_judge = await judge.judge_conversation(
+            baseline_result = await judge.judge_conversation_result(
                full_transcript=baseline_transcript
            )
+            baseline_judge = baseline_result.output
            yield {
                "event": "baseline_judge",
                "ok": baseline_judge is not None,
@@ -261,8 +282,24 @@ class EvalJudgeManualService:
                yield {
                    "event": "error",
                    "phase": "baseline_glm",
-                    "message": "基准整体打分失败（密钥、限流或 JSON 解析失败，见服务端日志）",
+                    "message": (
+                        f"基准整体打分失败：{baseline_result.error}"
+                        if baseline_result.error
+                        else "基准整体打分失败（密钥、限流或 JSON 解析失败，见服务端日志）"
+                    ),
                }
+            elif (
+                include_baseline_turn_judges
+                and export_turns
+                and baseline_judge is not None
+            ):
+                yield {"event": "meta", "phase": "baseline_turn_judges_start"}
+                async for row in _iter_turn_judgments_for_turns(
+                    judge,
+                    export_turns,
+                    sse_event="baseline_turn_judge",
+                ):
+                    yield row
        else:
            yield {
                "event": "baseline_judge",
@@ -271,7 +308,10 @@ class EvalJudgeManualService:
                "judge": None,
            }

-        replay_judge = await judge.judge_conversation(full_transcript=replay_transcript)
+        replay_result = await judge.judge_conversation_result(
+            full_transcript=replay_transcript
+        )
+        replay_judge = replay_result.output
        yield {
            "event": "replay_judge",
            "ok": replay_judge is not None,
@@ -281,11 +321,26 @@ class EvalJudgeManualService:
            yield {
                "event": "error",
                "phase": "replay_glm",
-                "message": "回放对话整体 GLM 打分失败（空密钥、限流或 JSON 解析失败，见服务端日志）",
+                "message": (
+                    f"回放对话整体 GLM 打分失败：{replay_result.error}"
+                    if replay_result.error
+                    else "回放对话整体 GLM 打分失败（空密钥、限流或 JSON 解析失败，见服务端日志）"
+                ),
            }
            yield {"event": "done"}
            return

+        if include_turn_judges:
+            replay_pairs = pair_session_messages_to_turns(list(dialogue.messages))
+            if replay_pairs:
+                yield {"event": "meta", "phase": "replay_turn_judges_start"}
+                async for row in _iter_turn_judgments_for_turns(
+                    judge,
+                    replay_pairs,
+                    sse_event="replay_turn_judge",
+                ):
+                    yield row
+
        async for piece in judge.stream_conversation_compare(
            baseline_transcript=baseline_transcript,
            replay_transcript=replay_transcript,