feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001 when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks, execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id for Memoir; Memoir chapter baseline↔DB row compare with title heuristics; Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI; react-markdown; development proxy and fixture updates.
2026-04-07 17:15:01 +08:00
parent a50b72e7b5
commit 99543d04c6
47 changed files with 4968 additions and 1279 deletions
--- a/api/app/features/evaluation/transcript_for_judge.py
+++ b/api/app/features/evaluation/transcript_for_judge.py
@@ -0,0 +1,78 @@
+"""评测用对话文本格式化（稳定 Turn 标签、便于评审引用）。"""
+
+from __future__ import annotations
+
+from typing import Any, Protocol
+
+
+class _MessageLike(Protocol):
+    """Structural type for a conversation message: any object exposing ``role`` and ``content``."""
+
+    # Speaker tag. The pairing helpers in this module lower-case it and compare it
+    # against "system", "human", "ai" and "assistant".
+    role: str | None
+    # Message text; may be None, which the helpers in this module treat as empty.
+    content: str | None
+
+
+def assistant_text_for_eval_display(raw: str) -> str:
+    """评审与 transcript 展示：避免字面量 [SPLIT] 干扰 judge 阅读。"""
+    return (raw or "").replace("[SPLIT]", "\n")
+
+
+def format_eval_turn_block(turn_index_0: int, user: str, assistant: str) -> str:
+    """单轮回放/节选：`[Turn k]` 从 1 起计。"""
+    u = (user or "").strip()
+    a = assistant_text_for_eval_display(assistant).strip()
+    k = int(turn_index_0) + 1
+    return f"[Turn {k}]\n用户: {u}\nAI: {a}"
+
+
+def format_export_turns_with_labels(turns: list[tuple[str, str]]) -> str:
+    """用户导出 fixture：每轮 (user, ai)。"""
+    parts: list[str] = []
+    for i, (u, ai) in enumerate(turns):
+        parts.append(format_eval_turn_block(i, u, ai))
+    return "\n\n".join(parts)
+
+
+def pair_session_messages_to_turns(messages: list[_MessageLike] | list[Any]) -> list[tuple[str, str]]:
+    """将对话消息序列为 (user, assistant) 轮次列表，语义与 `format_session_messages_with_turn_labels` 一致。
+
+    末尾仅有 human、无紧随 assistant 时，补一轮 (user, "") 供 UI 与评审对齐。
+    """
+    out: list[tuple[str, str]] = []
+    pending_user: str | None = None
+    for m in messages:
+        r = (getattr(m, "role", None) or "").lower()
+        body = (getattr(m, "content", None) or "").strip()
+        if r == "system":
+            continue
+        if not body and r != "human":
+            continue
+        if r == "human":
+            pending_user = body
+        elif r in ("ai", "assistant"):
+            u = (pending_user or "").strip()
+            pending_user = None
+            out.append((u, body))
+    if pending_user is not None:
+        out.append((pending_user.strip(), ""))
+    return out
+
+
+def format_session_messages_with_turn_labels(messages: list[_MessageLike] | list[Any]) -> str:
+    """会话消息序列：按出现顺序将相邻 human→assistant 合并为一轮。"""
+    blocks: list[str] = []
+    turn_idx = 0
+    pending_user: str | None = None
+    for m in messages:
+        r = (getattr(m, "role", None) or "").lower()
+        body = (getattr(m, "content", None) or "").strip()
+        if not body and r != "human":
+            continue
+        if r == "human":
+            pending_user = body
+        elif r in ("ai", "assistant", "system"):
+            if r == "system":
+                continue
+            u = (pending_user or "").strip()
+            pending_user = None
+            blocks.append(format_eval_turn_block(turn_idx, u, body))
+            turn_idx += 1
+    return "\n\n".join(blocks)