feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps it with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY attaches :8001
  when :8000 is already up; documented in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.
This commit is contained in:
Kevin
2026-04-07 17:15:01 +08:00
parent a50b72e7b5
commit 99543d04c6
47 changed files with 4968 additions and 1279 deletions

View File

@@ -15,8 +15,14 @@ from app.features.evaluation.errors import (
EvaluationBadRequestError,
EvaluationNotFoundError,
)
from app.features.evaluation.execution_service import _assistant_text_for_eval_display
from app.features.evaluation.judge_service import EvalJudgeService
from app.features.evaluation.transcript_for_judge import (
assistant_text_for_eval_display,
format_eval_turn_block,
format_export_turns_with_labels,
format_session_messages_with_turn_labels,
pair_session_messages_to_turns,
)
from app.features.evaluation.schemas import MemoirSectionBaselineOut
from app.features.evaluation.session_catalog_service import SessionCatalogService
from app.features.evaluation.user_export_fixtures import read_user_export_fixture
@@ -30,6 +36,36 @@ _MAX_EVAL_CHAPTERS = 30
_MAX_EVAL_STORIES = 40
_MAX_EVIDENCE_CONVERSATIONS = 8
_MAX_EVIDENCE_TRANSCRIPT_CHARS = 16_000
_PRIOR_TRANSCRIPT_MAX_CHARS = 8000
async def _iter_turn_judgments_for_turns(
    judge: EvalJudgeService,
    turns: list[tuple[str, str]],
    *,
    sse_event: str,
) -> AsyncIterator[dict[str, Any]]:
    """Judge each turn in order and yield one event payload per turn.

    Per the original note on this helper, the per-turn prior truncation and
    block accumulation mirror `execute_eval_run`.

    Args:
        judge: Service whose `judge_turn` coroutine scores a single turn.
        turns: `(user_text, assistant_text)` pairs in conversation order.
        sse_event: Value placed under the `"event"` key of every payload.

    Yields:
        A dict with `event`, `turn_index`, `ok` (whether `judge_turn`
        returned something other than `None`) and `judge` (the result's
        `model_dump()` or `None`).
    """
    prior_blocks: list[str] = []
    for idx, (u_raw, ai_raw) in enumerate(turns):
        u = (u_raw or "").strip()
        # A missing reply must not reach the judge as the literal text
        # "None" (which is what `str(None)` produces); treat it as empty,
        # matching the None-guard already applied to the user side.
        reply = assistant_text_for_eval_display(
            "" if ai_raw is None else str(ai_raw)
        )
        prior = "\n\n".join(prior_blocks)
        # Keep only the tail of the accumulated transcript so the most
        # recent turns survive when the cap is exceeded.
        if len(prior) > _PRIOR_TRANSCRIPT_MAX_CHARS:
            prior = prior[-_PRIOR_TRANSCRIPT_MAX_CHARS:]
        tj = await judge.judge_turn(
            prior_transcript=prior,
            user_utterance=u,
            assistant_reply=reply,
            turn_index_0=idx,
        )
        yield {
            "event": sse_event,
            "turn_index": idx,
            "ok": tj is not None,
            "judge": tj.model_dump() if tj else None,
        }
        # The current turn becomes context only for later turns, so it is
        # appended after this turn has been judged.
        prior_blocks.append(format_eval_turn_block(idx, u, reply))
def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) -> str:
@@ -39,18 +75,6 @@ def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) ->
return f"{s[:max_chars]}\n\n…(已截断供评审)"
def _transcript_from_export_turns(turns: list[tuple[str, str]]) -> str:
    """Render exported `(user, assistant)` turns as one labelled transcript.

    Each side is stripped (a falsy value counts as empty) and skipped when
    nothing is left. User text is prefixed with "用户: "; assistant text is
    passed through `_assistant_text_for_eval_display` and prefixed with
    "AI: ". Entries are separated by a blank line.
    """

    def _labelled_lines():
        # Emit the user line before the assistant line within every turn.
        for user_raw, assistant_raw in turns:
            user_text = (user_raw or "").strip()
            assistant_text = (assistant_raw or "").strip()
            if user_text:
                yield f"用户: {user_text}"
            if assistant_text:
                yield f"AI: {_assistant_text_for_eval_display(assistant_text)}"

    return "\n\n".join(_labelled_lines())
def _trim_evidence_text(text: str, max_chars: int = _MAX_EVIDENCE_TRANSCRIPT_CHARS) -> str:
s = (text or "").strip()
if len(s) <= max_chars:
@@ -62,16 +86,7 @@ async def _conversation_transcript_for_manual(
db: AsyncSession, conversation_id: str
) -> str:
rows = await conversation_repo.get_conversation_messages(conversation_id, db)
parts: list[str] = []
for row in rows:
role = (row.role or "").lower()
body = (row.content or "").strip()
if not body:
continue
label = "用户" if role == "human" else "AI"
out = _assistant_text_for_eval_display(body) if role != "human" else body
parts.append(f"{label}: {out}")
return "\n\n".join(parts)
return format_session_messages_with_turn_labels(rows)
async def _user_transcript_evidence(db: AsyncSession, user_id: str) -> str:
@@ -125,14 +140,9 @@ class EvalJudgeManualService:
if not dialogue:
raise EvaluationNotFoundError("conversation not found")
parts: list[str] = []
for m in dialogue.messages:
r = (m.role or "").lower()
label = "用户" if r == "human" else "AI"
raw = m.content or ""
out = _assistant_text_for_eval_display(raw) if r != "human" else raw
parts.append(f"{label}: {out}")
replay_transcript = "\n\n".join(parts)
replay_transcript = format_session_messages_with_turn_labels(
list(dialogue.messages)
)
if not replay_transcript.strip():
raise EvaluationBadRequestError("no messages to judge")
@@ -141,7 +151,7 @@ class EvalJudgeManualService:
if fn:
try:
turns, _ = read_user_export_fixture(fn)
baseline_transcript = _transcript_from_export_turns(turns)
baseline_transcript = format_export_turns_with_labels(turns)
except ValueError as e:
raise EvaluationBadRequestError(str(e)) from e
except FileNotFoundError as e:
@@ -152,18 +162,28 @@ class EvalJudgeManualService:
judge = EvalJudgeService(judge_llm)
baseline_judge_dict: dict[str, Any] | None = None
if baseline_transcript.strip():
bj = await judge.judge_conversation(full_transcript=baseline_transcript)
baseline_result = await judge.judge_conversation_result(
full_transcript=baseline_transcript
)
bj = baseline_result.output
if bj:
baseline_judge_dict = bj.model_dump()
else:
errors.append("baseline_glm_failed")
errors.append(
f"baseline_glm_failed: {baseline_result.error or 'unknown error'}"
)
elif fn:
errors.append("baseline_transcript_empty")
rj = await judge.judge_conversation(full_transcript=replay_transcript)
replay_result = await judge.judge_conversation_result(
full_transcript=replay_transcript
)
rj = replay_result.output
replay_judge_dict = rj.model_dump() if rj else None
if not rj:
errors.append("replay_glm_failed")
errors.append(
f"replay_glm_failed: {replay_result.error or 'unknown error'}"
)
return {
"conversation_id": cid,
@@ -179,8 +199,11 @@ class EvalJudgeManualService:
self,
conversation_id: str,
fixture_filename: str | None,
*,
include_turn_judges: bool = False,
include_baseline_turn_judges: bool = False,
) -> AsyncIterator[dict[str, Any]]:
"""供 SSE先整体基准分、再整体回放分再流式对比与建议。"""
"""供 SSE先整体基准分、再整体回放分可选逐轮分,再流式对比与建议。"""
cid = (conversation_id or "").strip()
if not cid:
yield {
@@ -200,24 +223,21 @@ class EvalJudgeManualService:
}
return
parts: list[str] = []
for m in dialogue.messages:
r = (m.role or "").lower()
label = "用户" if r == "human" else "AI"
raw = m.content or ""
out = _assistant_text_for_eval_display(raw) if r != "human" else raw
parts.append(f"{label}: {out}")
replay_transcript = "\n\n".join(parts)
replay_transcript = format_session_messages_with_turn_labels(
list(dialogue.messages)
)
if not replay_transcript.strip():
yield {"event": "error", "phase": "load", "message": "no messages to judge"}
return
fn = (fixture_filename or "").strip() or None
baseline_transcript = ""
export_turns: list[tuple[str, str]] | None = None
if fn:
try:
turns, _ = read_user_export_fixture(fn)
baseline_transcript = _transcript_from_export_turns(turns)
export_turns = list(turns)
baseline_transcript = format_export_turns_with_labels(turns)
except ValueError as e:
yield {"event": "error", "phase": "fixture", "message": str(e)}
return
@@ -249,9 +269,10 @@ class EvalJudgeManualService:
baseline_judge = None
if baseline_transcript.strip():
baseline_judge = await judge.judge_conversation(
baseline_result = await judge.judge_conversation_result(
full_transcript=baseline_transcript
)
baseline_judge = baseline_result.output
yield {
"event": "baseline_judge",
"ok": baseline_judge is not None,
@@ -261,8 +282,24 @@ class EvalJudgeManualService:
yield {
"event": "error",
"phase": "baseline_glm",
"message": "基准整体打分失败(密钥、限流或 JSON 解析失败,见服务端日志)",
"message": (
f"基准整体打分失败:{baseline_result.error}"
if baseline_result.error
else "基准整体打分失败(密钥、限流或 JSON 解析失败,见服务端日志)"
),
}
elif (
include_baseline_turn_judges
and export_turns
and baseline_judge is not None
):
yield {"event": "meta", "phase": "baseline_turn_judges_start"}
async for row in _iter_turn_judgments_for_turns(
judge,
export_turns,
sse_event="baseline_turn_judge",
):
yield row
else:
yield {
"event": "baseline_judge",
@@ -271,7 +308,10 @@ class EvalJudgeManualService:
"judge": None,
}
replay_judge = await judge.judge_conversation(full_transcript=replay_transcript)
replay_result = await judge.judge_conversation_result(
full_transcript=replay_transcript
)
replay_judge = replay_result.output
yield {
"event": "replay_judge",
"ok": replay_judge is not None,
@@ -281,11 +321,26 @@ class EvalJudgeManualService:
yield {
"event": "error",
"phase": "replay_glm",
"message": "回放对话整体 GLM 打分失败(空密钥、限流或 JSON 解析失败,见服务端日志)",
"message": (
f"回放对话整体 GLM 打分失败:{replay_result.error}"
if replay_result.error
else "回放对话整体 GLM 打分失败(空密钥、限流或 JSON 解析失败,见服务端日志)"
),
}
yield {"event": "done"}
return
if include_turn_judges:
replay_pairs = pair_session_messages_to_turns(list(dialogue.messages))
if replay_pairs:
yield {"event": "meta", "phase": "replay_turn_judges_start"}
async for row in _iter_turn_judgments_for_turns(
judge,
replay_pairs,
sse_event="replay_turn_judge",
):
yield row
async for piece in judge.stream_conversation_compare(
baseline_transcript=baseline_transcript,
replay_transcript=replay_transcript,