feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps it with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY attaches :8001
  when :8000 is already up; documented in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.
This commit is contained in:
Kevin
2026-04-07 17:15:01 +08:00
parent a50b72e7b5
commit 99543d04c6
47 changed files with 4968 additions and 1279 deletions

View File

@@ -20,6 +20,10 @@ from app.features.evaluation.gate_report_service import gate_result_to_details
from app.features.evaluation.gating_service import compute_gate
from app.features.evaluation.judge_service import EvalJudgeService
from app.features.evaluation.models import EvalCase, EvalRun, EvalVersion
from app.features.evaluation.transcript_for_judge import (
assistant_text_for_eval_display,
format_eval_turn_block,
)
logger = get_logger(__name__)
@@ -39,13 +43,23 @@ def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) ->
def _composite(
conv: float | None, mem: float | None, weights: dict[str, Any] | None
) -> float:
) -> float | None:
"""合成总分;缺失的一侧不计为 0避免把评审失败误标为极差。
仅一侧有分:返回该侧原始分(不乘权重),表示当前 run 仅完成了部分评审维度。
"""
w = weights or {}
wc = float(w.get("conversation", 0.5))
wm = float(w.get("memoir", 0.5))
c = float(conv or 0)
m = float(mem or 0)
return wc * c + wm * m
has_c = conv is not None
has_m = mem is not None
if not has_c and not has_m:
return None
if has_c and has_m:
return float(wc) * float(conv) + float(wm) * float(mem)
if has_c:
return float(conv)
return float(mem)
def _utterances_for_case(case: EvalCase) -> list[str]:
@@ -53,11 +67,6 @@ def _utterances_for_case(case: EvalCase) -> list[str]:
return [str(u).strip() for u in raw if str(u).strip()]
def _assistant_text_for_eval_display(raw: str) -> str:
"""评审与 transcript 展示:避免字面量 [SPLIT] 干扰 judge 阅读。"""
return (raw or "").replace("[SPLIT]", "\n")
def _trim_evidence_text(text: str, max_chars: int = _MAX_EVIDENCE_TRANSCRIPT_CHARS) -> str:
s = (text or "").strip()
if len(s) <= max_chars:
@@ -72,7 +81,7 @@ def _dialogue_transcript_from_pairs(pairs: list[tuple[str, str]]) -> str:
if not body:
continue
label = "用户" if role == "human" else "AI"
out = _assistant_text_for_eval_display(body) if role != "human" else body
out = assistant_text_for_eval_display(body) if role != "human" else body
parts.append(f"{label}: {out}")
return "\n\n".join(parts)
@@ -179,18 +188,22 @@ async def execute_eval_run(
if i >= len(replies):
break
transcript_parts.append(
f"用户: {u}\nAI: {_assistant_text_for_eval_display(replies[i])}"
format_eval_turn_block(i, u, assistant_text_for_eval_display(replies[i]))
)
prior = ""
prior_blocks: list[str] = []
for idx, u in enumerate(utterances):
if idx >= len(replies):
break
reply = _assistant_text_for_eval_display(replies[idx])
reply = assistant_text_for_eval_display(replies[idx])
lat = latencies[idx] if idx < len(latencies) else None
prior = "\n\n".join(prior_blocks)
if len(prior) > 8000:
prior = prior[-8000:]
tj = await judge.judge_turn(
prior_transcript=prior,
user_utterance=u,
assistant_reply=reply,
turn_index_0=idx,
)
scores = tj.model_dump() if tj else None
rationale = tj.rationale if tj else None
@@ -205,7 +218,7 @@ async def execute_eval_run(
judge_rationale=rationale,
)
await db.commit()
prior = (prior + f"\n用户: {u}\nAI: {reply}")[-8000:]
prior_blocks.append(format_eval_turn_block(idx, u, reply))
full_transcript = "\n\n".join(transcript_parts)
conv_out = await judge.judge_conversation(full_transcript=full_transcript)
@@ -261,6 +274,7 @@ async def execute_eval_run(
reference_memoir_markdown=reference_memoir,
evidence_notes=(
"这是用户现有章节的严格评审;真实性、覆盖率、可追溯性必须对照原始访谈证据。"
" 评审范围:单章节节选;跨全书连贯性仅在与证据一致时评估,否则保守打分并在 insufficient_evidence 说明。"
),
)
chapter_entries.append(
@@ -287,6 +301,7 @@ async def execute_eval_run(
reference_memoir_markdown=reference_memoir,
evidence_notes=(
"这是用户现有故事的严格评审;真实性、覆盖率、可追溯性必须对照原始访谈证据。"
" 评审范围:单故事节选;跨篇章关联若证据不足须保守并在 insufficient_evidence 说明。"
),
)
story_entries.append(
@@ -300,18 +315,31 @@ async def execute_eval_run(
except Exception as e:
logger.warning("eval story judges skipped: {}", e)
mem_parts: list[float] = []
synth_scores: list[float] = []
if mem_out is not None:
mem_parts.append(float(mem_out.total_score))
synth_scores.append(float(mem_out.total_score))
library_scores: list[float] = []
for row in chapter_entries:
j = row.get("judge")
if isinstance(j, dict) and j.get("total_score") is not None:
mem_parts.append(float(j["total_score"]))
library_scores.append(float(j["total_score"]))
for row in story_entries:
j = row.get("judge")
if isinstance(j, dict) and j.get("total_score") is not None:
mem_parts.append(float(j["total_score"]))
mem_total = sum(mem_parts) / len(mem_parts) if mem_parts else None
library_scores.append(float(j["total_score"]))
def _mean(xs: list[float]) -> float:
return sum(xs) / len(xs) if xs else 0.0
if synth_scores and library_scores:
mem_total = 0.5 * _mean(synth_scores) + 0.5 * _mean(library_scores)
elif synth_scores:
mem_total = _mean(synth_scores)
elif library_scores:
mem_total = _mean(library_scores)
else:
mem_total = None
exp = await eval_repo.get_experiment(db, str(run.experiment_id))
weights = (
@@ -326,6 +354,17 @@ async def execute_eval_run(
"memoir_judge": mem_out.model_dump() if mem_out else None,
"chapters": chapter_entries,
"stories": story_entries,
"judge_meta": {
"conversation_judge_ok": conv_out is not None,
"memoir_synthetic_ok": mem_out is not None,
"memoir_synth_scores_n": len(synth_scores),
"memoir_library_scores_n": len(library_scores),
"memoir_aggregate_rule": (
"synth_plus_library_weighted_mean"
if synth_scores and library_scores
else ("synthetic_only" if synth_scores else "library_only")
),
},
}
await eval_repo.update_run(
db,