feat(eval): internal-eval stack, judge fixes, and eval web overhaul
- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001 when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks, execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id for Memoir; Memoir chapter baseline↔DB row compare with title heuristics; Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI; react-markdown; development proxy and fixture updates.
This commit is contained in:
@@ -20,6 +20,10 @@ from app.features.evaluation.gate_report_service import gate_result_to_details
|
||||
from app.features.evaluation.gating_service import compute_gate
|
||||
from app.features.evaluation.judge_service import EvalJudgeService
|
||||
from app.features.evaluation.models import EvalCase, EvalRun, EvalVersion
|
||||
from app.features.evaluation.transcript_for_judge import (
|
||||
assistant_text_for_eval_display,
|
||||
format_eval_turn_block,
|
||||
)
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
@@ -39,13 +43,23 @@ def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) ->
|
||||
|
||||
def _composite(
|
||||
conv: float | None, mem: float | None, weights: dict[str, Any] | None
|
||||
) -> float:
|
||||
) -> float | None:
|
||||
"""合成总分;缺失的一侧不计为 0,避免把评审失败误标为极差。
|
||||
|
||||
仅一侧有分:返回该侧原始分(不乘权重),表示当前 run 仅完成了部分评审维度。
|
||||
"""
|
||||
w = weights or {}
|
||||
wc = float(w.get("conversation", 0.5))
|
||||
wm = float(w.get("memoir", 0.5))
|
||||
c = float(conv or 0)
|
||||
m = float(mem or 0)
|
||||
return wc * c + wm * m
|
||||
has_c = conv is not None
|
||||
has_m = mem is not None
|
||||
if not has_c and not has_m:
|
||||
return None
|
||||
if has_c and has_m:
|
||||
return float(wc) * float(conv) + float(wm) * float(mem)
|
||||
if has_c:
|
||||
return float(conv)
|
||||
return float(mem)
|
||||
|
||||
|
||||
def _utterances_for_case(case: EvalCase) -> list[str]:
|
||||
@@ -53,11 +67,6 @@ def _utterances_for_case(case: EvalCase) -> list[str]:
|
||||
return [str(u).strip() for u in raw if str(u).strip()]
|
||||
|
||||
|
||||
def _assistant_text_for_eval_display(raw: str) -> str:
|
||||
"""评审与 transcript 展示:避免字面量 [SPLIT] 干扰 judge 阅读。"""
|
||||
return (raw or "").replace("[SPLIT]", "\n")
|
||||
|
||||
|
||||
def _trim_evidence_text(text: str, max_chars: int = _MAX_EVIDENCE_TRANSCRIPT_CHARS) -> str:
|
||||
s = (text or "").strip()
|
||||
if len(s) <= max_chars:
|
||||
@@ -72,7 +81,7 @@ def _dialogue_transcript_from_pairs(pairs: list[tuple[str, str]]) -> str:
|
||||
if not body:
|
||||
continue
|
||||
label = "用户" if role == "human" else "AI"
|
||||
out = _assistant_text_for_eval_display(body) if role != "human" else body
|
||||
out = assistant_text_for_eval_display(body) if role != "human" else body
|
||||
parts.append(f"{label}: {out}")
|
||||
return "\n\n".join(parts)
|
||||
|
||||
@@ -179,18 +188,22 @@ async def execute_eval_run(
|
||||
if i >= len(replies):
|
||||
break
|
||||
transcript_parts.append(
|
||||
f"用户: {u}\nAI: {_assistant_text_for_eval_display(replies[i])}"
|
||||
format_eval_turn_block(i, u, assistant_text_for_eval_display(replies[i]))
|
||||
)
|
||||
prior = ""
|
||||
prior_blocks: list[str] = []
|
||||
for idx, u in enumerate(utterances):
|
||||
if idx >= len(replies):
|
||||
break
|
||||
reply = _assistant_text_for_eval_display(replies[idx])
|
||||
reply = assistant_text_for_eval_display(replies[idx])
|
||||
lat = latencies[idx] if idx < len(latencies) else None
|
||||
prior = "\n\n".join(prior_blocks)
|
||||
if len(prior) > 8000:
|
||||
prior = prior[-8000:]
|
||||
tj = await judge.judge_turn(
|
||||
prior_transcript=prior,
|
||||
user_utterance=u,
|
||||
assistant_reply=reply,
|
||||
turn_index_0=idx,
|
||||
)
|
||||
scores = tj.model_dump() if tj else None
|
||||
rationale = tj.rationale if tj else None
|
||||
@@ -205,7 +218,7 @@ async def execute_eval_run(
|
||||
judge_rationale=rationale,
|
||||
)
|
||||
await db.commit()
|
||||
prior = (prior + f"\n用户: {u}\nAI: {reply}")[-8000:]
|
||||
prior_blocks.append(format_eval_turn_block(idx, u, reply))
|
||||
|
||||
full_transcript = "\n\n".join(transcript_parts)
|
||||
conv_out = await judge.judge_conversation(full_transcript=full_transcript)
|
||||
@@ -261,6 +274,7 @@ async def execute_eval_run(
|
||||
reference_memoir_markdown=reference_memoir,
|
||||
evidence_notes=(
|
||||
"这是用户现有章节的严格评审;真实性、覆盖率、可追溯性必须对照原始访谈证据。"
|
||||
" 评审范围:单章节节选;跨全书连贯性仅在与证据一致时评估,否则保守打分并在 insufficient_evidence 说明。"
|
||||
),
|
||||
)
|
||||
chapter_entries.append(
|
||||
@@ -287,6 +301,7 @@ async def execute_eval_run(
|
||||
reference_memoir_markdown=reference_memoir,
|
||||
evidence_notes=(
|
||||
"这是用户现有故事的严格评审;真实性、覆盖率、可追溯性必须对照原始访谈证据。"
|
||||
" 评审范围:单故事节选;跨篇章关联若证据不足须保守并在 insufficient_evidence 说明。"
|
||||
),
|
||||
)
|
||||
story_entries.append(
|
||||
@@ -300,18 +315,31 @@ async def execute_eval_run(
|
||||
except Exception as e:
|
||||
logger.warning("eval story judges skipped: {}", e)
|
||||
|
||||
mem_parts: list[float] = []
|
||||
synth_scores: list[float] = []
|
||||
if mem_out is not None:
|
||||
mem_parts.append(float(mem_out.total_score))
|
||||
synth_scores.append(float(mem_out.total_score))
|
||||
|
||||
library_scores: list[float] = []
|
||||
for row in chapter_entries:
|
||||
j = row.get("judge")
|
||||
if isinstance(j, dict) and j.get("total_score") is not None:
|
||||
mem_parts.append(float(j["total_score"]))
|
||||
library_scores.append(float(j["total_score"]))
|
||||
for row in story_entries:
|
||||
j = row.get("judge")
|
||||
if isinstance(j, dict) and j.get("total_score") is not None:
|
||||
mem_parts.append(float(j["total_score"]))
|
||||
mem_total = sum(mem_parts) / len(mem_parts) if mem_parts else None
|
||||
library_scores.append(float(j["total_score"]))
|
||||
|
||||
def _mean(xs: list[float]) -> float:
|
||||
return sum(xs) / len(xs) if xs else 0.0
|
||||
|
||||
if synth_scores and library_scores:
|
||||
mem_total = 0.5 * _mean(synth_scores) + 0.5 * _mean(library_scores)
|
||||
elif synth_scores:
|
||||
mem_total = _mean(synth_scores)
|
||||
elif library_scores:
|
||||
mem_total = _mean(library_scores)
|
||||
else:
|
||||
mem_total = None
|
||||
|
||||
exp = await eval_repo.get_experiment(db, str(run.experiment_id))
|
||||
weights = (
|
||||
@@ -326,6 +354,17 @@ async def execute_eval_run(
|
||||
"memoir_judge": mem_out.model_dump() if mem_out else None,
|
||||
"chapters": chapter_entries,
|
||||
"stories": story_entries,
|
||||
"judge_meta": {
|
||||
"conversation_judge_ok": conv_out is not None,
|
||||
"memoir_synthetic_ok": mem_out is not None,
|
||||
"memoir_synth_scores_n": len(synth_scores),
|
||||
"memoir_library_scores_n": len(library_scores),
|
||||
"memoir_aggregate_rule": (
|
||||
"synth_plus_library_weighted_mean"
|
||||
if synth_scores and library_scores
|
||||
else ("synthetic_only" if synth_scores else "library_only")
|
||||
),
|
||||
},
|
||||
}
|
||||
await eval_repo.update_run(
|
||||
db,
|
||||
|
||||
Reference in New Issue
Block a user