feat(eval): internal-eval stack, judge fixes, and eval web overhaul
- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001 when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks, execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id for Memoir; Memoir chapter baseline↔DB row compare with title heuristics; Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI; react-markdown; development proxy and fixture updates.
This commit is contained in:
@@ -15,8 +15,14 @@ from app.features.evaluation.errors import (
|
||||
EvaluationBadRequestError,
|
||||
EvaluationNotFoundError,
|
||||
)
|
||||
from app.features.evaluation.execution_service import _assistant_text_for_eval_display
|
||||
from app.features.evaluation.judge_service import EvalJudgeService
|
||||
from app.features.evaluation.transcript_for_judge import (
|
||||
assistant_text_for_eval_display,
|
||||
format_eval_turn_block,
|
||||
format_export_turns_with_labels,
|
||||
format_session_messages_with_turn_labels,
|
||||
pair_session_messages_to_turns,
|
||||
)
|
||||
from app.features.evaluation.schemas import MemoirSectionBaselineOut
|
||||
from app.features.evaluation.session_catalog_service import SessionCatalogService
|
||||
from app.features.evaluation.user_export_fixtures import read_user_export_fixture
|
||||
@@ -30,6 +36,36 @@ _MAX_EVAL_CHAPTERS = 30
|
||||
_MAX_EVAL_STORIES = 40
|
||||
_MAX_EVIDENCE_CONVERSATIONS = 8
|
||||
_MAX_EVIDENCE_TRANSCRIPT_CHARS = 16_000
|
||||
_PRIOR_TRANSCRIPT_MAX_CHARS = 8000
|
||||
|
||||
|
||||
async def _iter_turn_judgments_for_turns(
    judge: EvalJudgeService,
    turns: list[tuple[str, str]],
    *,
    sse_event: str,
) -> AsyncIterator[dict[str, Any]]:
    """Judge each (user, assistant) turn in order, yielding one payload per turn.

    The transcript of earlier turns handed to the judge is accumulated block by
    block and clipped to its last ``_PRIOR_TRANSCRIPT_MAX_CHARS`` characters.
    The original note states this mirrors the per-turn prior truncation and
    block accumulation done in ``execute_eval_run``.

    Args:
        judge: Service whose ``judge_turn`` coroutine scores a single turn.
        turns: ``(user_utterance, assistant_reply)`` pairs, oldest first.
        sse_event: Value placed in the ``"event"`` field of every yielded dict.

    Yields:
        A dict with ``event``, ``turn_index``, ``ok`` (whether a judgment was
        returned) and ``judge`` (the judgment's ``model_dump()`` or ``None``).
    """
    prior_blocks: list[str] = []
    for idx, (u_raw, ai_raw) in enumerate(turns):
        u = (u_raw or "").strip()
        # A missing reply must become empty text: str(None) would otherwise
        # feed the literal string "None" to the judge as the assistant answer.
        reply = assistant_text_for_eval_display(str(ai_raw or ""))
        prior = "\n\n".join(prior_blocks)
        if len(prior) > _PRIOR_TRANSCRIPT_MAX_CHARS:
            # Keep only the most recent context (the tail of the transcript).
            prior = prior[-_PRIOR_TRANSCRIPT_MAX_CHARS:]
        tj = await judge.judge_turn(
            prior_transcript=prior,
            user_utterance=u,
            assistant_reply=reply,
            turn_index_0=idx,
        )
        judged = tj is not None
        yield {
            "event": sse_event,
            "turn_index": idx,
            "ok": judged,
            "judge": tj.model_dump() if judged else None,
        }
        # The current turn joins the prior context only after it was judged.
        prior_blocks.append(format_eval_turn_block(idx, u, reply))
|
||||
|
||||
|
||||
def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) -> str:
|
||||
@@ -39,18 +75,6 @@ def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) ->
|
||||
return f"{s[:max_chars]}\n\n…(已截断供评审)"
|
||||
|
||||
|
||||
def _transcript_from_export_turns(turns: list[tuple[str, str]]) -> str:
    """Render exported (user, assistant) turns as a labelled plain-text transcript.

    Every non-empty user utterance becomes a ``用户: ...`` paragraph and every
    non-empty assistant reply an ``AI: ...`` paragraph, the latter after passing
    through ``_assistant_text_for_eval_display``. Paragraphs are joined with a
    blank line; blank or missing entries are skipped.
    """

    def _paragraphs():
        for user_raw, assistant_raw in turns:
            user_text = (user_raw or "").strip()
            assistant_text = (assistant_raw or "").strip()
            if user_text:
                yield f"用户: {user_text}"
            if assistant_text:
                yield f"AI: {_assistant_text_for_eval_display(assistant_text)}"

    return "\n\n".join(_paragraphs())
|
||||
|
||||
|
||||
def _trim_evidence_text(text: str, max_chars: int = _MAX_EVIDENCE_TRANSCRIPT_CHARS) -> str:
|
||||
s = (text or "").strip()
|
||||
if len(s) <= max_chars:
|
||||
@@ -62,16 +86,7 @@ async def _conversation_transcript_for_manual(
|
||||
db: AsyncSession, conversation_id: str
|
||||
) -> str:
|
||||
rows = await conversation_repo.get_conversation_messages(conversation_id, db)
|
||||
parts: list[str] = []
|
||||
for row in rows:
|
||||
role = (row.role or "").lower()
|
||||
body = (row.content or "").strip()
|
||||
if not body:
|
||||
continue
|
||||
label = "用户" if role == "human" else "AI"
|
||||
out = _assistant_text_for_eval_display(body) if role != "human" else body
|
||||
parts.append(f"{label}: {out}")
|
||||
return "\n\n".join(parts)
|
||||
return format_session_messages_with_turn_labels(rows)
|
||||
|
||||
|
||||
async def _user_transcript_evidence(db: AsyncSession, user_id: str) -> str:
|
||||
@@ -125,14 +140,9 @@ class EvalJudgeManualService:
|
||||
if not dialogue:
|
||||
raise EvaluationNotFoundError("conversation not found")
|
||||
|
||||
parts: list[str] = []
|
||||
for m in dialogue.messages:
|
||||
r = (m.role or "").lower()
|
||||
label = "用户" if r == "human" else "AI"
|
||||
raw = m.content or ""
|
||||
out = _assistant_text_for_eval_display(raw) if r != "human" else raw
|
||||
parts.append(f"{label}: {out}")
|
||||
replay_transcript = "\n\n".join(parts)
|
||||
replay_transcript = format_session_messages_with_turn_labels(
|
||||
list(dialogue.messages)
|
||||
)
|
||||
if not replay_transcript.strip():
|
||||
raise EvaluationBadRequestError("no messages to judge")
|
||||
|
||||
@@ -141,7 +151,7 @@ class EvalJudgeManualService:
|
||||
if fn:
|
||||
try:
|
||||
turns, _ = read_user_export_fixture(fn)
|
||||
baseline_transcript = _transcript_from_export_turns(turns)
|
||||
baseline_transcript = format_export_turns_with_labels(turns)
|
||||
except ValueError as e:
|
||||
raise EvaluationBadRequestError(str(e)) from e
|
||||
except FileNotFoundError as e:
|
||||
@@ -152,18 +162,28 @@ class EvalJudgeManualService:
|
||||
judge = EvalJudgeService(judge_llm)
|
||||
baseline_judge_dict: dict[str, Any] | None = None
|
||||
if baseline_transcript.strip():
|
||||
bj = await judge.judge_conversation(full_transcript=baseline_transcript)
|
||||
baseline_result = await judge.judge_conversation_result(
|
||||
full_transcript=baseline_transcript
|
||||
)
|
||||
bj = baseline_result.output
|
||||
if bj:
|
||||
baseline_judge_dict = bj.model_dump()
|
||||
else:
|
||||
errors.append("baseline_glm_failed")
|
||||
errors.append(
|
||||
f"baseline_glm_failed: {baseline_result.error or 'unknown error'}"
|
||||
)
|
||||
elif fn:
|
||||
errors.append("baseline_transcript_empty")
|
||||
|
||||
rj = await judge.judge_conversation(full_transcript=replay_transcript)
|
||||
replay_result = await judge.judge_conversation_result(
|
||||
full_transcript=replay_transcript
|
||||
)
|
||||
rj = replay_result.output
|
||||
replay_judge_dict = rj.model_dump() if rj else None
|
||||
if not rj:
|
||||
errors.append("replay_glm_failed")
|
||||
errors.append(
|
||||
f"replay_glm_failed: {replay_result.error or 'unknown error'}"
|
||||
)
|
||||
|
||||
return {
|
||||
"conversation_id": cid,
|
||||
@@ -179,8 +199,11 @@ class EvalJudgeManualService:
|
||||
self,
|
||||
conversation_id: str,
|
||||
fixture_filename: str | None,
|
||||
*,
|
||||
include_turn_judges: bool = False,
|
||||
include_baseline_turn_judges: bool = False,
|
||||
) -> AsyncIterator[dict[str, Any]]:
|
||||
"""供 SSE:先整体基准分、再整体回放分,再流式对比与建议。"""
|
||||
"""供 SSE:先整体基准分、再整体回放分,可选逐轮分,再流式对比与建议。"""
|
||||
cid = (conversation_id or "").strip()
|
||||
if not cid:
|
||||
yield {
|
||||
@@ -200,24 +223,21 @@ class EvalJudgeManualService:
|
||||
}
|
||||
return
|
||||
|
||||
parts: list[str] = []
|
||||
for m in dialogue.messages:
|
||||
r = (m.role or "").lower()
|
||||
label = "用户" if r == "human" else "AI"
|
||||
raw = m.content or ""
|
||||
out = _assistant_text_for_eval_display(raw) if r != "human" else raw
|
||||
parts.append(f"{label}: {out}")
|
||||
replay_transcript = "\n\n".join(parts)
|
||||
replay_transcript = format_session_messages_with_turn_labels(
|
||||
list(dialogue.messages)
|
||||
)
|
||||
if not replay_transcript.strip():
|
||||
yield {"event": "error", "phase": "load", "message": "no messages to judge"}
|
||||
return
|
||||
|
||||
fn = (fixture_filename or "").strip() or None
|
||||
baseline_transcript = ""
|
||||
export_turns: list[tuple[str, str]] | None = None
|
||||
if fn:
|
||||
try:
|
||||
turns, _ = read_user_export_fixture(fn)
|
||||
baseline_transcript = _transcript_from_export_turns(turns)
|
||||
export_turns = list(turns)
|
||||
baseline_transcript = format_export_turns_with_labels(turns)
|
||||
except ValueError as e:
|
||||
yield {"event": "error", "phase": "fixture", "message": str(e)}
|
||||
return
|
||||
@@ -249,9 +269,10 @@ class EvalJudgeManualService:
|
||||
|
||||
baseline_judge = None
|
||||
if baseline_transcript.strip():
|
||||
baseline_judge = await judge.judge_conversation(
|
||||
baseline_result = await judge.judge_conversation_result(
|
||||
full_transcript=baseline_transcript
|
||||
)
|
||||
baseline_judge = baseline_result.output
|
||||
yield {
|
||||
"event": "baseline_judge",
|
||||
"ok": baseline_judge is not None,
|
||||
@@ -261,8 +282,24 @@ class EvalJudgeManualService:
|
||||
yield {
|
||||
"event": "error",
|
||||
"phase": "baseline_glm",
|
||||
"message": "基准整体打分失败(密钥、限流或 JSON 解析失败,见服务端日志)",
|
||||
"message": (
|
||||
f"基准整体打分失败:{baseline_result.error}"
|
||||
if baseline_result.error
|
||||
else "基准整体打分失败(密钥、限流或 JSON 解析失败,见服务端日志)"
|
||||
),
|
||||
}
|
||||
elif (
|
||||
include_baseline_turn_judges
|
||||
and export_turns
|
||||
and baseline_judge is not None
|
||||
):
|
||||
yield {"event": "meta", "phase": "baseline_turn_judges_start"}
|
||||
async for row in _iter_turn_judgments_for_turns(
|
||||
judge,
|
||||
export_turns,
|
||||
sse_event="baseline_turn_judge",
|
||||
):
|
||||
yield row
|
||||
else:
|
||||
yield {
|
||||
"event": "baseline_judge",
|
||||
@@ -271,7 +308,10 @@ class EvalJudgeManualService:
|
||||
"judge": None,
|
||||
}
|
||||
|
||||
replay_judge = await judge.judge_conversation(full_transcript=replay_transcript)
|
||||
replay_result = await judge.judge_conversation_result(
|
||||
full_transcript=replay_transcript
|
||||
)
|
||||
replay_judge = replay_result.output
|
||||
yield {
|
||||
"event": "replay_judge",
|
||||
"ok": replay_judge is not None,
|
||||
@@ -281,11 +321,26 @@ class EvalJudgeManualService:
|
||||
yield {
|
||||
"event": "error",
|
||||
"phase": "replay_glm",
|
||||
"message": "回放对话整体 GLM 打分失败(空密钥、限流或 JSON 解析失败,见服务端日志)",
|
||||
"message": (
|
||||
f"回放对话整体 GLM 打分失败:{replay_result.error}"
|
||||
if replay_result.error
|
||||
else "回放对话整体 GLM 打分失败(空密钥、限流或 JSON 解析失败,见服务端日志)"
|
||||
),
|
||||
}
|
||||
yield {"event": "done"}
|
||||
return
|
||||
|
||||
if include_turn_judges:
|
||||
replay_pairs = pair_session_messages_to_turns(list(dialogue.messages))
|
||||
if replay_pairs:
|
||||
yield {"event": "meta", "phase": "replay_turn_judges_start"}
|
||||
async for row in _iter_turn_judgments_for_turns(
|
||||
judge,
|
||||
replay_pairs,
|
||||
sse_event="replay_turn_judge",
|
||||
):
|
||||
yield row
|
||||
|
||||
async for piece in judge.stream_conversation_compare(
|
||||
baseline_transcript=baseline_transcript,
|
||||
replay_transcript=replay_transcript,
|
||||
|
||||
Reference in New Issue
Block a user