1147 lines
44 KiB
Python
1147 lines
44 KiB
Python
"""手动触发评测台评审(智谱 / DeepSeek;不写 eval_runs;Playground 对话评分写入 conversations 表)。"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import asyncio
|
||
import copy
|
||
import re
|
||
from collections.abc import AsyncIterator
|
||
from datetime import datetime, timezone
|
||
from typing import Any
|
||
|
||
from sqlalchemy.ext.asyncio import AsyncSession
|
||
|
||
from app.core.config import settings
|
||
from app.core.dependencies import (
|
||
EvalJudgeProvider,
|
||
build_eval_judge_llm_spec,
|
||
)
|
||
from app.core.logging import get_logger
|
||
from app.features.conversation import repo as conversation_repo
|
||
from app.features.evaluation.conversation_compare_summary import (
|
||
build_conversation_compare_summary,
|
||
)
|
||
from app.features.evaluation.errors import (
|
||
EvaluationBadRequestError,
|
||
EvaluationNotFoundError,
|
||
)
|
||
from app.features.evaluation.eval_trace_service import EvalTraceService
|
||
from app.features.evaluation.judge_schemas import ConversationJudgeOutput
|
||
from app.features.evaluation.judge_service import (
|
||
EvalJudgeService,
|
||
eval_judge_compare_bundle_caps,
|
||
eval_judge_conversation_transcript_max_chars_for_context,
|
||
)
|
||
from app.features.evaluation.memoir_compare_summary import (
|
||
build_memoir_compare_summary,
|
||
)
|
||
from app.features.evaluation.schemas import MemoirSectionBaselineOut
|
||
from app.features.evaluation.session_catalog_service import SessionCatalogService
|
||
from app.features.evaluation.transcript_for_judge import (
|
||
assistant_text_for_eval_display,
|
||
format_eval_turn_block,
|
||
format_export_turns_with_labels,
|
||
format_session_messages_with_turn_labels,
|
||
pair_session_messages_to_turns,
|
||
)
|
||
from app.features.evaluation.user_export_fixtures import read_user_export_fixture
|
||
from app.features.memoir.repo import get_chapters_for_memoir_list
|
||
from app.features.story.repo import get_stories_for_user
|
||
|
||
logger = get_logger(__name__)

# Only the first N chapters returned by the chapter list are judged / snapshotted.
_MAX_EVAL_CHAPTERS = 30
_MAX_EVAL_STORIES = 40  # still caps the story list in memoir_snapshot and similar
# Per-turn judging keeps only the trailing N characters of the prior transcript.
_PRIOR_TRANSCRIPT_MAX_CHARS = 8000

# Message surfaced to the caller when no judge LLM could be built for the
# requested provider (see _make_eval_judge).
_JUDGE_CONFIG_HINT = (
    "评审未配置:智谱需 eval_judge_api_key 或 zhipu_api_key;"
    "DeepSeek 需 deepseek_api_key(或 llm_api_key)"
)
|
||
|
||
|
||
def _make_eval_judge(
    judge_provider: EvalJudgeProvider,
    judge_model: str | None,
) -> tuple[EvalJudgeService | None, str]:
    """Build the judge service for the requested provider / model.

    Returns:
        ``(service, resolved_model)`` when an LLM spec with a usable ``llm``
        could be built, otherwise ``(None, "")``.
    """
    llm_spec = build_eval_judge_llm_spec(judge_provider, judge_model)
    if llm_spec and llm_spec.llm:
        service = EvalJudgeService(
            llm_spec.llm,
            context_window_tokens=llm_spec.context_window_tokens,
        )
        return service, llm_spec.resolved_model
    # No spec, or a spec without an LLM client: caller reports the config hint.
    return None, ""
|
||
|
||
|
||
def _strip_baseline_judge_errors(errs: list[Any]) -> list[str]:
|
||
out: list[str] = []
|
||
for e in errs:
|
||
s = str(e) if e is not None else ""
|
||
if not s.strip():
|
||
continue
|
||
if (
|
||
"基准整体打分失败" in s
|
||
or s.startswith("baseline_glm5:")
|
||
or "baseline_glm5_failed:" in s
|
||
):
|
||
continue
|
||
out.append(s)
|
||
return out
|
||
|
||
|
||
async def _iter_turn_judgments_for_turns(
    judge: EvalJudgeService,
    turns: list[tuple[str, str]],
    *,
    sse_event: str,
) -> AsyncIterator[dict[str, Any]]:
    """Judge each (user, assistant) turn in order and yield one event per turn.

    Uses the same per-turn prior truncation and block accumulation as
    ``execute_eval_run``: the prior transcript passed to the judge is the
    accumulated turn blocks joined by blank lines, limited to its trailing
    ``_PRIOR_TRANSCRIPT_MAX_CHARS`` characters.

    Each yielded dict has the keys ``event`` (= *sse_event*), ``turn_index``,
    ``ok`` and ``judge`` (the dumped judgment, or ``None``).
    """
    history: list[str] = []
    for turn_no, pair in enumerate(turns):
        raw_user, raw_assistant = pair
        user_text = (raw_user or "").strip()
        assistant_text = assistant_text_for_eval_display(str(raw_assistant))

        context = "\n\n".join(history)
        if len(context) > _PRIOR_TRANSCRIPT_MAX_CHARS:
            # Keep the most recent part of the conversation.
            context = context[-_PRIOR_TRANSCRIPT_MAX_CHARS:]

        verdict = await judge.judge_turn(
            prior_transcript=context,
            user_utterance=user_text,
            assistant_reply=assistant_text,
            turn_index_0=turn_no,
        )
        if verdict:
            dumped = verdict.model_dump()
        else:
            dumped = None
        yield {
            "event": sse_event,
            "turn_index": turn_no,
            "ok": verdict is not None,
            "judge": dumped,
        }
        # The current turn only becomes context for the turns that follow it.
        history.append(format_eval_turn_block(turn_no, user_text, assistant_text))
|
||
|
||
|
||
def _clip_md_for_judge(text: str, max_chars: int | None = None) -> str:
|
||
cap = (
|
||
max_chars
|
||
if max_chars is not None
|
||
else max(1000, int(settings.eval_judge_memoir_body_max_chars))
|
||
)
|
||
s = (text or "").strip()
|
||
if len(s) <= cap:
|
||
return s
|
||
return f"{s[:cap]}\n\n…(已截断供评审)"
|
||
|
||
|
||
async def _conversation_transcript_for_manual(
    db: AsyncSession, conversation_id: str
) -> str:
    """Load a conversation's messages and format them with turn labels."""
    messages = await conversation_repo.get_conversation_messages(conversation_id, db)
    transcript = format_session_messages_with_turn_labels(messages)
    return transcript
|
||
|
||
|
||
def _normalize_title_key(title: str) -> str:
|
||
t = (title or "").strip().lower()
|
||
t = re.sub(r"^#+\s*", "", t)
|
||
return re.sub(r"\s+", " ", t)
|
||
|
||
|
||
def _baseline_for_chapter_title(
|
||
baselines: list[MemoirSectionBaselineOut],
|
||
chapter_title: str,
|
||
index: int,
|
||
) -> MemoirSectionBaselineOut | None:
|
||
if baselines:
|
||
key = _normalize_title_key(chapter_title)
|
||
for b in baselines:
|
||
if _normalize_title_key(b.title) == key:
|
||
return b
|
||
if 0 <= index < len(baselines):
|
||
return baselines[index]
|
||
return None
|
||
|
||
|
||
class EvalJudgeManualService:
    """Manually triggered judging of conversations and memoir chapters.

    Conversation judging results are stored through
    ``conversation_repo.set_playground_conversation_judge_json``; memoir
    judging results are only returned / streamed to the caller.
    """

    def __init__(self, db: AsyncSession) -> None:
        """Keep the session used for every repository / catalog call."""
        self._db = db

    @staticmethod
    def _chapter_evidence_notes(
        lineage_tier: str,
        evidence_summary: str,
        truncated: bool,
        dropped: list[str],
    ) -> str:
        """Build the evidence note string passed to the memoir judge.

        At most the first 12 dropped section names are listed; ``none`` is
        written when nothing was dropped.
        """
        drops = ",".join(dropped[:12]) if dropped else ""
        return (
            "严格按文档打分;真实性、事实覆盖率、可追溯性以本章节绑定的证据闭包为准。"
            f" lineage_tier={lineage_tier};evidence_summary={evidence_summary};"
            f" prompt_truncated={truncated};dropped_sections={drops or 'none'}"
        )

    async def _persist_playground_conversation_judge(
        self, conversation_id: str, bundle: dict[str, Any]
    ) -> None:
        """Best-effort save of a judge bundle for a conversation.

        Commits only when the repository returns a row. Any exception is
        logged and swallowed so that a storage failure never fails the
        judging request itself.
        """
        try:
            row = await conversation_repo.set_playground_conversation_judge_json(
                conversation_id, self._db, bundle
            )
            if row is not None:
                await self._db.commit()
        except Exception:
            # Deliberate best-effort: the caller still gets its judge result.
            logger.exception(
                "persist playground_conversation_judge_json failed conversation_id={}",
                conversation_id,
            )

    async def judge_conversation(
        self,
        conversation_id: str,
        fixture_filename: str | None,
        *,
        judge_provider: EvalJudgeProvider = "zhipu",
        judge_model: str | None = None,
    ) -> dict[str, Any]:
        """Score a stored conversation (and optionally a baseline fixture) in one call.

        Args:
            conversation_id: Conversation to judge; blank values are rejected.
            fixture_filename: Optional user-export fixture used as baseline.
            judge_provider: Provider handed to ``_make_eval_judge``.
            judge_model: Optional model override handed to ``_make_eval_judge``.

        Returns:
            Dict with ``conversation_id``, ``fixture_filename``, both
            transcripts, both dumped judgments (``None`` on failure),
            ``compare_summary`` and the collected ``errors``.

        Raises:
            EvaluationBadRequestError: blank id, no messages, invalid fixture,
                or no judge configured.
            EvaluationNotFoundError: conversation or fixture not found.
        """
        cid = (conversation_id or "").strip()
        if not cid:
            raise EvaluationBadRequestError("conversation_id is required")

        catalog = SessionCatalogService(self._db)
        dialogue = await catalog.get_session_dialogue(cid)
        if not dialogue:
            raise EvaluationNotFoundError("conversation not found")

        replay_transcript = format_session_messages_with_turn_labels(
            list(dialogue.messages)
        )
        if not replay_transcript.strip():
            raise EvaluationBadRequestError("no messages to judge")

        fn = (fixture_filename or "").strip() or None
        baseline_transcript = ""
        if fn:
            try:
                turns, _ = read_user_export_fixture(fn)
                baseline_transcript = format_export_turns_with_labels(turns)
            except ValueError as e:
                raise EvaluationBadRequestError(str(e)) from e
            except FileNotFoundError as e:
                raise EvaluationNotFoundError("fixture not found") from e

        errors: list[str] = []
        judge, resolved_model = _make_eval_judge(judge_provider, judge_model)
        if not judge:
            raise EvaluationBadRequestError(_JUDGE_CONFIG_HINT)

        # Must exist even when no baseline is judged: it is passed to the
        # compare summary below unconditionally.
        bj = None
        baseline_judge_dict: dict[str, Any] | None = None
        if baseline_transcript.strip():
            baseline_result = await judge.judge_conversation_result(
                full_transcript=baseline_transcript
            )
            bj = baseline_result.output
            if bj:
                baseline_judge_dict = bj.model_dump()
            else:
                errors.append(
                    f"baseline_glm5_failed: {baseline_result.error or 'unknown error'}"
                )
        elif fn:
            # A fixture was named but produced no text.
            errors.append("baseline_transcript_empty")

        replay_result = await judge.judge_conversation_result(
            full_transcript=replay_transcript
        )
        rj = replay_result.output
        replay_judge_dict = rj.model_dump() if rj else None
        if not rj:
            errors.append(
                f"replay_glm5_failed: {replay_result.error or 'unknown error'}"
            )

        _cmp_total, _cmp_per_side = eval_judge_compare_bundle_caps(judge._ctx_tokens)
        bundle: dict[str, Any] = {
            "version": 1,
            "judged_at": datetime.now(timezone.utc).isoformat(),
            "fixture_filename": fn,
            "baseline_judge": baseline_judge_dict,
            "replay_judge": replay_judge_dict,
            "baseline_turn_judges": {},
            "replay_turn_judges": {},
            "compare_summary": build_conversation_compare_summary(
                baseline_judge=bj,
                replay_judge=rj,
                baseline_transcript=baseline_transcript,
                replay_transcript=replay_transcript,
                conv_cap=eval_judge_conversation_transcript_max_chars_for_context(
                    judge._ctx_tokens
                ),
                compare_cap_total=_cmp_total,
                compare_per_side_cap=_cmp_per_side,
                fixture_filename=fn,
            ),
            "compare_markdown": "",
            # Copy so later appends to `errors` do not alter the stored bundle.
            "errors": list(errors),
            "warnings": [],
            "options": {
                "include_turn_judges": False,
                "include_baseline_turn_judges": False,
                "judge_provider": judge_provider,
                "judge_model": resolved_model,
            },
        }
        await self._persist_playground_conversation_judge(cid, bundle)

        return {
            "conversation_id": cid,
            "fixture_filename": fn,
            "baseline_transcript": baseline_transcript,
            "replay_transcript": replay_transcript,
            "baseline_judge": baseline_judge_dict,
            "replay_judge": replay_judge_dict,
            "compare_summary": bundle.get("compare_summary"),
            "errors": errors,
        }

    async def iter_conversation_judge_sse(
        self,
        conversation_id: str,
        fixture_filename: str | None,
        *,
        include_turn_judges: bool = False,
        include_baseline_turn_judges: bool = False,
        judge_provider: EvalJudgeProvider = "zhipu",
        judge_model: str | None = None,
    ) -> AsyncIterator[dict[str, Any]]:
        """Stream conversation judging as SSE-style event dicts.

        Order: overall baseline score, overall replay score, optional
        per-turn scores, then the streamed comparison text. Validation,
        load, fixture and config problems yield a single ``error`` event and
        stop. Once judging has started, the accumulated bundle is persisted
        when the generator finishes, fails or is closed.
        """
        acc: dict[str, Any] = {
            "version": 1,
            "fixture_filename": None,
            "baseline_judge": None,
            "replay_judge": None,
            "baseline_turn_judges": {},
            "replay_turn_judges": {},
            "compare_summary": None,
            "compare_markdown": "",
            "errors": [],
            "warnings": [],
            "options": {
                "include_turn_judges": include_turn_judges,
                "include_baseline_turn_judges": include_baseline_turn_judges,
                "judge_provider": judge_provider,
                "judge_model": "",
            },
        }
        cid = (conversation_id or "").strip()
        if not cid:
            yield {
                "event": "error",
                "phase": "validate",
                "message": "conversation_id is required",
            }
            return

        catalog = SessionCatalogService(self._db)
        dialogue = await catalog.get_session_dialogue(cid)
        if not dialogue:
            yield {
                "event": "error",
                "phase": "load",
                "message": "conversation not found",
            }
            return

        replay_transcript = format_session_messages_with_turn_labels(
            list(dialogue.messages)
        )
        if not replay_transcript.strip():
            yield {"event": "error", "phase": "load", "message": "no messages to judge"}
            return

        fn = (fixture_filename or "").strip() or None
        baseline_transcript = ""
        export_turns: list[tuple[str, str]] | None = None
        if fn:
            try:
                turns, _ = read_user_export_fixture(fn)
                export_turns = list(turns)
                baseline_transcript = format_export_turns_with_labels(turns)
            except ValueError as e:
                yield {"event": "error", "phase": "fixture", "message": str(e)}
                return
            except FileNotFoundError:
                yield {
                    "event": "error",
                    "phase": "fixture",
                    "message": "fixture not found",
                }
                return

        judge, resolved_model = _make_eval_judge(judge_provider, judge_model)
        if not judge:
            yield {
                "event": "error",
                "phase": "config",
                "message": _JUDGE_CONFIG_HINT,
            }
            return

        acc["options"]["judge_model"] = resolved_model
        acc["fixture_filename"] = fn
        _sse_cmp_total, _sse_cmp_per = eval_judge_compare_bundle_caps(judge._ctx_tokens)
        try:
            yield {
                "event": "meta",
                "conversation_id": cid,
                "fixture_filename": fn,
                "judge_provider": judge_provider,
                "judge_model": resolved_model,
            }

            if not baseline_transcript.strip():
                wmsg = "未提供基准 MD 或基准无文本:仅对回放对话打分并输出单侧改进建议"
                acc["warnings"].append(wmsg)
                yield {"event": "warning", "message": wmsg}

            baseline_judge = None
            if baseline_transcript.strip():
                baseline_result = await judge.judge_conversation_result(
                    full_transcript=baseline_transcript
                )
                baseline_judge = baseline_result.output
                acc["baseline_judge"] = (
                    baseline_judge.model_dump() if baseline_judge else None
                )
                yield {
                    "event": "baseline_judge",
                    "ok": baseline_judge is not None,
                    "judge": acc["baseline_judge"],
                }
                if not baseline_judge:
                    err = (
                        f"基准整体打分失败:{baseline_result.error}"
                        if baseline_result.error
                        else "基准整体打分失败(密钥、限流或 JSON 解析失败,见服务端日志)"
                    )
                    acc["errors"].append(err)
                    # A baseline failure is reported but does not stop the run.
                    yield {
                        "event": "error",
                        "phase": "baseline_glm5",
                        "message": err,
                    }
                elif (
                    include_baseline_turn_judges
                    and export_turns
                    and baseline_judge is not None
                ):
                    yield {"event": "meta", "phase": "baseline_turn_judges_start"}
                    async for row in _iter_turn_judgments_for_turns(
                        judge,
                        export_turns,
                        sse_event="baseline_turn_judge",
                    ):
                        if row.get("event") == "baseline_turn_judge":
                            idx = row.get("turn_index")
                            if isinstance(idx, (int, float)):
                                acc["baseline_turn_judges"][str(int(idx))] = row.get(
                                    "judge"
                                )
                        yield row
            else:
                acc["baseline_judge"] = None
                yield {
                    "event": "baseline_judge",
                    "ok": False,
                    "skipped": True,
                    "judge": None,
                }

            replay_result = await judge.judge_conversation_result(
                full_transcript=replay_transcript
            )
            replay_judge = replay_result.output
            acc["replay_judge"] = replay_judge.model_dump() if replay_judge else None
            acc["compare_summary"] = build_conversation_compare_summary(
                baseline_judge=baseline_judge,
                replay_judge=replay_judge,
                baseline_transcript=baseline_transcript,
                replay_transcript=replay_transcript,
                conv_cap=eval_judge_conversation_transcript_max_chars_for_context(
                    judge._ctx_tokens
                ),
                compare_cap_total=_sse_cmp_total,
                compare_per_side_cap=_sse_cmp_per,
                fixture_filename=fn,
            )
            yield {
                "event": "replay_judge",
                "ok": replay_judge is not None,
                "judge": acc["replay_judge"],
            }
            yield {"event": "compare_summary", "summary": acc["compare_summary"]}
            if not replay_judge:
                err = (
                    f"回放对话整体打分失败:{replay_result.error}"
                    if replay_result.error
                    else "回放对话整体打分失败(限流或 JSON 解析失败,见服务端日志)"
                )
                acc["errors"].append(err)
                yield {
                    "event": "error",
                    "phase": "replay_glm5",
                    "message": err,
                }
                # Without a replay score there is nothing to compare.
                yield {"event": "done"}
                return

            if include_turn_judges:
                replay_pairs = pair_session_messages_to_turns(list(dialogue.messages))
                if replay_pairs:
                    yield {"event": "meta", "phase": "replay_turn_judges_start"}
                    async for row in _iter_turn_judgments_for_turns(
                        judge,
                        replay_pairs,
                        sse_event="replay_turn_judge",
                    ):
                        if row.get("event") == "replay_turn_judge":
                            idx = row.get("turn_index")
                            if isinstance(idx, (int, float)):
                                acc["replay_turn_judges"][str(int(idx))] = row.get(
                                    "judge"
                                )
                        yield row

            async for piece in judge.stream_conversation_compare(
                baseline_transcript=baseline_transcript,
                replay_transcript=replay_transcript,
                baseline_judge=baseline_judge,
                replay_judge=replay_judge,
            ):
                if piece:
                    acc["compare_markdown"] += piece
                    yield {"event": "compare_delta", "text": piece}

            yield {"event": "done"}
        finally:
            # Runs on normal completion, on error and when the consumer closes
            # the generator, so whatever was accumulated so far is stored.
            acc["judged_at"] = datetime.now(timezone.utc).isoformat()
            await self._persist_playground_conversation_judge(cid, acc)

    async def retry_baseline_conversation_judge(
        self,
        conversation_id: str,
        fixture_filename: str | None,
        *,
        include_baseline_turn_judges: bool = False,
        judge_provider: EvalJudgeProvider = "zhipu",
        judge_model: str | None = None,
    ) -> dict[str, Any]:
        """Re-run only the baseline score and rebuild the comparison.

        Retries the overall baseline judgment (and optionally the per-turn
        baseline judgments), then regenerates the compare summary and compare
        text against the replay score already saved for the conversation.
        On baseline failure nothing is persisted and ``ok`` is ``False``.

        Raises:
            EvaluationBadRequestError: blank id, no messages, missing/invalid
                fixture, empty baseline, no saved bundle or replay score, or
                no judge configured.
            EvaluationNotFoundError: conversation or fixture not found.
        """
        cid = (conversation_id or "").strip()
        if not cid:
            raise EvaluationBadRequestError("conversation_id is required")

        catalog = SessionCatalogService(self._db)
        dialogue = await catalog.get_session_dialogue(cid)
        if not dialogue:
            raise EvaluationNotFoundError("conversation not found")

        replay_transcript = format_session_messages_with_turn_labels(
            list(dialogue.messages)
        )
        if not replay_transcript.strip():
            raise EvaluationBadRequestError("no messages to judge")

        fn = (fixture_filename or "").strip() or None
        if not fn:
            raise EvaluationBadRequestError(
                "请选择基线 MD(fixture_filename)后再重试基准分"
            )

        try:
            turns, _ = read_user_export_fixture(fn)
            export_turns = list(turns)
            baseline_transcript = format_export_turns_with_labels(turns)
        except ValueError as e:
            raise EvaluationBadRequestError(str(e)) from e
        except FileNotFoundError:
            raise EvaluationNotFoundError("fixture not found") from None

        if not baseline_transcript.strip():
            raise EvaluationBadRequestError("baseline transcript is empty")

        prev = await catalog.get_playground_conversation_judge_json(cid)
        if not prev or not isinstance(prev, dict):
            raise EvaluationBadRequestError(
                "服务端没有已保存的评分草稿:请先跑一次「自动评分(流式)」"
                "直到回放侧打分完成,再使用本重试。"
            )
        raw_replay = prev.get("replay_judge")
        if not raw_replay or not isinstance(raw_replay, dict):
            raise EvaluationBadRequestError(
                "已保存结果中缺少回放侧整体分:请先完成流式评分中的回放打分阶段再重试基准。"
            )

        try:
            replay_model = ConversationJudgeOutput.model_validate(raw_replay)
        except Exception as e:
            raise EvaluationBadRequestError(
                "已保存的回放评分格式无效,请重新跑一次完整流式评分。"
            ) from e

        judge, resolved_model = _make_eval_judge(judge_provider, judge_model)
        if not judge:
            raise EvaluationBadRequestError(_JUDGE_CONFIG_HINT)
        _rt_cmp_total, _rt_cmp_per = eval_judge_compare_bundle_caps(judge._ctx_tokens)
        baseline_result = await judge.judge_conversation_result(
            full_transcript=baseline_transcript
        )
        if not baseline_result.output:
            err = baseline_result.error or "unknown error"
            msg = f"基准整体打分失败:{err}"
            # Replace earlier baseline failures with the current one.
            errs = _strip_baseline_judge_errors(list(prev.get("errors") or []))
            errs.append(msg)
            return {
                "ok": False,
                "error": err,
                "message": msg,
                "baseline_judge": None,
                "replay_judge": raw_replay,
                "compare_summary": build_conversation_compare_summary(
                    baseline_judge=None,
                    replay_judge=replay_model,
                    baseline_transcript=baseline_transcript,
                    replay_transcript=replay_transcript,
                    conv_cap=eval_judge_conversation_transcript_max_chars_for_context(
                        judge._ctx_tokens
                    ),
                    compare_cap_total=_rt_cmp_total,
                    compare_per_side_cap=_rt_cmp_per,
                    fixture_filename=fn,
                ),
                "compare_markdown": "",
                "baseline_turn_judges": {},
                "errors": errs,
            }

        baseline_judge = baseline_result.output
        # Work on a copy so the loaded bundle is not mutated.
        acc: dict[str, Any] = copy.deepcopy(prev)
        acc.setdefault("version", 1)
        acc["baseline_judge"] = baseline_judge.model_dump()
        acc["fixture_filename"] = fn
        acc["errors"] = _strip_baseline_judge_errors(list(acc.get("errors") or []))
        opts = acc.setdefault("options", {})
        if isinstance(opts, dict):
            opts["judge_provider"] = judge_provider
            opts["judge_model"] = resolved_model

        if include_baseline_turn_judges and export_turns:
            acc["baseline_turn_judges"] = {}
            async for row in _iter_turn_judgments_for_turns(
                judge,
                export_turns,
                sse_event="baseline_turn_judge",
            ):
                idx = row.get("turn_index")
                # Unlike the streaming path, failed turns are not stored here.
                if isinstance(idx, (int, float)) and row.get("judge") is not None:
                    acc["baseline_turn_judges"][str(int(idx))] = row["judge"]

        acc["compare_markdown"] = ""
        acc["compare_summary"] = build_conversation_compare_summary(
            baseline_judge=baseline_judge,
            replay_judge=replay_model,
            baseline_transcript=baseline_transcript,
            replay_transcript=replay_transcript,
            conv_cap=eval_judge_conversation_transcript_max_chars_for_context(
                judge._ctx_tokens
            ),
            compare_cap_total=_rt_cmp_total,
            compare_per_side_cap=_rt_cmp_per,
            fixture_filename=fn,
        )
        async for piece in judge.stream_conversation_compare(
            baseline_transcript=baseline_transcript,
            replay_transcript=replay_transcript,
            baseline_judge=baseline_judge,
            replay_judge=replay_model,
        ):
            if piece:
                acc["compare_markdown"] += piece

        acc["judged_at"] = datetime.now(timezone.utc).isoformat()
        await self._persist_playground_conversation_judge(cid, acc)

        return {
            "ok": True,
            "error": None,
            "message": None,
            "baseline_judge": acc["baseline_judge"],
            "replay_judge": acc.get("replay_judge"),
            "compare_summary": acc.get("compare_summary"),
            "compare_markdown": acc.get("compare_markdown") or "",
            "baseline_turn_judges": acc.get("baseline_turn_judges") or {},
            "errors": acc["errors"],
        }

    async def judge_memoir_for_user(
        self,
        user_id: str,
        baseline_sections: list[MemoirSectionBaselineOut] | None,
        *,
        judge_provider: EvalJudgeProvider = "zhipu",
        judge_model: str | None = None,
    ) -> dict[str, Any]:
        """Judge a user's memoir chapters concurrently and return all rows.

        Only the first ``_MAX_EVAL_CHAPTERS`` chapters with non-empty
        ``canonical_markdown`` are judged. Per-chapter failures are collected
        in ``errors`` instead of aborting the call.

        Returns:
            Dict with ``user_id``, ``judge_provider``, ``judge_model``,
            ``chapter_results``, ``story_results`` (always empty here),
            ``errors`` and ``warnings``.

        Raises:
            EvaluationBadRequestError: blank ``user_id`` or no judge configured.
        """
        uid = (user_id or "").strip()
        if not uid:
            raise EvaluationBadRequestError("user_id is required")

        judge, resolved_model = _make_eval_judge(judge_provider, judge_model)
        if not judge:
            raise EvaluationBadRequestError(_JUDGE_CONFIG_HINT)
        baselines = list(baseline_sections or [])
        trace_svc = EvalTraceService(self._db)

        chapter_results: list[dict[str, Any]] = []
        errors: list[str] = []
        try:
            chapters = await get_chapters_for_memoir_list(
                uid, self._db, active_only=True, is_new_only=None
            )
        except Exception as e:
            logger.exception("manual memoir: chapter list failed user_id={}", uid)
            errors.append(f"加载章节列表失败:{e}")
            chapters = []

        def _nonempty_chapters(cols: list[Any]) -> int:
            return sum(
                1 for x in cols if (getattr(x, "canonical_markdown", None) or "").strip()
            )

        # Concurrency comes from settings, clamped to 1..32.
        conc = max(1, min(32, int(settings.eval_judge_memoir_chapter_concurrency)))
        logger.info(
            "event=eval_memoir_judge_start user_id={} judge_provider={} judge_model={} "
            "chapters_total={} chapters_nonempty={} chapter_concurrency={}",
            uid,
            judge_provider,
            resolved_model or "",
            len(chapters),
            _nonempty_chapters(chapters[:_MAX_EVAL_CHAPTERS]),
            conc,
        )

        # Phase 1 (sequential): build the evidence bundle for each chapter.
        prepared: list[dict[str, Any]] = []
        enum_idx = 0
        for i, ch in enumerate(chapters[:_MAX_EVAL_CHAPTERS]):
            body = (ch.canonical_markdown or "").strip()
            if not body:
                continue
            bl = _baseline_for_chapter_title(baselines, str(ch.title or ""), i)
            baseline_excerpt = ""
            if bl and (bl.body or "").strip():
                baseline_excerpt = _clip_md_for_judge(
                    bl.body,
                    max_chars=max(
                        1000, int(settings.eval_judge_memoir_evidence_max_chars)
                    ),
                )
            md = f"# 章节:{ch.title}\n\n{_clip_md_for_judge(body)}"
            try:
                cb = await trace_svc.build_chapter_bundle(uid, ch)
                formatted, cb2 = await trace_svc.format_chapter_bundle(cb)
                fm = formatted.format_meta
                prepared.append(
                    {
                        "enum_idx": enum_idx,
                        "ch": ch,
                        "bl": bl,
                        "md": md,
                        "baseline_excerpt": baseline_excerpt,
                        "formatted": formatted,
                        "cb2": cb2,
                        "fm": fm,
                    }
                )
                enum_idx += 1
            except Exception as e:
                logger.exception(
                    "manual memoir: chapter prepare failed user_id={} chapter_id={}",
                    uid,
                    ch.id,
                )
                label = str(ch.title or ch.id)
                errors.append(f"章节「{label}」证据打包失败:{e}")

        sem = asyncio.Semaphore(conc)

        async def _judge_one(payload: dict[str, Any]) -> dict[str, Any]:
            """Judge one prepared chapter; never raises for judge-call failures."""
            async with sem:
                ch = payload["ch"]
                formatted = payload["formatted"]
                cb2 = payload["cb2"]
                fm = payload["fm"]
                baseline_excerpt = payload["baseline_excerpt"]
                md = payload["md"]
                bl = payload["bl"]
                ch_label = str(ch.title or ch.id)
                row_errs: list[str] = []
                try:
                    cj_res = await judge.judge_memoir_result(
                        memoir_markdown=md,
                        source_transcript=formatted.source_transcript,
                        structured_evidence=formatted.structured_evidence,
                        reference_memoir_markdown=baseline_excerpt,
                        evidence_notes=self._chapter_evidence_notes(
                            cb2.lineage_tier,
                            formatted.evidence_summary,
                            fm.truncated,
                            fm.dropped_sections,
                        ),
                    )
                except Exception as e:
                    logger.exception(
                        "manual memoir: chapter judge failed user_id={} chapter_id={}",
                        uid,
                        ch.id,
                    )
                    return {
                        "enum_idx": payload["enum_idx"],
                        "order_index": ch.order_index,
                        "row": None,
                        "errors": [
                            *row_errs,
                            f"章节「{ch_label}」评审失败:{e}",
                        ],
                    }
                cj = cj_res.output
                row: dict[str, Any] = {
                    "id": ch.id,
                    "title": ch.title,
                    "order_index": ch.order_index,
                    "baseline_title": bl.title if bl else None,
                    "lineage_tier": cb2.lineage_tier,
                    "evidence_summary": formatted.evidence_summary,
                    "evidence_trace": cb2.model_dump(),
                    "format_meta": fm.model_dump(),
                    "judge": cj.model_dump() if cj else None,
                }
                if cj_res.error:
                    row["judge_error"] = cj_res.error
                    row_errs.append(f"章节「{ch_label}」LLM 评审失败:{cj_res.error}")
                    logger.info(
                        "event=eval_memoir_chapter_judge_failed user_id={} chapter_id={} msg={}",
                        uid,
                        ch.id,
                        cj_res.error,
                    )
                elif not cj:
                    row["judge_error"] = "empty_output"
                    row_errs.append(f"章节「{ch_label}」评审返回空结果")
                    logger.info(
                        "event=eval_memoir_chapter_judge_empty user_id={} chapter_id={}",
                        uid,
                        ch.id,
                    )
                return {
                    "enum_idx": payload["enum_idx"],
                    "order_index": ch.order_index,
                    "row": row,
                    "errors": row_errs,
                }

        # Phase 2 (concurrent, bounded by `sem`): run the judge per chapter.
        judged = await asyncio.gather(*[_judge_one(p) for p in prepared])
        # Chapters without an order_index sort last; enum_idx breaks ties.
        judged.sort(
            key=lambda r: (
                r["order_index"]
                if r["order_index"] is not None
                else 10**9,
                r["enum_idx"],
            )
        )
        for r in judged:
            errors.extend(r["errors"])
            if r["row"] is not None:
                chapter_results.append(r["row"])

        story_results: list[dict[str, Any]] = []

        warnings: list[str] = []
        if not chapter_results and not errors:
            warnings.append(
                "未发现可评分的回忆录章节。请确认该用户存在 active 章节且 "
                "canonical_markdown 非空;需要与导出对照时请加载带章节的 user export 作为基线。"
            )

        logger.info(
            "event=eval_memoir_judge_done user_id={} chapter_rows={} story_rows={} "
            "errors={} warnings={}",
            uid,
            len(chapter_results),
            0,
            len(errors),
            len(warnings),
        )

        return {
            "user_id": uid,
            "judge_provider": judge_provider,
            "judge_model": resolved_model or "",
            "chapter_results": chapter_results,
            "story_results": story_results,
            "errors": errors,
            "warnings": warnings,
        }

    async def iter_memoir_chapter_judge_sse(
        self,
        user_id: str,
        baseline_sections: list[MemoirSectionBaselineOut] | None,
        *,
        judge_provider: EvalJudgeProvider = "zhipu",
        judge_model: str | None = None,
    ) -> AsyncIterator[dict[str, Any]]:
        """Stream memoir chapter judging: one event per chapter, concurrent LLM calls.

        Events: ``meta``, optional ``chapter_error`` / ``warning`` while
        preparing, ``chapters_prepared``, then one ``chapter_judge`` or
        ``chapter_error`` per prepared chapter in completion order, and a
        final ``done``. Validation, config and chapter-list failures yield a
        single ``error`` event and stop.
        """
        uid = (user_id or "").strip()
        if not uid:
            yield {"event": "error", "phase": "validate", "message": "user_id is required"}
            return

        judge, resolved_model = _make_eval_judge(judge_provider, judge_model)
        if not judge:
            yield {"event": "error", "phase": "config", "message": _JUDGE_CONFIG_HINT}
            return

        baselines = list(baseline_sections or [])
        trace_svc = EvalTraceService(self._db)

        try:
            chapters = await get_chapters_for_memoir_list(
                uid, self._db, active_only=True, is_new_only=None
            )
        except Exception as e:
            logger.exception("manual memoir stream: chapter list failed user_id={}", uid)
            yield {"event": "error", "phase": "load", "message": f"加载章节列表失败:{e}"}
            return

        yield {
            "event": "meta",
            "user_id": uid,
            "judge_provider": judge_provider,
            "judge_model": resolved_model or "",
            "total_chapters": len(chapters),
        }

        # Phase 1 (sequential): build the evidence bundle for each chapter.
        prepared: list[dict[str, Any]] = []
        for i, ch in enumerate(chapters[:_MAX_EVAL_CHAPTERS]):
            body = (ch.canonical_markdown or "").strip()
            if not body:
                continue
            bl = _baseline_for_chapter_title(baselines, str(ch.title or ""), i)
            baseline_excerpt = ""
            if bl and (bl.body or "").strip():
                baseline_excerpt = _clip_md_for_judge(
                    bl.body,
                    max_chars=max(
                        1000, int(settings.eval_judge_memoir_evidence_max_chars)
                    ),
                )
            md = f"# 章节:{ch.title}\n\n{_clip_md_for_judge(body)}"
            try:
                cb = await trace_svc.build_chapter_bundle(uid, ch)
                formatted, cb2 = await trace_svc.format_chapter_bundle(cb)
                fm = formatted.format_meta
                prepared.append({
                    "ch": ch, "bl": bl, "md": md,
                    "baseline_excerpt": baseline_excerpt,
                    "formatted": formatted, "cb2": cb2, "fm": fm,
                })
            except Exception as e:
                logger.exception(
                    "manual memoir stream: chapter prepare failed user_id={} chapter_id={}",
                    uid, ch.id,
                )
                yield {
                    "event": "chapter_error",
                    "chapter_id": ch.id,
                    "title": ch.title,
                    "message": f"证据打包失败:{e}",
                }

        if not prepared:
            yield {
                "event": "warning",
                "message": "未发现可评分的回忆录章节(成稿为空或无 active 章节)。",
            }

        yield {"event": "chapters_prepared", "count": len(prepared)}

        # Concurrency comes from settings, clamped to 1..32.
        conc = max(1, min(32, int(settings.eval_judge_memoir_chapter_concurrency)))
        sem = asyncio.Semaphore(conc)
        # Workers push event dicts; each worker pushes exactly one `None`
        # sentinel when it ends, however it ends.
        result_queue: asyncio.Queue[dict[str, Any] | None] = asyncio.Queue()

        async def _judge_one(idx: int, payload: dict[str, Any]) -> None:
            """Judge baseline (if any) and chapter, then enqueue one event."""
            async with sem:
                ch = payload["ch"]
                formatted = payload["formatted"]
                cb2 = payload["cb2"]
                fm = payload["fm"]
                baseline_excerpt = payload["baseline_excerpt"]
                md = payload["md"]
                bl = payload["bl"]
                ev_notes = self._chapter_evidence_notes(
                    cb2.lineage_tier,
                    formatted.evidence_summary,
                    fm.truncated,
                    fm.dropped_sections,
                )

                baseline_judge_obj = None
                baseline_judge_dict = None
                baseline_error: str | None = None
                if baseline_excerpt:
                    try:
                        bl_md = f"# 章节:{bl.title if bl else ch.title}\n\n{baseline_excerpt}"
                        bl_res = await judge.judge_memoir_result(
                            memoir_markdown=bl_md,
                            source_transcript=formatted.source_transcript,
                            structured_evidence=formatted.structured_evidence,
                            evidence_notes=ev_notes,
                        )
                        baseline_judge_obj = bl_res.output
                        baseline_judge_dict = (
                            baseline_judge_obj.model_dump()
                            if baseline_judge_obj
                            else None
                        )
                        if bl_res.error:
                            baseline_error = bl_res.error
                    except Exception as exc:
                        # Baseline failure is recorded on the row; the chapter
                        # itself is still judged.
                        logger.warning(
                            "memoir stream: baseline judge failed ch={} err={}",
                            ch.id, exc,
                        )
                        baseline_error = str(exc)

                try:
                    cj_res = await judge.judge_memoir_result(
                        memoir_markdown=md,
                        source_transcript=formatted.source_transcript,
                        structured_evidence=formatted.structured_evidence,
                        reference_memoir_markdown=baseline_excerpt,
                        evidence_notes=ev_notes,
                    )
                except Exception as e:
                    logger.exception(
                        "manual memoir stream: chapter judge failed user_id={} chapter_id={}",
                        uid, ch.id,
                    )
                    await result_queue.put({
                        "event": "chapter_error",
                        "chapter_id": ch.id,
                        "title": ch.title,
                        "message": f"评审失败:{e}",
                    })
                    return
                cj = cj_res.output
                compare_summary = build_memoir_compare_summary(
                    baseline_judge=baseline_judge_obj,
                    chapter_judge=cj,
                )
                row: dict[str, Any] = {
                    "id": ch.id,
                    "title": ch.title,
                    "order_index": ch.order_index,
                    "baseline_title": bl.title if bl else None,
                    "lineage_tier": cb2.lineage_tier,
                    "evidence_summary": formatted.evidence_summary,
                    "evidence_trace": cb2.model_dump(),
                    "format_meta": fm.model_dump(),
                    "baseline_judge": baseline_judge_dict,
                    "judge": cj.model_dump() if cj else None,
                    "compare_summary": compare_summary,
                }
                if baseline_error:
                    row["baseline_judge_error"] = baseline_error
                if cj_res.error:
                    row["judge_error"] = cj_res.error
                if not cj and not cj_res.error:
                    row["judge_error"] = "empty_output"
                await result_queue.put({
                    "event": "chapter_judge",
                    "index": idx,
                    "chapter": row,
                    "ok": cj is not None,
                })

        async def _run_one(idx: int, payload: dict[str, Any]) -> None:
            """Run one worker and always signal its end to the consumer."""
            ch = payload["ch"]
            try:
                await _judge_one(idx, payload)
            except Exception as e:
                # Failure outside the worker's own handlers (e.g. while
                # building the row): report it instead of losing the chapter.
                logger.exception(
                    "manual memoir stream: chapter judge failed user_id={} chapter_id={}",
                    uid, ch.id,
                )
                result_queue.put_nowait({
                    "event": "chapter_error",
                    "chapter_id": ch.id,
                    "title": ch.title,
                    "message": f"评审失败:{e}",
                })
            finally:
                # The queue is unbounded, so put_nowait cannot fail; this also
                # runs when the task is cancelled.
                result_queue.put_nowait(None)

        tasks = [asyncio.create_task(_run_one(i, p)) for i, p in enumerate(prepared)]
        try:
            # One sentinel per task: the loop ends exactly when every worker
            # has ended, and FIFO order guarantees all results were seen.
            remaining = len(tasks)
            while remaining:
                item = await result_queue.get()
                if item is None:
                    remaining -= 1
                    continue
                yield item
        finally:
            # If the consumer stops early (or an error escapes), do not leave
            # judge calls running in the background.
            for t in tasks:
                if not t.done():
                    t.cancel()
            if tasks:
                await asyncio.gather(*tasks, return_exceptions=True)

        yield {"event": "done"}

    async def memoir_snapshot(self, user_id: str) -> dict[str, Any]:
        """Return the user's active chapters and stories as plain dicts.

        Lists are capped at ``_MAX_EVAL_CHAPTERS`` / ``_MAX_EVAL_STORIES``.
        A failure while loading either list is logged as a warning and leaves
        that list with whatever was collected before the failure.

        Raises:
            EvaluationBadRequestError: blank ``user_id``.
        """
        uid = (user_id or "").strip()
        if not uid:
            raise EvaluationBadRequestError("user_id is required")

        chapters_out: list[dict[str, Any]] = []
        stories_out: list[dict[str, Any]] = []
        try:
            chapters = await get_chapters_for_memoir_list(
                uid, self._db, active_only=True, is_new_only=None
            )
            for ch in chapters[:_MAX_EVAL_CHAPTERS]:
                chapters_out.append(
                    {
                        "id": ch.id,
                        "title": ch.title,
                        "category": ch.category,
                        "order_index": ch.order_index,
                        "canonical_markdown": ch.canonical_markdown,
                    }
                )
        except Exception as e:
            logger.warning("memoir snapshot chapters failed: {}", e)
        try:
            stories = await get_stories_for_user(self._db, uid, status="active")
            for st in stories[:_MAX_EVAL_STORIES]:
                stories_out.append(
                    {
                        "id": st.id,
                        "title": st.title,
                        "stage": st.stage,
                        "canonical_markdown": st.canonical_markdown,
                    }
                )
        except Exception as e:
            logger.warning("memoir snapshot stories failed: {}", e)

        return {
            "user_id": uid,
            "chapters": chapters_out,
            "stories": stories_out,
        }
|