Files
life-echo/api/app/features/evaluation/judge_manual_service.py
yangshilin 17b9fa3466 fix:
1. 修复登录界面文字被遮挡问题
2. 大字模式关闭后显示异常问题
3. 重新调整大字模式是否开启时的字体显示效果
2026-04-10 20:35:57 +08:00

1173 lines
45 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""手动触发评测台评审(智谱 / DeepSeek不写 eval_runsPlayground 对话评分写入 conversations 表)。"""
from __future__ import annotations
import asyncio
import copy
import re
from collections.abc import AsyncIterator
from datetime import datetime, timezone
from typing import Any
from sqlalchemy.ext.asyncio import AsyncSession
from app.core.config import settings
from app.core.dependencies import (
EvalJudgeProvider,
build_eval_judge_llm_spec,
)
from app.core.logging import get_logger
from app.features.conversation import repo as conversation_repo
from app.features.evaluation.conversation_compare_summary import (
build_conversation_compare_summary,
)
from app.features.evaluation.errors import (
EvaluationBadRequestError,
EvaluationNotFoundError,
)
from app.features.evaluation.eval_trace_service import EvalTraceService
from app.features.evaluation.judge_schemas import ConversationJudgeOutput
from app.features.evaluation.judge_service import (
EvalJudgeService,
eval_judge_compare_bundle_caps,
eval_judge_conversation_transcript_max_chars_for_context,
)
from app.features.evaluation.memoir_compare_summary import (
build_memoir_compare_summary,
)
from app.features.evaluation.schemas import MemoirSectionBaselineOut
from app.features.evaluation.session_catalog_service import SessionCatalogService
from app.features.evaluation.transcript_for_judge import (
assistant_text_for_eval_display,
format_eval_turn_block,
format_export_turns_with_labels,
format_session_messages_with_turn_labels,
pair_session_messages_to_turns,
)
from app.features.evaluation.user_export_fixtures import read_user_export_fixture
from app.features.memoir.repo import get_chapters_for_memoir_list
from app.features.story.repo import get_stories_for_user
# Module-level logger obtained from the project's logging helper.
logger = get_logger(__name__)
# Slice bound applied to a user's chapter list before judging / snapshotting.
_MAX_EVAL_CHAPTERS = 30
_MAX_EVAL_STORIES = 40  # memoir_snapshot and similar paths are still capped
# Character budget for the prior-turn transcript handed to per-turn judging.
_PRIOR_TRANSCRIPT_MAX_CHARS = 8000
# Message surfaced to callers when no judge LLM could be built for the provider.
# NOTE(review): the two adjacent literals are concatenated with no separator
# between them — confirm the resulting message reads as intended.
_JUDGE_CONFIG_HINT = (
    "评审未配置:智谱需 eval_judge_api_key 或 zhipu_api_key"
    "DeepSeek 需 deepseek_api_key或 llm_api_key"
)
def _make_eval_judge(
    judge_provider: EvalJudgeProvider,
    judge_model: str | None,
) -> tuple[EvalJudgeService | None, str]:
    """Build the judge service for a provider/model pair.

    Returns:
        ``(service, resolved_model)`` when an LLM spec with a usable ``llm``
        could be built, otherwise ``(None, "")``.
    """
    llm_spec = build_eval_judge_llm_spec(judge_provider, judge_model)
    if llm_spec and llm_spec.llm:
        service = EvalJudgeService(
            llm_spec.llm,
            context_window_tokens=llm_spec.context_window_tokens,
        )
        return service, llm_spec.resolved_model
    return None, ""
def _strip_baseline_judge_errors(errs: list[Any]) -> list[str]:
out: list[str] = []
for e in errs:
s = str(e) if e is not None else ""
if not s.strip():
continue
if (
"基准整体打分失败" in s
or s.startswith("baseline_glm5:")
or "baseline_glm5_failed:" in s
):
continue
out.append(s)
return out
async def _iter_turn_judgments_for_turns(
    judge: EvalJudgeService,
    turns: list[tuple[str, str]],
    *,
    sse_event: str,
) -> AsyncIterator[dict[str, Any]]:
    """Judge each (user, assistant) turn in order and yield one event per turn.

    The transcript of earlier turns is passed as context, limited to its last
    ``_PRIOR_TRANSCRIPT_MAX_CHARS`` characters. Each yielded dict carries the
    given ``sse_event`` name, the zero-based turn index, an ``ok`` flag and the
    dumped judge output (``None`` when the judge returned nothing).
    """
    history: list[str] = []
    for turn_no, pair in enumerate(turns):
        raw_user, raw_assistant = pair
        user_text = (raw_user or "").strip()
        assistant_text = assistant_text_for_eval_display(str(raw_assistant))
        # Slicing from the end keeps only the most recent context; a shorter
        # transcript is returned unchanged by the same slice.
        context = "\n\n".join(history)[-_PRIOR_TRANSCRIPT_MAX_CHARS:]
        verdict = await judge.judge_turn(
            prior_transcript=context,
            user_utterance=user_text,
            assistant_reply=assistant_text,
            turn_index_0=turn_no,
        )
        if verdict:
            dumped = verdict.model_dump()
        else:
            dumped = None
        yield {
            "event": sse_event,
            "turn_index": turn_no,
            "ok": verdict is not None,
            "judge": dumped,
        }
        history.append(format_eval_turn_block(turn_no, user_text, assistant_text))
def _clip_md_for_judge(text: str, max_chars: int | None = None) -> str:
cap = (
max_chars
if max_chars is not None
else max(1000, int(settings.eval_judge_memoir_body_max_chars))
)
s = (text or "").strip()
if len(s) <= cap:
return s
return f"{s[:cap]}\n\n…(已截断供评审)"
async def _conversation_transcript_for_manual(
    db: AsyncSession, conversation_id: str
) -> str:
    """Load a conversation's messages and format them with turn labels."""
    return format_session_messages_with_turn_labels(
        await conversation_repo.get_conversation_messages(conversation_id, db)
    )
def _normalize_title_key(title: str) -> str:
t = (title or "").strip().lower()
t = re.sub(r"^#+\s*", "", t)
return re.sub(r"\s+", " ", t)
def _baseline_for_chapter_title(
    baselines: list[MemoirSectionBaselineOut],
    chapter_title: str,
    index: int,
) -> MemoirSectionBaselineOut | None:
    """Pick the baseline section that corresponds to a chapter.

    The first baseline whose normalized title equals the chapter's normalized
    title wins. Failing that, the baseline at position ``index`` is used when
    that position exists. Returns ``None`` when neither lookup succeeds.
    """
    if not baselines:
        return None
    wanted = _normalize_title_key(chapter_title)
    for candidate in baselines:
        if _normalize_title_key(candidate.title) == wanted:
            return candidate
    # No title match: fall back to the section at the same position, if any.
    return baselines[index] if 0 <= index < len(baselines) else None
class EvalJudgeManualService:
def __init__(self, db: AsyncSession) -> None:
    """Keep the async database session that the service methods read and write through."""
    self._db = db
async def _persist_playground_conversation_judge(
    self, conversation_id: str, bundle: dict[str, Any]
) -> None:
    """Best-effort save of a judge bundle for a conversation.

    The bundle is handed to the conversation repo; the session is committed
    only when the repo returns a row. Any failure is logged and swallowed so
    that a persistence problem never breaks the judging response.

    Args:
        conversation_id: Conversation whose judge JSON is being stored.
        bundle: The judge result bundle to store.
    """
    try:
        row = await conversation_repo.set_playground_conversation_judge_json(
            conversation_id, self._db, bundle
        )
        if row is not None:
            await self._db.commit()
    except Exception:
        logger.exception(
            "persist playground_conversation_judge_json failed conversation_id={}",
            conversation_id,
        )
        # A failed write/commit leaves the session's transaction unusable until
        # it is rolled back; reset it so later use of the same session works.
        try:
            await self._db.rollback()
        except Exception:
            logger.exception(
                "rollback after failed playground judge persist failed conversation_id={}",
                conversation_id,
            )
async def judge_conversation(
    self,
    conversation_id: str,
    fixture_filename: str | None,
    *,
    judge_provider: EvalJudgeProvider = "zhipu",
    judge_model: str | None = None,
) -> dict[str, Any]:
    """Judge a replayed conversation, optionally against an export baseline.

    The conversation transcript is always judged. When ``fixture_filename``
    names a readable export fixture with text, that baseline transcript is
    judged as well. The combined bundle is persisted best-effort and a summary
    dict is returned.

    Args:
        conversation_id: Conversation to judge (surrounding whitespace ignored).
        fixture_filename: Optional export fixture used as the baseline.
        judge_provider: Which judge provider to build.
        judge_model: Optional model override for the provider.

    Returns:
        Dict with the transcripts, both judge results (or ``None``), the
        compare summary and the list of error strings collected on the way.

    Raises:
        EvaluationBadRequestError: Missing conversation id, nothing to judge,
            invalid fixture, or no judge could be configured.
        EvaluationNotFoundError: Conversation or fixture not found.
    """
    cid = (conversation_id or "").strip()
    if not cid:
        raise EvaluationBadRequestError("conversation_id is required")
    catalog = SessionCatalogService(self._db)
    dialogue = await catalog.get_session_dialogue(cid)
    if not dialogue:
        raise EvaluationNotFoundError("conversation not found")
    replay_transcript = format_session_messages_with_turn_labels(
        list(dialogue.messages)
    )
    if not replay_transcript.strip():
        raise EvaluationBadRequestError("no messages to judge")

    fn = (fixture_filename or "").strip() or None
    baseline_transcript = ""
    if fn:
        try:
            turns, _ = read_user_export_fixture(fn)
            baseline_transcript = format_export_turns_with_labels(turns)
        except ValueError as e:
            raise EvaluationBadRequestError(str(e)) from e
        except FileNotFoundError as e:
            raise EvaluationNotFoundError("fixture not found") from e

    errors: list[str] = []
    judge, resolved_model = _make_eval_judge(judge_provider, judge_model)
    if not judge:
        raise EvaluationBadRequestError(_JUDGE_CONFIG_HINT)

    # Bind the baseline output up front: it is read when building the compare
    # summary even when no baseline transcript was judged.
    bj = None
    baseline_judge_dict: dict[str, Any] | None = None
    if baseline_transcript.strip():
        baseline_result = await judge.judge_conversation_result(
            full_transcript=baseline_transcript
        )
        bj = baseline_result.output
        if bj:
            baseline_judge_dict = bj.model_dump()
        else:
            errors.append(
                f"baseline_glm5_failed: {baseline_result.error or 'unknown error'}"
            )
    elif fn:
        # A fixture was named but produced no text to judge.
        errors.append("baseline_transcript_empty")

    replay_result = await judge.judge_conversation_result(
        full_transcript=replay_transcript
    )
    rj = replay_result.output
    replay_judge_dict = rj.model_dump() if rj else None
    if not rj:
        errors.append(
            f"replay_glm5_failed: {replay_result.error or 'unknown error'}"
        )

    _cmp_total, _cmp_per_side = eval_judge_compare_bundle_caps(judge._ctx_tokens)
    bundle: dict[str, Any] = {
        "version": 1,
        "judged_at": datetime.now(timezone.utc).isoformat(),
        "fixture_filename": fn,
        "baseline_judge": baseline_judge_dict,
        "replay_judge": replay_judge_dict,
        "baseline_turn_judges": {},
        "replay_turn_judges": {},
        "compare_summary": build_conversation_compare_summary(
            baseline_judge=bj,
            replay_judge=rj,
            baseline_transcript=baseline_transcript,
            replay_transcript=replay_transcript,
            conv_cap=eval_judge_conversation_transcript_max_chars_for_context(
                judge._ctx_tokens
            ),
            compare_cap_total=_cmp_total,
            compare_per_side_cap=_cmp_per_side,
            fixture_filename=fn,
        ),
        "compare_markdown": "",
        # Snapshot of the errors at persist time (the returned list is the live one).
        "errors": list(errors),
        "warnings": [],
        "options": {
            "include_turn_judges": False,
            "include_baseline_turn_judges": False,
            "judge_provider": judge_provider,
            "judge_model": resolved_model,
        },
    }
    await self._persist_playground_conversation_judge(cid, bundle)
    return {
        "conversation_id": cid,
        "fixture_filename": fn,
        "baseline_transcript": baseline_transcript,
        "replay_transcript": replay_transcript,
        "baseline_judge": baseline_judge_dict,
        "replay_judge": replay_judge_dict,
        "compare_summary": bundle.get("compare_summary"),
        "errors": errors,
    }
async def iter_conversation_judge_sse(
    self,
    conversation_id: str,
    fixture_filename: str | None,
    *,
    include_turn_judges: bool = False,
    include_baseline_turn_judges: bool = False,
    judge_provider: EvalJudgeProvider = "zhipu",
    judge_model: str | None = None,
) -> AsyncIterator[dict[str, Any]]:
    """Yield SSE-style event dicts while judging a conversation step by step.

    Order of work: overall baseline score (when a baseline transcript exists),
    optional baseline per-turn scores, overall replay score plus compare
    summary, optional replay per-turn scores, then the streamed comparison
    text. Validation / load / fixture / config failures yield a single
    ``error`` event and stop before anything is persisted. Once the judge is
    built, whatever has been accumulated is persisted when the generator
    finishes or is closed.
    """
    # Accumulator that mirrors the events sent to the client; this is the
    # object handed to the persistence helper in the ``finally`` below.
    acc: dict[str, Any] = {
        "version": 1,
        "fixture_filename": None,
        "baseline_judge": None,
        "replay_judge": None,
        "baseline_turn_judges": {},
        "replay_turn_judges": {},
        "compare_summary": None,
        "compare_markdown": "",
        "errors": [],
        "warnings": [],
        "options": {
            "include_turn_judges": include_turn_judges,
            "include_baseline_turn_judges": include_baseline_turn_judges,
            "judge_provider": judge_provider,
            "judge_model": "",
        },
    }
    cid = (conversation_id or "").strip()
    if not cid:
        yield {
            "event": "error",
            "phase": "validate",
            "message": "conversation_id is required",
        }
        return
    catalog = SessionCatalogService(self._db)
    dialogue = await catalog.get_session_dialogue(cid)
    if not dialogue:
        yield {
            "event": "error",
            "phase": "load",
            "message": "conversation not found",
        }
        return
    replay_transcript = format_session_messages_with_turn_labels(
        list(dialogue.messages)
    )
    if not replay_transcript.strip():
        yield {"event": "error", "phase": "load", "message": "no messages to judge"}
        return
    # A blank fixture name is treated the same as no fixture.
    fn = (fixture_filename or "").strip() or None
    baseline_transcript = ""
    export_turns: list[tuple[str, str]] | None = None
    if fn:
        try:
            turns, _ = read_user_export_fixture(fn)
            export_turns = list(turns)
            baseline_transcript = format_export_turns_with_labels(turns)
        except ValueError as e:
            yield {"event": "error", "phase": "fixture", "message": str(e)}
            return
        except FileNotFoundError:
            yield {
                "event": "error",
                "phase": "fixture",
                "message": "fixture not found",
            }
            return
    judge, resolved_model = _make_eval_judge(judge_provider, judge_model)
    if not judge:
        yield {
            "event": "error",
            "phase": "config",
            "message": _JUDGE_CONFIG_HINT,
        }
        return
    acc["options"]["judge_model"] = resolved_model
    acc["fixture_filename"] = fn
    _sse_cmp_total, _sse_cmp_per = eval_judge_compare_bundle_caps(judge._ctx_tokens)
    # Never reassigned in this method, so the ``finally`` below always persists.
    persist = True
    try:
        yield {
            "event": "meta",
            "conversation_id": cid,
            "fixture_filename": fn,
            "judge_provider": judge_provider,
            "judge_model": resolved_model,
        }
        if not baseline_transcript.strip():
            # No baseline text: record and emit a warning, then judge the replay only.
            wmsg = "未提供基准 MD 或基准无文本:仅对回放对话打分并输出单侧改进建议"
            acc["warnings"].append(wmsg)
            yield {"event": "warning", "message": wmsg}
        baseline_judge = None
        if baseline_transcript.strip():
            baseline_result = await judge.judge_conversation_result(
                full_transcript=baseline_transcript
            )
            baseline_judge = baseline_result.output
            acc["baseline_judge"] = (
                baseline_judge.model_dump() if baseline_judge else None
            )
            yield {
                "event": "baseline_judge",
                "ok": baseline_judge is not None,
                "judge": acc["baseline_judge"],
            }
            if not baseline_judge:
                # Baseline failure is reported but does not stop the stream.
                err = (
                    f"基准整体打分失败:{baseline_result.error}"
                    if baseline_result.error
                    else "基准整体打分失败(密钥、限流或 JSON 解析失败,见服务端日志)"
                )
                acc["errors"].append(err)
                yield {
                    "event": "error",
                    "phase": "baseline_glm5",
                    "message": err,
                }
            elif (
                include_baseline_turn_judges
                and export_turns
                and baseline_judge is not None
            ):
                yield {"event": "meta", "phase": "baseline_turn_judges_start"}
                async for row in _iter_turn_judgments_for_turns(
                    judge,
                    export_turns,
                    sse_event="baseline_turn_judge",
                ):
                    if row.get("event") == "baseline_turn_judge":
                        idx = row.get("turn_index")
                        if isinstance(idx, (int, float)):
                            # Keyed by the turn index as a string.
                            acc["baseline_turn_judges"][str(int(idx))] = row.get(
                                "judge"
                            )
                    yield row
        else:
            # No baseline transcript: emit a skipped baseline event.
            acc["baseline_judge"] = None
            yield {
                "event": "baseline_judge",
                "ok": False,
                "skipped": True,
                "judge": None,
            }
        replay_result = await judge.judge_conversation_result(
            full_transcript=replay_transcript
        )
        replay_judge = replay_result.output
        acc["replay_judge"] = replay_judge.model_dump() if replay_judge else None
        # The summary is built even when the replay judge returned nothing.
        acc["compare_summary"] = build_conversation_compare_summary(
            baseline_judge=baseline_judge,
            replay_judge=replay_judge,
            baseline_transcript=baseline_transcript,
            replay_transcript=replay_transcript,
            conv_cap=eval_judge_conversation_transcript_max_chars_for_context(
                judge._ctx_tokens
            ),
            compare_cap_total=_sse_cmp_total,
            compare_per_side_cap=_sse_cmp_per,
            fixture_filename=fn,
        )
        yield {
            "event": "replay_judge",
            "ok": replay_judge is not None,
            "judge": acc["replay_judge"],
        }
        yield {"event": "compare_summary", "summary": acc["compare_summary"]}
        if not replay_judge:
            # Without a replay score the stream ends here (after error + done).
            err = (
                f"回放对话整体打分失败:{replay_result.error}"
                if replay_result.error
                else "回放对话整体打分失败(限流或 JSON 解析失败,见服务端日志)"
            )
            acc["errors"].append(err)
            yield {
                "event": "error",
                "phase": "replay_glm5",
                "message": err,
            }
            yield {"event": "done"}
            return
        if include_turn_judges:
            replay_pairs = pair_session_messages_to_turns(list(dialogue.messages))
            if replay_pairs:
                yield {"event": "meta", "phase": "replay_turn_judges_start"}
                async for row in _iter_turn_judgments_for_turns(
                    judge,
                    replay_pairs,
                    sse_event="replay_turn_judge",
                ):
                    if row.get("event") == "replay_turn_judge":
                        idx = row.get("turn_index")
                        if isinstance(idx, (int, float)):
                            acc["replay_turn_judges"][str(int(idx))] = row.get(
                                "judge"
                            )
                    yield row
        # Stream the comparison text, accumulating it for persistence.
        async for piece in judge.stream_conversation_compare(
            baseline_transcript=baseline_transcript,
            replay_transcript=replay_transcript,
            baseline_judge=baseline_judge,
            replay_judge=replay_judge,
        ):
            if piece:
                acc["compare_markdown"] += piece
                yield {"event": "compare_delta", "text": piece}
        yield {"event": "done"}
    finally:
        # Runs on normal completion, early return, error, or generator close.
        if persist:
            acc["judged_at"] = datetime.now(timezone.utc).isoformat()
            await self._persist_playground_conversation_judge(cid, acc)
async def retry_baseline_conversation_judge(
    self,
    conversation_id: str,
    fixture_filename: str | None,
    *,
    include_baseline_turn_judges: bool = False,
    judge_provider: EvalJudgeProvider = "zhipu",
    judge_model: str | None = None,
) -> dict[str, Any]:
    """Re-run only the baseline side and rebuild the comparison.

    The overall baseline score (and optionally the baseline per-turn scores)
    is recomputed from the export fixture; the replay score is taken from the
    previously saved judge JSON. On success the merged bundle is persisted and
    the comparison text is regenerated. On a baseline failure a result with
    ``ok=False`` is returned and nothing is persisted.

    Raises:
        EvaluationBadRequestError: Missing id / fixture, nothing to judge,
            no usable saved replay score, or no judge could be configured.
        EvaluationNotFoundError: Conversation or fixture not found.
    """
    cid = (conversation_id or "").strip()
    if not cid:
        raise EvaluationBadRequestError("conversation_id is required")

    catalog = SessionCatalogService(self._db)
    dialogue = await catalog.get_session_dialogue(cid)
    if not dialogue:
        raise EvaluationNotFoundError("conversation not found")
    replay_text = format_session_messages_with_turn_labels(list(dialogue.messages))
    if not replay_text.strip():
        raise EvaluationBadRequestError("no messages to judge")

    fixture = (fixture_filename or "").strip() or None
    if not fixture:
        raise EvaluationBadRequestError(
            "请选择基线 MDfixture_filename后再重试基准分"
        )
    try:
        fixture_turns, _ = read_user_export_fixture(fixture)
        baseline_turns = list(fixture_turns)
        baseline_text = format_export_turns_with_labels(fixture_turns)
    except ValueError as exc:
        raise EvaluationBadRequestError(str(exc)) from exc
    except FileNotFoundError:
        raise EvaluationNotFoundError("fixture not found") from None
    if not baseline_text.strip():
        raise EvaluationBadRequestError("baseline transcript is empty")

    # The retry builds on a previously saved bundle that already has a replay score.
    saved = await catalog.get_playground_conversation_judge_json(cid)
    if not (saved and isinstance(saved, dict)):
        raise EvaluationBadRequestError(
            "服务端没有已保存的评分草稿:请先跑一次「自动评分(流式)」"
            "直到回放侧打分完成,再使用本重试。"
        )
    saved_replay = saved.get("replay_judge")
    if not (saved_replay and isinstance(saved_replay, dict)):
        raise EvaluationBadRequestError(
            "已保存结果中缺少回放侧整体分:请先完成流式评分中的回放打分阶段再重试基准。"
        )
    try:
        replay_output = ConversationJudgeOutput.model_validate(saved_replay)
    except Exception as exc:
        raise EvaluationBadRequestError(
            "已保存的回放评分格式无效,请重新跑一次完整流式评分。"
        ) from exc

    judge, resolved_model = _make_eval_judge(judge_provider, judge_model)
    if not judge:
        raise EvaluationBadRequestError(_JUDGE_CONFIG_HINT)
    cap_total, cap_per_side = eval_judge_compare_bundle_caps(judge._ctx_tokens)

    def _summary_with(baseline_output: Any) -> Any:
        # Same summary call for the failure and success paths; only the
        # baseline judge output differs.
        return build_conversation_compare_summary(
            baseline_judge=baseline_output,
            replay_judge=replay_output,
            baseline_transcript=baseline_text,
            replay_transcript=replay_text,
            conv_cap=eval_judge_conversation_transcript_max_chars_for_context(
                judge._ctx_tokens
            ),
            compare_cap_total=cap_total,
            compare_per_side_cap=cap_per_side,
            fixture_filename=fixture,
        )

    outcome = await judge.judge_conversation_result(full_transcript=baseline_text)
    baseline_output = outcome.output
    if not baseline_output:
        reason = outcome.error or "unknown error"
        failure_msg = f"基准整体打分失败:{reason}"
        remaining = _strip_baseline_judge_errors(list(saved.get("errors") or []))
        remaining.append(failure_msg)
        return {
            "ok": False,
            "error": reason,
            "message": failure_msg,
            "baseline_judge": None,
            "replay_judge": saved_replay,
            "compare_summary": _summary_with(None),
            "compare_markdown": "",
            "baseline_turn_judges": {},
            "errors": remaining,
        }

    # Work on a copy so the loaded bundle itself is never mutated.
    bundle: dict[str, Any] = copy.deepcopy(saved)
    bundle.setdefault("version", 1)
    bundle["baseline_judge"] = baseline_output.model_dump()
    bundle["fixture_filename"] = fixture
    bundle["errors"] = _strip_baseline_judge_errors(
        list(bundle.get("errors") or [])
    )
    options = bundle.setdefault("options", {})
    if isinstance(options, dict):
        options.update(judge_provider=judge_provider, judge_model=resolved_model)

    if include_baseline_turn_judges and baseline_turns:
        turn_scores: dict[str, Any] = {}
        bundle["baseline_turn_judges"] = turn_scores
        async for event in _iter_turn_judgments_for_turns(
            judge,
            baseline_turns,
            sse_event="baseline_turn_judge",
        ):
            turn_no = event.get("turn_index")
            verdict = event.get("judge")
            # Only turns that produced a judge output are recorded.
            if isinstance(turn_no, (int, float)) and verdict is not None:
                turn_scores[str(int(turn_no))] = verdict

    bundle["compare_markdown"] = ""
    bundle["compare_summary"] = _summary_with(baseline_output)
    chunks: list[str] = []
    async for piece in judge.stream_conversation_compare(
        baseline_transcript=baseline_text,
        replay_transcript=replay_text,
        baseline_judge=baseline_output,
        replay_judge=replay_output,
    ):
        if piece:
            chunks.append(piece)
    bundle["compare_markdown"] = "".join(chunks)

    bundle["judged_at"] = datetime.now(timezone.utc).isoformat()
    await self._persist_playground_conversation_judge(cid, bundle)
    return {
        "ok": True,
        "error": None,
        "message": None,
        "baseline_judge": bundle["baseline_judge"],
        "replay_judge": bundle.get("replay_judge"),
        "compare_summary": bundle.get("compare_summary"),
        "compare_markdown": bundle.get("compare_markdown") or "",
        "baseline_turn_judges": bundle.get("baseline_turn_judges") or {},
        "errors": bundle["errors"],
    }
async def judge_memoir_for_user(
    self,
    user_id: str,
    baseline_sections: list[MemoirSectionBaselineOut] | None,
    *,
    judge_provider: EvalJudgeProvider = "zhipu",
    judge_model: str | None = None,
) -> dict[str, Any]:
    """Judge a user's non-empty active memoir chapters and return all rows.

    Evidence bundles are prepared one chapter at a time; the judge calls then
    run concurrently, bounded by a semaphore. Per-chapter failures are
    collected into ``errors`` instead of aborting the whole run.

    Raises:
        EvaluationBadRequestError: ``user_id`` is blank or no judge could be
            configured.
    """
    uid = (user_id or "").strip()
    if not uid:
        raise EvaluationBadRequestError("user_id is required")
    judge, resolved_model = _make_eval_judge(judge_provider, judge_model)
    if not judge:
        raise EvaluationBadRequestError(_JUDGE_CONFIG_HINT)

    baselines = list(baseline_sections or [])
    trace_svc = EvalTraceService(self._db)

    def _chapter_evidence_notes(
        lineage_tier: str,
        evidence_summary: str,
        truncated: bool,
        dropped: list[str],
    ) -> str:
        # At most the first 12 dropped section names are listed.
        dropped_csv = ",".join(dropped[:12]) if dropped else ""
        return (
            "严格按文档打分;真实性、事实覆盖率、可追溯性以本章节绑定的证据闭包为准。"
            f" lineage_tier={lineage_tier}evidence_summary={evidence_summary}"
            f" prompt_truncated={truncated}dropped_sections={dropped_csv or 'none'}"
        )

    errors: list[str] = []
    try:
        chapters = await get_chapters_for_memoir_list(
            uid, self._db, active_only=True, is_new_only=None
        )
    except Exception as exc:
        logger.exception("manual memoir: chapter list failed user_id={}", uid)
        errors.append(f"加载章节列表失败:{exc}")
        chapters = []

    candidates = chapters[:_MAX_EVAL_CHAPTERS]
    conc = max(1, min(32, int(settings.eval_judge_memoir_chapter_concurrency)))
    nonempty_count = sum(
        1
        for chapter in candidates
        if (getattr(chapter, "canonical_markdown", None) or "").strip()
    )
    logger.info(
        "event=eval_memoir_judge_start user_id={} judge_provider={} judge_model={} "
        "chapters_total={} chapters_nonempty={} chapter_concurrency={}",
        uid,
        judge_provider,
        resolved_model or "",
        len(chapters),
        nonempty_count,
        conc,
    )

    prepared: list[dict[str, Any]] = []
    for position, ch in enumerate(candidates):
        body = (ch.canonical_markdown or "").strip()
        if not body:
            continue
        bl = _baseline_for_chapter_title(baselines, str(ch.title or ""), position)
        baseline_excerpt = ""
        if bl and (bl.body or "").strip():
            baseline_excerpt = _clip_md_for_judge(
                bl.body,
                max_chars=max(
                    1000, int(settings.eval_judge_memoir_evidence_max_chars)
                ),
            )
        md = f"# 章节:{ch.title}\n\n{_clip_md_for_judge(body)}"
        try:
            raw_bundle = await trace_svc.build_chapter_bundle(uid, ch)
            formatted, traced = await trace_svc.format_chapter_bundle(raw_bundle)
            fm = formatted.format_meta
        except Exception as exc:
            logger.exception(
                "manual memoir: chapter prepare failed user_id={} chapter_id={}",
                uid,
                ch.id,
            )
            label = str(ch.title or ch.id)
            errors.append(f"章节「{label}」证据打包失败:{exc}")
        else:
            prepared.append(
                {
                    # Position among successfully prepared chapters; used as
                    # the sort tie-breaker further down.
                    "enum_idx": len(prepared),
                    "ch": ch,
                    "bl": bl,
                    "md": md,
                    "baseline_excerpt": baseline_excerpt,
                    "formatted": formatted,
                    "cb2": traced,
                    "fm": fm,
                }
            )

    sem = asyncio.Semaphore(conc)

    async def _judge_one(payload: dict[str, Any]) -> dict[str, Any]:
        async with sem:
            ch = payload["ch"]
            formatted = payload["formatted"]
            traced = payload["cb2"]
            fm = payload["fm"]
            baseline_excerpt = payload["baseline_excerpt"]
            md = payload["md"]
            bl = payload["bl"]
            ch_label = str(ch.title or ch.id)
            try:
                result = await judge.judge_memoir_result(
                    memoir_markdown=md,
                    source_transcript=formatted.source_transcript,
                    structured_evidence=formatted.structured_evidence,
                    reference_memoir_markdown=baseline_excerpt,
                    evidence_notes=_chapter_evidence_notes(
                        traced.lineage_tier,
                        formatted.evidence_summary,
                        fm.truncated,
                        fm.dropped_sections,
                    ),
                )
            except Exception as exc:
                logger.exception(
                    "manual memoir: chapter judge failed user_id={} chapter_id={}",
                    uid,
                    ch.id,
                )
                return {
                    "enum_idx": payload["enum_idx"],
                    "order_index": ch.order_index,
                    "row": None,
                    "errors": [f"章节「{ch_label}」评审失败:{exc}"],
                }

            verdict = result.output
            row: dict[str, Any] = {
                "id": ch.id,
                "title": ch.title,
                "order_index": ch.order_index,
                "baseline_title": bl.title if bl else None,
                "lineage_tier": traced.lineage_tier,
                "evidence_summary": formatted.evidence_summary,
                "evidence_trace": traced.model_dump(),
                "format_meta": fm.model_dump(),
                "judge": verdict.model_dump() if verdict else None,
            }
            problems: list[str] = []
            if result.error:
                row["judge_error"] = result.error
                problems.append(f"章节「{ch_label}」LLM 评审失败:{result.error}")
                logger.info(
                    "event=eval_memoir_chapter_judge_failed user_id={} chapter_id={} msg={}",
                    uid,
                    ch.id,
                    result.error,
                )
            elif not verdict:
                row["judge_error"] = "empty_output"
                problems.append(f"章节「{ch_label}」评审返回空结果")
                logger.info(
                    "event=eval_memoir_chapter_judge_empty user_id={} chapter_id={}",
                    uid,
                    ch.id,
                )
            return {
                "enum_idx": payload["enum_idx"],
                "order_index": ch.order_index,
                "row": row,
                "errors": problems,
            }

    judged = await asyncio.gather(*(_judge_one(item) for item in prepared))

    def _sort_key(outcome: dict[str, Any]) -> tuple[Any, int]:
        # Chapters without an order_index sort after every ordered chapter.
        order = outcome["order_index"]
        return (10**9 if order is None else order, outcome["enum_idx"])

    chapter_results: list[dict[str, Any]] = []
    for outcome in sorted(judged, key=_sort_key):
        errors.extend(outcome["errors"])
        if outcome["row"] is not None:
            chapter_results.append(outcome["row"])

    story_results: list[dict[str, Any]] = []
    warnings: list[str] = []
    if not (chapter_results or errors):
        warnings.append(
            "未发现可评分的回忆录章节。请确认该用户存在 active 章节且 "
            "canonical_markdown 非空;需要与导出对照时请加载带章节的 user export 作为基线。"
        )
    logger.info(
        "event=eval_memoir_judge_done user_id={} chapter_rows={} story_rows={} "
        "errors={} warnings={}",
        uid,
        len(chapter_results),
        0,
        len(errors),
        len(warnings),
    )
    return {
        "user_id": uid,
        "judge_provider": judge_provider,
        "judge_model": resolved_model or "",
        "chapter_results": chapter_results,
        "story_results": story_results,
        "errors": errors,
        "warnings": warnings,
    }
async def iter_memoir_chapter_judge_sse(
    self,
    user_id: str,
    baseline_sections: list[MemoirSectionBaselineOut] | None,
    *,
    judge_provider: EvalJudgeProvider = "zhipu",
    judge_model: str | None = None,
) -> AsyncIterator[dict[str, Any]]:
    """Stream one event per judged chapter, running the judge calls concurrently.

    Event order: ``meta``, any ``chapter_error`` raised while preparing
    evidence, an optional ``warning`` when nothing can be judged,
    ``chapters_prepared``, then one ``chapter_judge`` or ``chapter_error`` per
    prepared chapter in completion order, and finally ``done``. Validation,
    config and chapter-list failures yield a single ``error`` event and stop.

    Every worker reports exactly one event, including when it fails
    unexpectedly, so the stream cannot stall waiting for a result that never
    arrives. Workers still running when the consumer stops are cancelled.
    """
    uid = (user_id or "").strip()
    if not uid:
        yield {
            "event": "error",
            "phase": "validate",
            "message": "user_id is required",
        }
        return
    judge, resolved_model = _make_eval_judge(judge_provider, judge_model)
    if not judge:
        yield {"event": "error", "phase": "config", "message": _JUDGE_CONFIG_HINT}
        return
    baselines = list(baseline_sections or [])
    trace_svc = EvalTraceService(self._db)

    def _chapter_evidence_notes(
        lineage_tier: str,
        evidence_summary: str,
        truncated: bool,
        dropped: list[str],
    ) -> str:
        # At most the first 12 dropped section names are listed.
        drops = ",".join(dropped[:12]) if dropped else ""
        return (
            "严格按文档打分;真实性、事实覆盖率、可追溯性以本章节绑定的证据闭包为准。"
            f" lineage_tier={lineage_tier}evidence_summary={evidence_summary}"
            f" prompt_truncated={truncated}dropped_sections={drops or 'none'}"
        )

    try:
        chapters = await get_chapters_for_memoir_list(
            uid, self._db, active_only=True, is_new_only=None
        )
    except Exception as e:
        logger.exception(
            "manual memoir stream: chapter list failed user_id={}", uid
        )
        yield {
            "event": "error",
            "phase": "load",
            "message": f"加载章节列表失败:{e}",
        }
        return
    yield {
        "event": "meta",
        "user_id": uid,
        "judge_provider": judge_provider,
        "judge_model": resolved_model or "",
        "total_chapters": len(chapters),
    }

    # Evidence is prepared sequentially; only the judge calls run concurrently.
    prepared: list[dict[str, Any]] = []
    for i, ch in enumerate(chapters[:_MAX_EVAL_CHAPTERS]):
        body = (ch.canonical_markdown or "").strip()
        if not body:
            continue
        bl = _baseline_for_chapter_title(baselines, str(ch.title or ""), i)
        baseline_excerpt = ""
        if bl and (bl.body or "").strip():
            baseline_excerpt = _clip_md_for_judge(
                bl.body,
                max_chars=max(
                    1000, int(settings.eval_judge_memoir_evidence_max_chars)
                ),
            )
        md = f"# 章节:{ch.title}\n\n{_clip_md_for_judge(body)}"
        try:
            cb = await trace_svc.build_chapter_bundle(uid, ch)
            formatted, cb2 = await trace_svc.format_chapter_bundle(cb)
            fm = formatted.format_meta
            prepared.append(
                {
                    "ch": ch,
                    "bl": bl,
                    "md": md,
                    "baseline_excerpt": baseline_excerpt,
                    "formatted": formatted,
                    "cb2": cb2,
                    "fm": fm,
                }
            )
        except Exception as e:
            logger.exception(
                "manual memoir stream: chapter prepare failed user_id={} chapter_id={}",
                uid,
                ch.id,
            )
            yield {
                "event": "chapter_error",
                "chapter_id": ch.id,
                "title": ch.title,
                "message": f"证据打包失败:{e}",
            }
    if not prepared:
        yield {
            "event": "warning",
            "message": "未发现可评分的回忆录章节(成稿为空或无 active 章节)。",
        }
    yield {"event": "chapters_prepared", "count": len(prepared)}

    conc = max(1, min(32, int(settings.eval_judge_memoir_chapter_concurrency)))
    sem = asyncio.Semaphore(conc)
    result_queue: asyncio.Queue[dict[str, Any]] = asyncio.Queue()

    async def _evaluate_chapter(idx: int, payload: dict[str, Any]) -> dict[str, Any]:
        """Judge one chapter (and its baseline excerpt, if any); return its event."""
        ch = payload["ch"]
        formatted = payload["formatted"]
        cb2 = payload["cb2"]
        fm = payload["fm"]
        baseline_excerpt = payload["baseline_excerpt"]
        md = payload["md"]
        bl = payload["bl"]
        ev_notes = _chapter_evidence_notes(
            cb2.lineage_tier,
            formatted.evidence_summary,
            fm.truncated,
            fm.dropped_sections,
        )
        baseline_judge_obj = None
        baseline_judge_dict = None
        baseline_error: str | None = None
        if baseline_excerpt:
            # A baseline failure is recorded on the row; it never fails the chapter.
            try:
                bl_md = f"# 章节:{bl.title if bl else ch.title}\n\n{baseline_excerpt}"
                bl_res = await judge.judge_memoir_result(
                    memoir_markdown=bl_md,
                    source_transcript=formatted.source_transcript,
                    structured_evidence=formatted.structured_evidence,
                    evidence_notes=ev_notes,
                )
                baseline_judge_obj = bl_res.output
                baseline_judge_dict = (
                    baseline_judge_obj.model_dump()
                    if baseline_judge_obj
                    else None
                )
                if bl_res.error:
                    baseline_error = bl_res.error
            except Exception as exc:
                logger.warning(
                    "memoir stream: baseline judge failed ch={} err={}",
                    ch.id,
                    exc,
                )
                baseline_error = str(exc)
        # Errors from here on propagate to _judge_one, which turns them into
        # a chapter_error event.
        cj_res = await judge.judge_memoir_result(
            memoir_markdown=md,
            source_transcript=formatted.source_transcript,
            structured_evidence=formatted.structured_evidence,
            reference_memoir_markdown=baseline_excerpt,
            evidence_notes=ev_notes,
        )
        cj = cj_res.output
        compare_summary = build_memoir_compare_summary(
            baseline_judge=baseline_judge_obj,
            chapter_judge=cj,
        )
        row: dict[str, Any] = {
            "id": ch.id,
            "title": ch.title,
            "order_index": ch.order_index,
            "baseline_title": bl.title if bl else None,
            "lineage_tier": cb2.lineage_tier,
            "evidence_summary": formatted.evidence_summary,
            "evidence_trace": cb2.model_dump(),
            "format_meta": fm.model_dump(),
            "baseline_judge": baseline_judge_dict,
            "judge": cj.model_dump() if cj else None,
            "compare_summary": compare_summary,
        }
        if baseline_error:
            row["baseline_judge_error"] = baseline_error
        if cj_res.error:
            row["judge_error"] = cj_res.error
        if not cj and not cj_res.error:
            row["judge_error"] = "empty_output"
        return {
            "event": "chapter_judge",
            "index": idx,
            "chapter": row,
            "ok": cj is not None,
        }

    async def _judge_one(idx: int, payload: dict[str, Any]) -> None:
        """Run one chapter under the semaphore and enqueue exactly one event."""
        ch = payload["ch"]
        try:
            async with sem:
                event = await _evaluate_chapter(idx, payload)
        except Exception as e:
            logger.exception(
                "manual memoir stream: chapter judge failed user_id={} chapter_id={}",
                uid,
                ch.id,
            )
            event = {
                "event": "chapter_error",
                "chapter_id": ch.id,
                "title": ch.title,
                "message": f"评审失败:{e}",
            }
        await result_queue.put(event)

    tasks = [asyncio.create_task(_judge_one(i, p)) for i, p in enumerate(prepared)]
    try:
        # One event per worker, so counting events is enough to know when
        # every chapter has reported.
        for _ in tasks:
            yield await result_queue.get()
    finally:
        # Reached on normal completion and when the consumer closes the
        # generator early: stop unfinished workers and reap all tasks.
        for task in tasks:
            if not task.done():
                task.cancel()
        if tasks:
            await asyncio.gather(*tasks, return_exceptions=True)
    yield {"event": "done"}
async def memoir_snapshot(self, user_id: str) -> dict[str, Any]:
    """Return the user's active chapters and stories as plain dicts.

    Chapters and stories are loaded independently; a failure in either load is
    logged as a warning and leaves that list with whatever was collected so
    far. Both lists are capped by the module-level limits.

    Raises:
        EvaluationBadRequestError: ``user_id`` is blank.
    """
    uid = (user_id or "").strip()
    if not uid:
        raise EvaluationBadRequestError("user_id is required")

    chapter_rows: list[dict[str, Any]] = []
    story_rows: list[dict[str, Any]] = []

    try:
        loaded_chapters = await get_chapters_for_memoir_list(
            uid, self._db, active_only=True, is_new_only=None
        )
        for chapter in loaded_chapters[:_MAX_EVAL_CHAPTERS]:
            entry = {
                "id": chapter.id,
                "title": chapter.title,
                "category": chapter.category,
                "order_index": chapter.order_index,
                "canonical_markdown": chapter.canonical_markdown,
            }
            chapter_rows.append(entry)
    except Exception as exc:
        logger.warning("memoir snapshot chapters failed: {}", exc)

    try:
        loaded_stories = await get_stories_for_user(self._db, uid, status="active")
        for story in loaded_stories[:_MAX_EVAL_STORIES]:
            entry = {
                "id": story.id,
                "title": story.title,
                "stage": story.stage,
                "canonical_markdown": story.canonical_markdown,
            }
            story_rows.append(entry)
    except Exception as exc:
        logger.warning("memoir snapshot stories failed: {}", exc)

    return {
        "user_id": uid,
        "chapters": chapter_rows,
        "stories": story_rows,
    }