Files
life-echo/api/app/features/evaluation/judge_manual_service.py
2026-04-06 23:19:20 +08:00

373 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Manually trigger GLM judge reviews (does not write eval_runs)."""
from __future__ import annotations
import re
from collections.abc import AsyncIterator
from typing import Any
from sqlalchemy.ext.asyncio import AsyncSession
from app.core.dependencies import get_eval_judge_langchain_llm
from app.core.logging import get_logger
from app.features.evaluation.errors import (
EvaluationBadRequestError,
EvaluationNotFoundError,
)
from app.features.evaluation.execution_service import _assistant_text_for_eval_display
from app.features.evaluation.judge_service import EvalJudgeService
from app.features.evaluation.schemas import MemoirSectionBaselineOut
from app.features.evaluation.session_catalog_service import SessionCatalogService
from app.features.evaluation.user_export_fixtures import read_user_export_fixture
from app.features.memoir.repo import get_chapters_for_memoir_list
from app.features.story.repo import get_stories_for_user
# Module-level logger obtained from the project's logging helper.
logger = get_logger(__name__)
# Default character budget of _clip_md_for_judge; longer text is cut and
# marked as truncated before being handed to the judge.
_MAX_JUDGE_MARKDOWN_CHARS = 20_000
# At most this many chapters are sliced off the chapter list per request.
_MAX_EVAL_CHAPTERS = 30
# At most this many stories are sliced off the story list per request.
_MAX_EVAL_STORIES = 40
def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) -> str:
    """Return *text* without surrounding whitespace, capped at *max_chars*.

    Text that fits the budget is returned stripped but otherwise unchanged.
    Longer text is cut to its first *max_chars* characters and a truncation
    notice is appended after a blank line.
    """
    stripped = text.strip() if text else ""
    if len(stripped) > max_chars:
        head = stripped[:max_chars]
        return f"{head}\n\n…(已截断供评审)"
    return stripped
def _transcript_from_export_turns(turns: list[tuple[str, str]]) -> str:
parts: list[str] = []
for u, ai in turns:
u = (u or "").strip()
ai = (ai or "").strip()
if u:
parts.append(f"用户: {u}")
if ai:
parts.append(f"AI: {_assistant_text_for_eval_display(ai)}")
return "\n\n".join(parts)
def _normalize_title_key(title: str) -> str:
t = (title or "").strip().lower()
t = re.sub(r"^#+\s*", "", t)
return re.sub(r"\s+", " ", t)
def _baseline_for_chapter_title(
baselines: list[MemoirSectionBaselineOut],
chapter_title: str,
index: int,
) -> MemoirSectionBaselineOut | None:
if baselines:
key = _normalize_title_key(chapter_title)
for b in baselines:
if _normalize_title_key(b.title) == key:
return b
if 0 <= index < len(baselines):
return baselines[index]
return None
class EvalJudgeManualService:
    """Run judge evaluations on demand for conversations and memoir content.

    Every public method returns (or yields) its results to the caller. The
    replay-transcript rendering and the fixture loading are shared by the
    blocking and the SSE conversation entry points through two private
    helpers, so both paths always build identical transcripts.
    """

    def __init__(self, db: AsyncSession) -> None:
        """Keep the async DB session handed to catalog and repository lookups."""
        self._db = db

    @staticmethod
    def _replay_transcript_from_messages(messages: Any) -> str:
        """Render dialogue messages as a ``label: text`` transcript.

        A message whose role is ``human`` (case-insensitive) is labelled
        ``用户`` and emitted verbatim; any other role is labelled ``AI`` and its
        content goes through ``_assistant_text_for_eval_display``. Messages are
        joined with a blank line.
        """
        lines: list[str] = []
        for message in messages:
            is_human = (message.role or "").lower() == "human"
            content = message.content or ""
            if is_human:
                lines.append(f"用户: {content}")
            else:
                lines.append(f"AI: {_assistant_text_for_eval_display(content)}")
        return "\n\n".join(lines)

    @staticmethod
    def _baseline_transcript_from_fixture(fixture_filename: str | None) -> str:
        """Load the baseline transcript of an export fixture.

        Returns ``""`` when no fixture name is given. ``ValueError`` and
        ``FileNotFoundError`` raised while reading or rendering the fixture are
        deliberately left to the caller: one entry point converts them to
        exceptions, the other to SSE error events.
        """
        if not fixture_filename:
            return ""
        turns, _ = read_user_export_fixture(fixture_filename)
        return _transcript_from_export_turns(turns)

    async def judge_conversation(
        self,
        conversation_id: str,
        fixture_filename: str | None,
    ) -> dict[str, Any]:
        """Judge a stored conversation and, optionally, a baseline fixture.

        Args:
            conversation_id: Id of the conversation to load; surrounding
                whitespace is ignored.
            fixture_filename: Optional export fixture whose turns form the
                baseline transcript; blank values mean "no baseline".

        Returns:
            A dict with ``conversation_id``, ``fixture_filename``,
            ``baseline_transcript``, ``replay_transcript``, ``baseline_judge``,
            ``replay_judge`` (judge dumps or ``None``) and ``errors``, a list
            that may contain ``baseline_glm_failed``,
            ``baseline_transcript_empty`` and ``replay_glm_failed``.

        Raises:
            EvaluationBadRequestError: blank id, no messages to judge, or the
                fixture reader raised ``ValueError``.
            EvaluationNotFoundError: conversation or fixture not found.
        """
        cid = (conversation_id or "").strip()
        if not cid:
            raise EvaluationBadRequestError("conversation_id is required")

        catalog = SessionCatalogService(self._db)
        dialogue = await catalog.get_session_dialogue(cid)
        if not dialogue:
            raise EvaluationNotFoundError("conversation not found")

        replay_transcript = self._replay_transcript_from_messages(dialogue.messages)
        if not replay_transcript.strip():
            raise EvaluationBadRequestError("no messages to judge")

        fn = (fixture_filename or "").strip() or None
        try:
            baseline_transcript = self._baseline_transcript_from_fixture(fn)
        except ValueError as e:
            raise EvaluationBadRequestError(str(e)) from e
        except FileNotFoundError as e:
            raise EvaluationNotFoundError("fixture not found") from e

        errors: list[str] = []
        judge_llm = get_eval_judge_langchain_llm()
        judge = EvalJudgeService(judge_llm)

        baseline_judge_dict: dict[str, Any] | None = None
        if baseline_transcript.strip():
            bj = await judge.judge_conversation(full_transcript=baseline_transcript)
            if bj:
                baseline_judge_dict = bj.model_dump()
            else:
                errors.append("baseline_glm_failed")
        elif fn:
            # A fixture was requested but produced no text to judge.
            errors.append("baseline_transcript_empty")

        rj = await judge.judge_conversation(full_transcript=replay_transcript)
        replay_judge_dict = rj.model_dump() if rj else None
        if not rj:
            errors.append("replay_glm_failed")

        return {
            "conversation_id": cid,
            "fixture_filename": fn,
            "baseline_transcript": baseline_transcript,
            "replay_transcript": replay_transcript,
            "baseline_judge": baseline_judge_dict,
            "replay_judge": replay_judge_dict,
            "errors": errors,
        }

    async def iter_conversation_judge_sse(
        self,
        conversation_id: str,
        fixture_filename: str | None,
    ) -> AsyncIterator[dict[str, Any]]:
        """Yield SSE event dicts for a conversation judge run.

        Order of events: the overall baseline score first, then the overall
        replay score, then the streamed comparison and suggestions.

        Validation, load, fixture and configuration problems are reported as a
        single ``error`` event (with a ``phase``) after which the generator
        stops. Otherwise it yields ``meta``, an optional ``warning`` when there
        is no baseline text, ``baseline_judge``, ``replay_judge``, zero or more
        ``compare_delta`` events and a final ``done``. A failed baseline score
        emits an ``error`` event but does not stop the run; a failed replay
        score emits ``error`` followed by ``done`` and stops.

        Args:
            conversation_id: Id of the conversation to load; surrounding
                whitespace is ignored.
            fixture_filename: Optional export fixture used as the baseline.
        """
        cid = (conversation_id or "").strip()
        if not cid:
            yield {
                "event": "error",
                "phase": "validate",
                "message": "conversation_id is required",
            }
            return

        catalog = SessionCatalogService(self._db)
        dialogue = await catalog.get_session_dialogue(cid)
        if not dialogue:
            yield {
                "event": "error",
                "phase": "load",
                "message": "conversation not found",
            }
            return

        replay_transcript = self._replay_transcript_from_messages(dialogue.messages)
        if not replay_transcript.strip():
            yield {"event": "error", "phase": "load", "message": "no messages to judge"}
            return

        fn = (fixture_filename or "").strip() or None
        try:
            baseline_transcript = self._baseline_transcript_from_fixture(fn)
        except ValueError as e:
            yield {"event": "error", "phase": "fixture", "message": str(e)}
            return
        except FileNotFoundError:
            yield {
                "event": "error",
                "phase": "fixture",
                "message": "fixture not found",
            }
            return

        judge_llm = get_eval_judge_langchain_llm()
        if not judge_llm:
            yield {
                "event": "error",
                "phase": "config",
                "message": "评审 LLM 未配置eval_judge_api_key / zhipu_api_key",
            }
            return
        judge = EvalJudgeService(judge_llm)

        yield {"event": "meta", "conversation_id": cid, "fixture_filename": fn}

        has_baseline = bool(baseline_transcript.strip())
        if not has_baseline:
            yield {
                "event": "warning",
                "message": "未提供基准 MD 或基准无文本:仅对回放对话打分并输出单侧改进建议",
            }

        baseline_judge = None
        if has_baseline:
            baseline_judge = await judge.judge_conversation(
                full_transcript=baseline_transcript
            )
            yield {
                "event": "baseline_judge",
                "ok": baseline_judge is not None,
                "judge": baseline_judge.model_dump() if baseline_judge else None,
            }
            if not baseline_judge:
                # Reported, but the replay side is still scored afterwards.
                yield {
                    "event": "error",
                    "phase": "baseline_glm",
                    "message": "基准整体打分失败(密钥、限流或 JSON 解析失败,见服务端日志)",
                }
        else:
            yield {
                "event": "baseline_judge",
                "ok": False,
                "skipped": True,
                "judge": None,
            }

        replay_judge = await judge.judge_conversation(full_transcript=replay_transcript)
        yield {
            "event": "replay_judge",
            "ok": replay_judge is not None,
            "judge": replay_judge.model_dump() if replay_judge else None,
        }
        if not replay_judge:
            # Without a replay score there is nothing to compare; end the stream.
            yield {
                "event": "error",
                "phase": "replay_glm",
                "message": "回放对话整体 GLM 打分失败(空密钥、限流或 JSON 解析失败,见服务端日志)",
            }
            yield {"event": "done"}
            return

        async for piece in judge.stream_conversation_compare(
            baseline_transcript=baseline_transcript,
            replay_transcript=replay_transcript,
            baseline_judge=baseline_judge,
            replay_judge=replay_judge,
        ):
            if piece:
                yield {"event": "compare_delta", "text": piece}
        yield {"event": "done"}

    async def judge_memoir_for_user(
        self,
        user_id: str,
        baseline_sections: list[MemoirSectionBaselineOut] | None,
    ) -> dict[str, Any]:
        """Judge a user's active memoir chapters and stories one by one.

        Chapters and stories with blank ``canonical_markdown`` are skipped. For
        a chapter, a matching baseline section (by title, else by position) is
        prepended as an excerpt when it has text. Each group is best-effort:
        an exception while loading or judging is logged as a warning and the
        results collected so far for that group are kept.

        Args:
            user_id: Owner of the memoir; surrounding whitespace is ignored.
            baseline_sections: Optional baseline sections to compare chapters
                against.

        Returns:
            A dict with ``user_id``, ``chapter_results`` and ``story_results``;
            each result carries a ``judge`` dump or ``None``.

        Raises:
            EvaluationBadRequestError: *user_id* is blank.
        """
        uid = (user_id or "").strip()
        if not uid:
            raise EvaluationBadRequestError("user_id is required")

        judge_llm = get_eval_judge_langchain_llm()
        judge = EvalJudgeService(judge_llm)
        baselines = list(baseline_sections or [])

        chapter_results: list[dict[str, Any]] = []
        try:
            chapters = await get_chapters_for_memoir_list(
                uid, self._db, active_only=True, is_new_only=None
            )
            for i, ch in enumerate(chapters[:_MAX_EVAL_CHAPTERS]):
                body = (ch.canonical_markdown or "").strip()
                if not body:
                    continue
                bl = _baseline_for_chapter_title(baselines, ch.title or "", i)
                baseline_excerpt = ""
                if bl and (bl.body or "").strip():
                    baseline_excerpt = _clip_md_for_judge(bl.body, max_chars=6000)
                md = f"# 章节:{ch.title}\n\n"
                if baseline_excerpt:
                    md += f"## 导出基线(节选)\n\n{baseline_excerpt}\n\n"
                md += f"## 当前成稿\n\n{_clip_md_for_judge(body)}"
                cj = await judge.judge_memoir(memoir_markdown=md)
                chapter_results.append(
                    {
                        "id": ch.id,
                        "title": ch.title,
                        "order_index": ch.order_index,
                        "baseline_title": bl.title if bl else None,
                        "judge": cj.model_dump() if cj else None,
                    }
                )
        except Exception as e:
            # Best-effort: keep whatever chapters were judged before the failure.
            logger.warning("manual memoir chapter judges failed: {}", e)

        story_results: list[dict[str, Any]] = []
        try:
            stories = await get_stories_for_user(self._db, uid, status="active")
            for st in stories[:_MAX_EVAL_STORIES]:
                body = (st.canonical_markdown or "").strip()
                if not body:
                    continue
                md = f"# 故事:{st.title}\n\n{_clip_md_for_judge(body)}"
                sj = await judge.judge_memoir(memoir_markdown=md)
                story_results.append(
                    {
                        "id": st.id,
                        "title": st.title,
                        "stage": st.stage,
                        "judge": sj.model_dump() if sj else None,
                    }
                )
        except Exception as e:
            # Best-effort: keep whatever stories were judged before the failure.
            logger.warning("manual memoir story judges failed: {}", e)

        return {
            "user_id": uid,
            "chapter_results": chapter_results,
            "story_results": story_results,
        }

    async def memoir_snapshot(self, user_id: str) -> dict[str, Any]:
        """Return the user's active chapters and stories without judging them.

        Each group is loaded best-effort: an exception is logged as a warning
        and the entries gathered so far for that group are returned.

        Args:
            user_id: Owner of the memoir; surrounding whitespace is ignored.

        Returns:
            A dict with ``user_id``, ``chapters`` (id, title, category,
            order_index, canonical_markdown) and ``stories`` (id, title, stage,
            canonical_markdown).

        Raises:
            EvaluationBadRequestError: *user_id* is blank.
        """
        uid = (user_id or "").strip()
        if not uid:
            raise EvaluationBadRequestError("user_id is required")

        chapters_out: list[dict[str, Any]] = []
        stories_out: list[dict[str, Any]] = []

        try:
            chapters = await get_chapters_for_memoir_list(
                uid, self._db, active_only=True, is_new_only=None
            )
            # Append one by one so a mid-list failure keeps earlier entries.
            for ch in chapters[:_MAX_EVAL_CHAPTERS]:
                chapters_out.append(
                    {
                        "id": ch.id,
                        "title": ch.title,
                        "category": ch.category,
                        "order_index": ch.order_index,
                        "canonical_markdown": ch.canonical_markdown,
                    }
                )
        except Exception as e:
            logger.warning("memoir snapshot chapters failed: {}", e)

        try:
            stories = await get_stories_for_user(self._db, uid, status="active")
            for st in stories[:_MAX_EVAL_STORIES]:
                stories_out.append(
                    {
                        "id": st.id,
                        "title": st.title,
                        "stage": st.stage,
                        "canonical_markdown": st.canonical_markdown,
                    }
                )
        except Exception as e:
            logger.warning("memoir snapshot stories failed: {}", e)

        return {
            "user_id": uid,
            "chapters": chapters_out,
            "stories": stories_out,
        }