"""手动触发 GLM-5 评审(不写 eval_runs)。""" from __future__ import annotations import re from collections.abc import AsyncIterator from typing import Any from sqlalchemy.ext.asyncio import AsyncSession from app.core.dependencies import get_eval_judge_langchain_llm from app.core.logging import get_logger from app.features.conversation import repo as conversation_repo from app.features.evaluation.errors import ( EvaluationBadRequestError, EvaluationNotFoundError, ) from app.features.evaluation.judge_service import EvalJudgeService from app.features.evaluation.transcript_for_judge import ( assistant_text_for_eval_display, format_eval_turn_block, format_export_turns_with_labels, format_session_messages_with_turn_labels, pair_session_messages_to_turns, ) from app.features.evaluation.schemas import MemoirSectionBaselineOut from app.features.evaluation.session_catalog_service import SessionCatalogService from app.features.evaluation.user_export_fixtures import read_user_export_fixture from app.features.memoir.repo import get_chapters_for_memoir_list from app.features.story.repo import get_stories_for_user logger = get_logger(__name__) _MAX_JUDGE_MARKDOWN_CHARS = 20_000 _MAX_EVAL_CHAPTERS = 30 _MAX_EVAL_STORIES = 40 _MAX_EVIDENCE_CONVERSATIONS = 8 _MAX_EVIDENCE_TRANSCRIPT_CHARS = 16_000 _PRIOR_TRANSCRIPT_MAX_CHARS = 8000 async def _iter_turn_judgments_for_turns( judge: EvalJudgeService, turns: list[tuple[str, str]], *, sse_event: str, ) -> AsyncIterator[dict[str, Any]]: """与 `execute_eval_run` 相同的逐轮 prior 截断与块累积。""" prior_blocks: list[str] = [] for idx, (u_raw, ai_raw) in enumerate(turns): u = (u_raw or "").strip() reply = assistant_text_for_eval_display(str(ai_raw)) prior = "\n\n".join(prior_blocks) if len(prior) > _PRIOR_TRANSCRIPT_MAX_CHARS: prior = prior[-_PRIOR_TRANSCRIPT_MAX_CHARS:] tj = await judge.judge_turn( prior_transcript=prior, user_utterance=u, assistant_reply=reply, turn_index_0=idx, ) yield { "event": sse_event, "turn_index": idx, "ok": tj is not None, "judge": tj.model_dump() if tj else None, } prior_blocks.append(format_eval_turn_block(idx, u, reply)) def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) -> str: s = (text or "").strip() if len(s) <= max_chars: return s return f"{s[:max_chars]}\n\n…(已截断供评审)" def _trim_evidence_text(text: str, max_chars: int = _MAX_EVIDENCE_TRANSCRIPT_CHARS) -> str: s = (text or "").strip() if len(s) <= max_chars: return s return f"{s[:max_chars]}\n\n…(访谈证据已截断)" async def _conversation_transcript_for_manual( db: AsyncSession, conversation_id: str ) -> str: rows = await conversation_repo.get_conversation_messages(conversation_id, db) return format_session_messages_with_turn_labels(rows) async def _user_transcript_evidence(db: AsyncSession, user_id: str) -> str: conversations = await conversation_repo.get_user_conversations(user_id, db) if not conversations: return "" parts: list[str] = [] for conv in reversed(conversations[:_MAX_EVIDENCE_CONVERSATIONS]): transcript = await _conversation_transcript_for_manual(db, str(conv.id)) if transcript: parts.append(f"## 会话 {str(conv.id)}\n{transcript}") return _trim_evidence_text("\n\n".join(parts)) def _normalize_title_key(title: str) -> str: t = (title or "").strip().lower() t = re.sub(r"^#+\s*", "", t) return re.sub(r"\s+", " ", t) def _baseline_for_chapter_title( baselines: list[MemoirSectionBaselineOut], chapter_title: str, index: int, ) -> MemoirSectionBaselineOut | None: if baselines: key = _normalize_title_key(chapter_title) for b in baselines: if _normalize_title_key(b.title) == key: return b if 0 <= index < len(baselines): return baselines[index] return None class EvalJudgeManualService: def __init__(self, db: AsyncSession) -> None: self._db = db async def judge_conversation( self, conversation_id: str, fixture_filename: str | None, ) -> dict[str, Any]: cid = (conversation_id or "").strip() if not cid: raise EvaluationBadRequestError("conversation_id is required") catalog = SessionCatalogService(self._db) dialogue = await catalog.get_session_dialogue(cid) if not dialogue: raise EvaluationNotFoundError("conversation not found") replay_transcript = format_session_messages_with_turn_labels( list(dialogue.messages) ) if not replay_transcript.strip(): raise EvaluationBadRequestError("no messages to judge") fn = (fixture_filename or "").strip() or None baseline_transcript = "" if fn: try: turns, _ = read_user_export_fixture(fn) baseline_transcript = format_export_turns_with_labels(turns) except ValueError as e: raise EvaluationBadRequestError(str(e)) from e except FileNotFoundError as e: raise EvaluationNotFoundError("fixture not found") from e errors: list[str] = [] judge_llm = get_eval_judge_langchain_llm() judge = EvalJudgeService(judge_llm) baseline_judge_dict: dict[str, Any] | None = None if baseline_transcript.strip(): baseline_result = await judge.judge_conversation_result( full_transcript=baseline_transcript ) bj = baseline_result.output if bj: baseline_judge_dict = bj.model_dump() else: errors.append( f"baseline_glm5_failed: {baseline_result.error or 'unknown error'}" ) elif fn: errors.append("baseline_transcript_empty") replay_result = await judge.judge_conversation_result( full_transcript=replay_transcript ) rj = replay_result.output replay_judge_dict = rj.model_dump() if rj else None if not rj: errors.append( f"replay_glm5_failed: {replay_result.error or 'unknown error'}" ) return { "conversation_id": cid, "fixture_filename": fn, "baseline_transcript": baseline_transcript, "replay_transcript": replay_transcript, "baseline_judge": baseline_judge_dict, "replay_judge": replay_judge_dict, "errors": errors, } async def iter_conversation_judge_sse( self, conversation_id: str, fixture_filename: str | None, *, include_turn_judges: bool = False, include_baseline_turn_judges: bool = False, ) -> AsyncIterator[dict[str, Any]]: """供 SSE:先整体基准分、再整体回放分,可选逐轮分,再流式对比与建议。""" cid = (conversation_id or "").strip() if not cid: yield { "event": "error", "phase": "validate", "message": "conversation_id is required", } return catalog = SessionCatalogService(self._db) dialogue = await catalog.get_session_dialogue(cid) if not dialogue: yield { "event": "error", "phase": "load", "message": "conversation not found", } return replay_transcript = format_session_messages_with_turn_labels( list(dialogue.messages) ) if not replay_transcript.strip(): yield {"event": "error", "phase": "load", "message": "no messages to judge"} return fn = (fixture_filename or "").strip() or None baseline_transcript = "" export_turns: list[tuple[str, str]] | None = None if fn: try: turns, _ = read_user_export_fixture(fn) export_turns = list(turns) baseline_transcript = format_export_turns_with_labels(turns) except ValueError as e: yield {"event": "error", "phase": "fixture", "message": str(e)} return except FileNotFoundError: yield { "event": "error", "phase": "fixture", "message": "fixture not found", } return judge_llm = get_eval_judge_langchain_llm() if not judge_llm: yield { "event": "error", "phase": "config", "message": "评审 LLM 未配置(eval_judge_api_key / zhipu_api_key)", } return judge = EvalJudgeService(judge_llm) yield {"event": "meta", "conversation_id": cid, "fixture_filename": fn} if not baseline_transcript.strip(): yield { "event": "warning", "message": "未提供基准 MD 或基准无文本:仅对回放对话打分并输出单侧改进建议", } baseline_judge = None if baseline_transcript.strip(): baseline_result = await judge.judge_conversation_result( full_transcript=baseline_transcript ) baseline_judge = baseline_result.output yield { "event": "baseline_judge", "ok": baseline_judge is not None, "judge": baseline_judge.model_dump() if baseline_judge else None, } if not baseline_judge: yield { "event": "error", "phase": "baseline_glm5", "message": ( f"基准整体打分失败:{baseline_result.error}" if baseline_result.error else "基准整体打分失败(密钥、限流或 JSON 解析失败,见服务端日志)" ), } elif ( include_baseline_turn_judges and export_turns and baseline_judge is not None ): yield {"event": "meta", "phase": "baseline_turn_judges_start"} async for row in _iter_turn_judgments_for_turns( judge, export_turns, sse_event="baseline_turn_judge", ): yield row else: yield { "event": "baseline_judge", "ok": False, "skipped": True, "judge": None, } replay_result = await judge.judge_conversation_result( full_transcript=replay_transcript ) replay_judge = replay_result.output yield { "event": "replay_judge", "ok": replay_judge is not None, "judge": replay_judge.model_dump() if replay_judge else None, } if not replay_judge: yield { "event": "error", "phase": "replay_glm5", "message": ( f"回放对话整体 GLM-5 打分失败:{replay_result.error}" if replay_result.error else "回放对话整体 GLM-5 打分失败(空密钥、限流或 JSON 解析失败,见服务端日志)" ), } yield {"event": "done"} return if include_turn_judges: replay_pairs = pair_session_messages_to_turns(list(dialogue.messages)) if replay_pairs: yield {"event": "meta", "phase": "replay_turn_judges_start"} async for row in _iter_turn_judgments_for_turns( judge, replay_pairs, sse_event="replay_turn_judge", ): yield row async for piece in judge.stream_conversation_compare( baseline_transcript=baseline_transcript, replay_transcript=replay_transcript, baseline_judge=baseline_judge, replay_judge=replay_judge, ): if piece: yield {"event": "compare_delta", "text": piece} yield {"event": "done"} async def judge_memoir_for_user( self, user_id: str, baseline_sections: list[MemoirSectionBaselineOut] | None, ) -> dict[str, Any]: uid = (user_id or "").strip() if not uid: raise EvaluationBadRequestError("user_id is required") judge_llm = get_eval_judge_langchain_llm() judge = EvalJudgeService(judge_llm) baselines = list(baseline_sections or []) evidence_transcript = await _user_transcript_evidence(self._db, uid) chapter_results: list[dict[str, Any]] = [] try: chapters = await get_chapters_for_memoir_list( uid, self._db, active_only=True, is_new_only=None ) for i, ch in enumerate(chapters[:_MAX_EVAL_CHAPTERS]): body = (ch.canonical_markdown or "").strip() if not body: continue bl = _baseline_for_chapter_title(baselines, str(ch.title or ""), i) baseline_excerpt = "" if bl and (bl.body or "").strip(): baseline_excerpt = _clip_md_for_judge(bl.body, max_chars=6000) md = f"# 章节:{ch.title}\n\n" if baseline_excerpt: md += f"## 导出基线(节选)\n\n{baseline_excerpt}\n\n" md += f"## 当前成稿\n\n{_clip_md_for_judge(body)}" cj = await judge.judge_memoir( memoir_markdown=md, source_transcript=evidence_transcript, reference_memoir_markdown=baseline_excerpt, evidence_notes=( "严格按文档打分;真实性、事实覆盖率、可追溯性必须优先对照该用户历史访谈证据。" ), ) chapter_results.append( { "id": ch.id, "title": ch.title, "order_index": ch.order_index, "baseline_title": bl.title if bl else None, "judge": cj.model_dump() if cj else None, } ) except Exception as e: logger.warning("manual memoir chapter judges failed: {}", e) story_results: list[dict[str, Any]] = [] try: stories = await get_stories_for_user(self._db, uid, status="active") for st in stories[:_MAX_EVAL_STORIES]: body = (st.canonical_markdown or "").strip() if not body: continue md = f"# 故事:{st.title}\n\n{_clip_md_for_judge(body)}" sj = await judge.judge_memoir( memoir_markdown=md, source_transcript=evidence_transcript, evidence_notes=( "严格按文档打分;真实性、事实覆盖率、可追溯性必须优先对照该用户历史访谈证据。" ), ) story_results.append( { "id": st.id, "title": st.title, "stage": st.stage, "judge": sj.model_dump() if sj else None, } ) except Exception as e: logger.warning("manual memoir story judges failed: {}", e) return { "user_id": uid, "chapter_results": chapter_results, "story_results": story_results, } async def memoir_snapshot(self, user_id: str) -> dict[str, Any]: uid = (user_id or "").strip() if not uid: raise EvaluationBadRequestError("user_id is required") chapters_out: list[dict[str, Any]] = [] stories_out: list[dict[str, Any]] = [] try: chapters = await get_chapters_for_memoir_list( uid, self._db, active_only=True, is_new_only=None ) for ch in chapters[:_MAX_EVAL_CHAPTERS]: chapters_out.append( { "id": ch.id, "title": ch.title, "category": ch.category, "order_index": ch.order_index, "canonical_markdown": ch.canonical_markdown, } ) except Exception as e: logger.warning("memoir snapshot chapters failed: {}", e) try: stories = await get_stories_for_user(self._db, uid, status="active") for st in stories[:_MAX_EVAL_STORIES]: stories_out.append( { "id": st.id, "title": st.title, "stage": st.stage, "canonical_markdown": st.canonical_markdown, } ) except Exception as e: logger.warning("memoir snapshot stories failed: {}", e) return { "user_id": uid, "chapters": chapters_out, "stories": stories_out, }