Files
life-echo/api/app/features/evaluation/judge_service.py
Kevin 99543d04c6 feat(eval): internal-eval stack, judge fixes, and eval web overhaul
- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.
2026-04-07 17:18:47 +08:00

272 lines
9.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""GLM 评审调用(结构化 JSON"""
from __future__ import annotations
from collections.abc import AsyncIterator
from dataclasses import dataclass
from typing import Any, Generic, TypeVar
from app.core.llm_call import LLMCallError, allm_json_call
from app.core.logging import get_logger
from app.features.evaluation.judge_schemas import (
ConversationJudgeOutput,
MemoirJudgeOutput,
TurnJudgeOutput,
)
from app.features.evaluation.rubrics.conversation_v1 import (
COMPARE_CONV_STREAM_HINT,
CONV_JUDGE_INSTRUCTIONS,
TURN_JUDGE_INSTRUCTIONS,
)
from app.features.evaluation.rubrics.memoir_v1 import MEMOIR_JUDGE_INSTRUCTIONS
# Module-level logger shared by every judge helper in this file.
logger = get_logger(__name__)
# Type variable constrained to the three judge output schemas; it
# parameterises JudgeCallResult so each caller gets its own schema type back.
TJudgeOutput = TypeVar(
"TJudgeOutput", TurnJudgeOutput, ConversationJudgeOutput, MemoirJudgeOutput
)
# Passed as ``max_tokens`` to the per-turn judge call.
_TURN_MAX = 768
# Slice length (in characters) applied to transcripts embedded in prompts.
_CONV_MAX = 8192
# Passed as ``max_tokens`` to the whole-conversation judge call.
_CONV_JUDGE_JSON_MAX = 2048
# Slice length (in characters) applied to the memoir text being judged.
_MEMOIR_MAX = 12000
# Passed as ``max_tokens`` to the memoir judge call.
_MEMOIR_JSON_MAX = 1536
# Passed as ``max_tokens`` when binding the model for the streamed comparison.
_COMPARE_STREAM_MAX = 6144
# Slice length (in characters) applied to the source transcript and to the
# reference memoir that accompany the memoir under review.
_MEMOIR_EVIDENCE_MAX = 12000
@dataclass(slots=True)
class JudgeCallResult(Generic[TJudgeOutput]):
    """Outcome of one structured judge call.

    In this module a successful call is built with only ``output`` set, and a
    failed (or unconfigured) call is built with ``output=None`` plus an
    ``error`` message.
    """

    # Parsed judge output; ``None`` when the call produced no usable result.
    output: TJudgeOutput | None
    # Human-readable failure description; stays ``None`` on success.
    error: str | None = None
def _judge_error_message(e: LLMCallError) -> str:
    """Render a judge failure as a short, labelled message.

    The label is picked from ``e.kind`` (``"invoke"``, ``"decode"`` or
    ``"validation"``); any other kind falls back to a generic label. The
    stripped text of the exception is appended after ``": "`` when it is
    non-empty, otherwise the label is returned on its own.
    """
    labels_by_kind = {
        "invoke": "模型调用失败",
        "decode": "JSON 解析失败",
        "validation": "结果校验失败",
    }
    label = labels_by_kind.get(e.kind, "评审失败")
    reason = str(e).strip()
    if not reason:
        return label
    return f"{label}: {reason}"
def _build_memoir_judge_prompt(
    *,
    memoir_markdown: str,
    source_transcript: str = "",
    reference_memoir_markdown: str = "",
    evidence_notes: str = "",
) -> str:
    """Assemble an evidence-aware memoir judging prompt.

    The prompt starts with the memoir rubric and a fixed note on evidence
    ordering, followed by these sections in order: reviewer notes (only when
    given), source interview evidence (or a fixed "no evidence, score
    conservatively" notice when absent), the reference baseline (only when
    given), and finally the memoir under review.

    Args:
        memoir_markdown: Memoir text to be judged; clipped to ``_MEMOIR_MAX``
            characters. ``None`` is treated as empty text.
        source_transcript: Interview evidence; stripped and clipped to
            ``_MEMOIR_EVIDENCE_MAX`` characters.
        reference_memoir_markdown: Optional baseline memoir; stripped and
            clipped to ``_MEMOIR_EVIDENCE_MAX`` characters.
        evidence_notes: Optional reviewer notes; stripped and clipped to
            1200 characters.

    Returns:
        The sections joined with newlines.
    """
    source = (source_transcript or "").strip()
    reference = (reference_memoir_markdown or "").strip()
    notes = (evidence_notes or "").strip()
    # Tolerate ``None`` the same way as the optional inputs above. The memoir
    # is not stripped, so its leading/trailing whitespace is preserved.
    memoir = memoir_markdown or ""

    sections = [
        MEMOIR_JUDGE_INSTRUCTIONS,
        "",
        "【证据与输入顺序】以下区块按优先级给出:评审说明(若有)→ 原始访谈证据 → 参考基线(若有)→ 待评成稿。**真实性相关细项必须以原始访谈证据为准。**",
        "",
    ]
    if notes:
        sections.extend(["【评审说明】", notes[:1200], ""])
    if source:
        sections.extend(["【原始访谈/证据】", source[:_MEMOIR_EVIDENCE_MAX], ""])
    else:
        # Without evidence the judge is told explicitly to score the
        # fidelity-related criteria conservatively.
        sections.extend(
            [
                "【原始访谈/证据】",
                "无可用原始访谈证据。对于记忆忠实度、事实准确性、事实覆盖率、记忆可追溯性,必须保守打分,不得凭空高分。",
                "",
            ]
        )
    if reference:
        sections.extend(["【参考基线/导出成稿】", reference[:_MEMOIR_EVIDENCE_MAX], ""])
    sections.extend(["【当前回忆录正文】", memoir[:_MEMOIR_MAX]])
    return "\n".join(sections)
class EvalJudgeService:
    """Run LLM-based evaluation ("judge") calls for turns, conversations and memoirs.

    Structured judges go through ``allm_json_call`` and are parsed into the
    matching judge schema; the comparison helper streams free-form text from
    the model. When no judge model is configured the methods return ``None``,
    an error result, or an error chunk instead of raising.
    """

    def __init__(self, judge_llm: Any | None) -> None:
        """Store the judge model; a falsy value means no judge is configured."""
        self._llm = judge_llm

    async def _run_json_judge(
        self,
        *,
        prompt: str,
        schema: type[TJudgeOutput],
        max_tokens: int,
        agent: str,
        failure_log: str,
    ) -> JudgeCallResult[TJudgeOutput]:
        """Run one structured judge call, folding ``LLMCallError`` into the result.

        Args:
            prompt: Full prompt text sent to the judge model.
            schema: Judge output schema handed to ``allm_json_call``.
            max_tokens: Output budget handed to ``allm_json_call``.
            agent: Agent label handed to ``allm_json_call``.
            failure_log: Log template with one ``{}`` placeholder that
                receives the formatted error message.

        Returns:
            A result holding the parsed output, or ``output=None`` together
            with the formatted error when the call raised ``LLMCallError``.
        """
        try:
            parsed = await allm_json_call(
                self._llm,
                prompt,
                schema,
                max_tokens=max_tokens,
                agent=agent,
            )
        except LLMCallError as e:
            error = _judge_error_message(e)
            logger.warning(failure_log, error)
            return JudgeCallResult(output=None, error=error)
        return JudgeCallResult(output=parsed)

    async def judge_turn(
        self,
        *,
        prior_transcript: str,
        user_utterance: str,
        assistant_reply: str,
        turn_index_0: int = 0,
    ) -> TurnJudgeOutput | None:
        """Judge a single assistant reply in the context of the prior dialogue.

        Args:
            prior_transcript: Dialogue up to the previous turn; clipped to
                ``_CONV_MAX`` characters. ``None`` is treated as empty.
            user_utterance: The user's message for this turn (first 4000
                characters are used). ``None`` is treated as empty.
            assistant_reply: The assistant's reply for this turn (first 4000
                characters are used). ``None`` is treated as empty.
            turn_index_0: Zero-based turn index; negative values are clamped
                to 0 and the prompt shows the one-based number.

        Returns:
            The parsed turn judgement, or ``None`` when no judge model is
            configured or the call fails with ``LLMCallError``.
        """
        if not self._llm:
            return None
        turn_number = max(0, int(turn_index_0)) + 1
        # NOTE(review): slicing keeps the *start* of the prior transcript, so
        # for long dialogues the turns closest to the judged one are dropped.
        # Confirm this is intended before changing it.
        prior = (prior_transcript or "")[:_CONV_MAX]
        user_text = (user_utterance or "")[:4000]
        reply_text = (assistant_reply or "")[:4000]
        prompt = f"""{TURN_JUDGE_INSTRUCTIONS}
【本轮位置】完整对话中当前轮次为 Turn {turn_number}(与下方节选及全量 transcript 的 `[Turn ...]` 编号一致。evidence_refs.turn_index 请使用该编号。
【截至上一轮的对话节选】(含 `[Turn k]` 标签)
{prior}
【本轮用户】
{user_text}
【本轮 AI】
{reply_text}
"""
        result = await self._run_json_judge(
            prompt=prompt,
            schema=TurnJudgeOutput,
            max_tokens=_TURN_MAX,
            agent="EvalJudgeService.judge_turn",
            failure_log="turn judge failed: {}",
        )
        return result.output

    async def judge_conversation_result(
        self, *, full_transcript: str
    ) -> JudgeCallResult[ConversationJudgeOutput]:
        """Judge a whole conversation and report the failure reason, if any.

        Args:
            full_transcript: Complete dialogue; clipped to ``_CONV_MAX``
                characters. ``None`` is treated as empty.

        Returns:
            A result with the parsed judgement, or with ``error`` set when no
            judge model is configured or the call fails.
        """
        if not self._llm:
            return JudgeCallResult(output=None, error="评审模型未配置")
        transcript = (full_transcript or "")[:_CONV_MAX]
        prompt = f"""{CONV_JUDGE_INSTRUCTIONS}
【完整对话】(每轮以 `[Turn k]` 开头)
{transcript}
"""
        return await self._run_json_judge(
            prompt=prompt,
            schema=ConversationJudgeOutput,
            max_tokens=_CONV_JUDGE_JSON_MAX,
            agent="EvalJudgeService.judge_conversation",
            failure_log="conversation judge failed: {}",
        )

    async def judge_conversation(
        self, *, full_transcript: str
    ) -> ConversationJudgeOutput | None:
        """Judge a whole conversation, returning only the output (``None`` on failure)."""
        result = await self.judge_conversation_result(full_transcript=full_transcript)
        return result.output

    async def stream_conversation_compare(
        self,
        *,
        baseline_transcript: str,
        replay_transcript: str,
        baseline_judge: ConversationJudgeOutput | None,
        replay_judge: ConversationJudgeOutput | None,
    ) -> AsyncIterator[str]:
        """Stream a free-form comparison and suggestions (plain text, not JSON).

        With both judgements available the model is asked to compare the
        baseline (A) with the replay (B). With only the replay judgement, the
        prompt is built from ``COMPARE_CONV_STREAM_HINT`` and the replay alone.
        A missing judge model or missing replay judgement yields a single
        error chunk and stops.

        Yields:
            Non-empty ``content`` values of the streamed chunks; if streaming
            raises, one final chunk describing the interruption.
        """
        if not self._llm:
            yield "[错误] 未配置评审模型 API Keyeval_judge_api_key / zhipu_api_key"
            return
        if not replay_judge:
            # Both prompt variants need the replay judgement.
            yield "[错误] 缺少回放对话评分,无法生成建议"
            return

        replay_text = (replay_transcript or "").strip()[:_CONV_MAX]
        # NOTE(review): `ensure_ascii` is presumably only accepted by newer
        # Pydantic releases; confirm against the pinned version.
        replay_scores = replay_judge.model_dump_json(ensure_ascii=False)

        if baseline_judge:
            baseline_text = (baseline_transcript or "").strip()[:_CONV_MAX]
            baseline_scores = baseline_judge.model_dump_json(ensure_ascii=False)
            prompt = f"""你是访谈对话评测专家。下面给出两份完整对话 transcript 及各自的整体打分JSON。请用中文直接写正文不要用 JSON、不要用 Markdown 代码块):
【A导出基准对话】历史快照用户与当时导出的线上 AI多轮合并为一篇
{baseline_text}
【B本次回放/新测对话】用户句与基准对齐AI 为当前后端重新生成)
{replay_text}
【A 的整体评分 JSON】
{baseline_scores}
【B 的整体评分 JSON】
{replay_scores}
请依次撰写:
1) 两段对话在整体体验上的主要差异(情绪承接、信息挖掘、人物建模、访谈结构、提问质量、上下文与重复盘问等);
2) B 相对 A 的优点与不足;
3) 若 B 在关键维度明显弱于 A给出可操作的改进方向系统提示、访谈策略、模型或温度等
笔调简洁、偏执行清单。"""
        else:
            prompt = f"""{COMPARE_CONV_STREAM_HINT}
【回放/新测 transcript】
{replay_text}
【整体评分 JSON】
{replay_scores}
"""

        llm = self._llm
        if hasattr(llm, "bind"):
            # Cap the streamed answer when the model object supports binding.
            llm = llm.bind(max_tokens=_COMPARE_STREAM_MAX)
        try:
            async for chunk in llm.astream(prompt):
                piece = getattr(chunk, "content", None)
                if piece:
                    yield piece
        except Exception as e:
            # Streaming boundary: report the interruption as a trailing chunk
            # so the consumer sees it, rather than raising mid-stream.
            logger.warning("conversation compare stream failed: {}", e)
            yield f"\n\n[流式输出中断:{e}]"

    async def judge_memoir(
        self,
        *,
        memoir_markdown: str,
        source_transcript: str = "",
        reference_memoir_markdown: str = "",
        evidence_notes: str = "",
    ) -> MemoirJudgeOutput | None:
        """Judge a memoir, returning only the output (``None`` on failure)."""
        result = await self.judge_memoir_result(
            memoir_markdown=memoir_markdown,
            source_transcript=source_transcript,
            reference_memoir_markdown=reference_memoir_markdown,
            evidence_notes=evidence_notes,
        )
        return result.output

    async def judge_memoir_result(
        self,
        *,
        memoir_markdown: str,
        source_transcript: str = "",
        reference_memoir_markdown: str = "",
        evidence_notes: str = "",
    ) -> JudgeCallResult[MemoirJudgeOutput]:
        """Judge a memoir against its evidence and report the failure reason, if any.

        The prompt is assembled by ``_build_memoir_judge_prompt`` from the
        memoir, the source transcript, the optional reference memoir and the
        optional reviewer notes.

        Returns:
            A result with the parsed judgement, or with ``error`` set when no
            judge model is configured or the call fails.
        """
        if not self._llm:
            return JudgeCallResult(output=None, error="评审模型未配置")
        prompt = _build_memoir_judge_prompt(
            memoir_markdown=memoir_markdown,
            source_transcript=source_transcript,
            reference_memoir_markdown=reference_memoir_markdown,
            evidence_notes=evidence_notes,
        )
        return await self._run_json_judge(
            prompt=prompt,
            schema=MemoirJudgeOutput,
            max_tokens=_MEMOIR_JSON_MAX,
            agent="EvalJudgeService.judge_memoir",
            failure_log="memoir judge failed: {}",
        )