Files
life-echo/api/app/features/evaluation/judge_service.py
Kevin 309a051038 feat: 回忆录证据血缘与内部评测可追溯,顺带对齐本地评测台与 CI
数据库与模型:新增多版迁移(章节证据快照、对话血缘、记忆事实/时间线 lineage 等),把「成稿 ↔ 对话/记忆」的溯源信息落到表结构里。
业务链路:会话与 WS、回忆录/故事流水线、记忆写入与 enrichment 等跟着接上线索与快照;新增章节证据快照与评测侧 EvalTraceService 等模块,方便组评审用的证据包。
内部评测:自动化 run 与手工 memoir 评审共用可追溯证据;rubric/ judge 相关脚本与文档有配套调整。
app-eval-web:Memoir/实验详情里能展开看证据摘要与 evidence_trace(含对话轮次 id);Vite 代理与 development.sh 注入的 API 端口与当前默认内部评测端口一致,避免改端口后页面连错服务。
工程杂项:GitHub Actions / 仓库说明有更新;各适配器与支付/配额/plan 等多处为小改动或跟随主改动的收尾;新增/扩充了?
2026-04-08 15:37:09 +08:00

332 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Zhipu GLM-5 judge calls (structured JSON output)."""
from __future__ import annotations
from collections.abc import AsyncIterator
from dataclasses import dataclass
from typing import Any, Generic, TypeVar
from app.core.config import settings
from app.core.llm_call import LLMCallError, allm_json_call
from app.core.logging import get_logger
from app.features.evaluation.judge_schemas import (
ConversationJudgeOutput,
MemoirJudgeOutput,
TurnJudgeOutput,
)
from app.features.evaluation.rubrics.conversation_v1 import (
COMPARE_CONV_STREAM_HINT,
CONV_JUDGE_INSTRUCTIONS,
TURN_JUDGE_INSTRUCTIONS,
)
from app.features.evaluation.rubrics.memoir_v1 import MEMOIR_JUDGE_INSTRUCTIONS
# Module-level logger shared by every judge call in this file.
logger = get_logger(__name__)

# Constrained type variable: a judge result wraps exactly one of the three
# structured judge output schemas.
TJudgeOutput = TypeVar(
    "TJudgeOutput",
    TurnJudgeOutput,
    ConversationJudgeOutput,
    MemoirJudgeOutput,
)

# --- Completion caps, passed as ``max_tokens`` to the judge model ----------
_TURN_MAX = 768  # per-turn judge JSON
_CONV_JUDGE_JSON_MAX = 2048  # whole-conversation judge JSON
_MEMOIR_JSON_MAX = 1536  # memoir judge JSON
_COMPARE_STREAM_MAX = 6144  # free-text A/B comparison stream

# --- Character caps applied when slicing text into the memoir prompt -------
_MEMOIR_MAX = 12000  # memoir body under review
_MEMOIR_EVIDENCE_MAX = 12000  # each evidence / reference section

# Header that precedes the transcript in the conversation prompt; its length
# is counted as fixed overhead when sizing the transcript budget.
_CONV_HEADER = "【完整对话】(每轮以 `[Turn k]` 开头)\n\n"
def _eval_judge_prompt_char_pool() -> int:
    """Return the character budget available for one whole judge request.

    The budget is derived from the configured judge context window: the
    completion reserve and the safety margin are subtracted, and the remaining
    tokens are converted to characters with the configured tokens-per-char
    ratio. Both the token count and the final result are floored at 1.
    """
    budget_tokens = settings.eval_judge_context_window_tokens
    budget_tokens = budget_tokens - settings.eval_judge_completion_reserve_tokens
    budget_tokens = budget_tokens - settings.eval_judge_prompt_budget_safety_tokens
    # Never let a misconfigured window produce a zero or negative budget.
    budget_tokens = max(1, budget_tokens)
    budget_chars = int(budget_tokens / settings.eval_judge_approx_tokens_per_char)
    return max(1, budget_chars)
def eval_judge_conversation_transcript_max_chars() -> int:
    """Return the max transcript length (chars) for whole-conversation judging.

    A positive ``eval_judge_max_transcript_chars`` setting is used verbatim.
    Otherwise the cap is the prompt character pool minus the fixed prompt
    overhead (instructions, transcript header and a 32-char slack), never
    less than 1.
    """
    configured_cap = settings.eval_judge_max_transcript_chars
    if configured_cap > 0:
        return configured_cap
    fixed_chars = sum(map(len, (CONV_JUDGE_INSTRUCTIONS, _CONV_HEADER))) + 32
    remaining = _eval_judge_prompt_char_pool() - fixed_chars
    return max(1, remaining)
def eval_judge_turn_prior_transcript_max_chars() -> int:
    """Return the max length (chars) of the prior-transcript excerpt per turn.

    A positive ``eval_judge_max_transcript_chars`` setting is used verbatim.
    Otherwise the cap is the prompt character pool minus the turn
    instructions and a fixed 8800-char allowance, never less than 1.
    """
    configured_cap = settings.eval_judge_max_transcript_chars
    if configured_cap > 0:
        return configured_cap
    # NOTE(review): 8800 presumably covers the two 4000-char utterance slices
    # used by the per-turn prompt plus its section headers — confirm.
    fixed_chars = len(TURN_JUDGE_INSTRUCTIONS) + 8800
    remaining = _eval_judge_prompt_char_pool() - fixed_chars
    return max(1, remaining)
def eval_judge_compare_transcript_each_max_chars() -> int:
    """Return the per-transcript cap (chars) when A and B share one prompt.

    A positive ``eval_judge_max_compare_transcript_chars_each`` setting is
    used verbatim. Otherwise the prompt character pool, minus the configured
    compare-prompt overhead, is split evenly between the two transcripts
    (integer division), never less than 1.
    """
    configured_each = settings.eval_judge_max_compare_transcript_chars_each
    if configured_each > 0:
        return configured_each
    shared_chars = (
        _eval_judge_prompt_char_pool()
        - settings.eval_judge_compare_prompt_overhead_chars
    )
    half = shared_chars // 2
    return max(1, half)
@dataclass(slots=True)
class JudgeCallResult(Generic[TJudgeOutput]):
    """Outcome of one judge call: the parsed output or an error message.

    The service methods in this module populate ``output`` on success and
    leave ``error`` as ``None``; on failure ``output`` is ``None`` and
    ``error`` carries a human-readable description.
    """

    # Parsed judge output, or None when the call did not produce one.
    output: TJudgeOutput | None
    # Human-readable failure description; None when no error was recorded.
    error: str | None = None
def _judge_error_message(e: LLMCallError) -> str:
prefix = {
"invoke": "模型调用失败",
"decode": "JSON 解析失败",
"validation": "结果校验失败",
}.get(e.kind, "评审失败")
detail = str(e).strip()
return f"{prefix}: {detail}" if detail else prefix
def _build_memoir_judge_prompt(
    *,
    memoir_markdown: str,
    source_transcript: str = "",
    structured_evidence: str = "",
    reference_memoir_markdown: str = "",
    evidence_notes: str = "",
) -> str:
    """Assemble an evidence-aware memoir judging prompt.

    Sections are emitted in a fixed order: judge instructions, an ordering
    preamble, optional reviewer notes, source transcript evidence, structured
    memory evidence, an optional reference memoir, and finally the memoir
    under review. The two evidence sections are always present; when their
    input is empty a fixed placeholder sentence is emitted instead.

    Args:
        memoir_markdown: Memoir text under review; truncated to
            ``_MEMOIR_MAX`` characters. ``None`` is treated as empty, like
            every other text argument.
        source_transcript: Raw interview/dialogue evidence; stripped and
            truncated to ``_MEMOIR_EVIDENCE_MAX`` characters.
        structured_evidence: Structured memory evidence; stripped and
            truncated to ``_MEMOIR_EVIDENCE_MAX`` characters.
        reference_memoir_markdown: Optional baseline memoir; stripped and
            truncated to ``_MEMOIR_EVIDENCE_MAX`` characters.
        evidence_notes: Optional reviewer notes; stripped and truncated to
            1200 characters.

    Returns:
        The prompt, with sections joined by newlines.
    """
    source = (source_transcript or "").strip()
    struct = (structured_evidence or "").strip()
    reference = (reference_memoir_markdown or "").strip()
    notes = (evidence_notes or "").strip()
    # Guard against None like the other inputs; deliberately not stripped so
    # the memoir text reaches the judge exactly as before for str inputs.
    memoir = memoir_markdown or ""

    sections = [
        MEMOIR_JUDGE_INSTRUCTIONS,
        "",
        "【证据与输入顺序】以下区块按优先级给出:"
        "评审说明(若有)→ 原始访谈/对话证据segment 绑定)→ 结构化记忆证据chunk/fact/timeline/summary"
        "→ 参考基线(若有)→ 待评成稿。**真实性、覆盖率、可追溯性以「artifact 绑定证据闭包」为准**"
        "若存在 `lineage_tier=fallback` 或证据不足,须保守打分并写 `insufficient_evidence`。",
        "",
    ]

    if notes:
        sections.extend(["【评审说明】", notes[:1200], ""])

    if source:
        sections.extend(["【原始访谈/对话证据】", source[:_MEMOIR_EVIDENCE_MAX], ""])
    else:
        # No dialogue evidence: tell the judge explicitly to score conservatively.
        sections.extend(
            [
                "【原始访谈/对话证据】",
                "无可用局部对话证据。对于记忆忠实度、事实准确性、事实覆盖率、记忆可追溯性,必须保守打分,不得凭空高分。",
                "",
            ]
        )

    if struct:
        sections.extend(["【结构化记忆证据】", struct[:_MEMOIR_EVIDENCE_MAX], ""])
    else:
        sections.extend(
            [
                "【结构化记忆证据】",
                "(本 artifact 未绑定或未解析到 chunk/fact/timeline/summary 证据。)",
                "",
            ]
        )

    if reference:
        sections.extend(
            ["【参考基线/导出成稿】", reference[:_MEMOIR_EVIDENCE_MAX], ""]
        )

    sections.extend(["【当前回忆录正文】", memoir[:_MEMOIR_MAX]])
    return "\n".join(sections)
class EvalJudgeService:
    """Judge service for single turns, whole conversations and memoirs.

    Structured judgements go through ``allm_json_call`` and are validated
    against the matching judge schema; the A/B comparison is streamed as free
    text from the model's ``astream``. When no judge model is configured,
    every method degrades gracefully (``None``, an error result, or an error
    line in the stream) instead of raising.
    """

    # Per-turn cap (chars) applied to the user utterance and the AI reply.
    _TURN_TEXT_MAX_CHARS = 4000

    def __init__(self, judge_llm: Any | None) -> None:
        """Store the judge model; a falsy value disables judging."""
        self._llm = judge_llm

    async def judge_turn(
        self,
        *,
        prior_transcript: str,
        user_utterance: str,
        assistant_reply: str,
        turn_index_0: int = 0,
    ) -> TurnJudgeOutput | None:
        """Judge one turn given the conversation so far.

        Args:
            prior_transcript: Transcript up to the previous turn; truncated
                to ``eval_judge_turn_prior_transcript_max_chars()``.
            user_utterance: User text of this turn (first 4000 chars used).
            assistant_reply: AI reply of this turn (first 4000 chars used).
            turn_index_0: Zero-based turn index; negative values are clamped
                to 0 and the prompt shows it one-based.

        Returns:
            The parsed judge output, or ``None`` when no model is configured
            or the call fails with ``LLMCallError`` (logged as a warning).
        """
        if not self._llm:
            return None
        t = max(0, int(turn_index_0))
        # Treat None like empty text, matching stream_conversation_compare;
        # str inputs produce exactly the same prompt as before.
        prior = (prior_transcript or "")[
            : eval_judge_turn_prior_transcript_max_chars()
        ]
        user_text = (user_utterance or "")[: self._TURN_TEXT_MAX_CHARS]
        reply_text = (assistant_reply or "")[: self._TURN_TEXT_MAX_CHARS]
        prompt = f"""{TURN_JUDGE_INSTRUCTIONS}
【本轮位置】完整对话中当前轮次为 Turn {t + 1}(与下方节选及全量 transcript 的 `[Turn ...]` 编号一致。evidence_refs.turn_index 请使用该编号。
【截至上一轮的对话节选】(含 `[Turn k]` 标签)
{prior}
【本轮用户】
{user_text}
【本轮 AI】
{reply_text}
"""
        try:
            return await allm_json_call(
                self._llm,
                prompt,
                TurnJudgeOutput,
                max_tokens=_TURN_MAX,
                agent="EvalJudgeService.judge_turn",
            )
        except LLMCallError as e:
            logger.warning("turn judge failed: {}", e)
            return None

    async def judge_conversation_result(
        self, *, full_transcript: str
    ) -> JudgeCallResult[ConversationJudgeOutput]:
        """Judge a whole conversation and report failures explicitly.

        Args:
            full_transcript: Full transcript; truncated to
                ``eval_judge_conversation_transcript_max_chars()``.

        Returns:
            A ``JudgeCallResult`` with ``output`` set on success, or with
            ``error`` set when no model is configured or the call fails with
            ``LLMCallError``.
        """
        if not self._llm:
            return JudgeCallResult(output=None, error="评审模型未配置")
        # None-safe slice, consistent with the other entry points.
        transcript = (full_transcript or "")[
            : eval_judge_conversation_transcript_max_chars()
        ]
        prompt = f"""{CONV_JUDGE_INSTRUCTIONS}
【完整对话】(每轮以 `[Turn k]` 开头)
{transcript}
"""
        try:
            out = await allm_json_call(
                self._llm,
                prompt,
                ConversationJudgeOutput,
                max_tokens=_CONV_JUDGE_JSON_MAX,
                agent="EvalJudgeService.judge_conversation",
            )
            return JudgeCallResult(output=out)
        except LLMCallError as e:
            error = _judge_error_message(e)
            logger.warning("conversation judge failed: {}", error)
            return JudgeCallResult(output=None, error=error)

    async def judge_conversation(
        self, *, full_transcript: str
    ) -> ConversationJudgeOutput | None:
        """Judge a whole conversation, returning only the output (or None)."""
        result = await self.judge_conversation_result(full_transcript=full_transcript)
        return result.output

    async def stream_conversation_compare(
        self,
        *,
        baseline_transcript: str,
        replay_transcript: str,
        baseline_judge: ConversationJudgeOutput | None,
        replay_judge: ConversationJudgeOutput | None,
    ) -> AsyncIterator[str]:
        """Stream a free-text (non-JSON) comparison and advice in Chinese.

        With both judge outputs present, the prompt contrasts baseline (A)
        and replay (B), each transcript capped by
        ``eval_judge_compare_transcript_each_max_chars()``. With only the
        replay judge output present, a single-transcript prompt is used,
        capped by ``eval_judge_conversation_transcript_max_chars()``.

        Yields:
            Content pieces from the model stream. A single error line is
            yielded instead when no model is configured or the replay judge
            output is missing; if the stream fails midway, a trailing
            interruption notice is yielded.
        """
        if not self._llm:
            yield "[错误] 未配置评审模型 API Keyeval_judge_api_key / zhipu_api_key"
            return
        # Both prompt variants need the replay scores, so bail out before
        # computing any caps, transcripts or JSON dumps.
        if not replay_judge:
            yield "[错误] 缺少回放对话评分,无法生成建议"
            return

        r_json = replay_judge.model_dump_json(ensure_ascii=False)
        replay_text = (replay_transcript or "").strip()

        if baseline_judge:
            cap_each = eval_judge_compare_transcript_each_max_chars()
            b_tr = (baseline_transcript or "").strip()[:cap_each]
            r_tr = replay_text[:cap_each]
            b_json = baseline_judge.model_dump_json(ensure_ascii=False)
            prompt = f"""你是访谈对话评测专家。下面给出两份完整对话 transcript 及各自的整体打分JSON。请用中文直接写正文不要用 JSON、不要用 Markdown 代码块):
【A导出基准对话】历史快照用户与当时导出的线上 AI多轮合并为一篇
{b_tr}
【B本次回放/新测对话】用户句与基准对齐AI 为当前后端重新生成)
{r_tr}
【A 的整体评分 JSON】
{b_json}
【B 的整体评分 JSON】
{r_json}
请依次撰写:
1) 两段对话在整体体验上的主要差异(情绪承接、信息挖掘、人物建模、访谈结构、提问质量、上下文与重复盘问等);
2) B 相对 A 的优点与不足;
3) 若 B 在关键维度明显弱于 A给出可操作的改进方向系统提示、访谈策略、模型或温度等
笔调简洁、偏执行清单。"""
        else:
            # Only the replay was judged: one transcript gets the larger cap.
            r_one = replay_text[: eval_judge_conversation_transcript_max_chars()]
            prompt = f"""{COMPARE_CONV_STREAM_HINT}
【回放/新测 transcript】
{r_one}
【整体评分 JSON】
{r_json}
"""

        llm = self._llm
        if hasattr(llm, "bind"):
            # Raise the completion cap for the long-form comparison when the
            # model object supports binding call options.
            llm = llm.bind(max_tokens=_COMPARE_STREAM_MAX)
        try:
            async for chunk in llm.astream(prompt):
                piece = getattr(chunk, "content", None)
                if piece:
                    yield piece
        except Exception as e:
            # Deliberate best-effort boundary: surface the failure in-band so
            # the consumer of the stream sees why the output stopped.
            logger.warning("conversation compare stream failed: {}", e)
            yield f"\n\n[流式输出中断:{e}]"

    async def judge_memoir(
        self,
        *,
        memoir_markdown: str,
        source_transcript: str = "",
        structured_evidence: str = "",
        reference_memoir_markdown: str = "",
        evidence_notes: str = "",
    ) -> MemoirJudgeOutput | None:
        """Judge a memoir, returning only the output (or None on failure).

        Thin wrapper over ``judge_memoir_result``; all arguments are passed
        through unchanged.
        """
        result = await self.judge_memoir_result(
            memoir_markdown=memoir_markdown,
            source_transcript=source_transcript,
            structured_evidence=structured_evidence,
            reference_memoir_markdown=reference_memoir_markdown,
            evidence_notes=evidence_notes,
        )
        return result.output

    async def judge_memoir_result(
        self,
        *,
        memoir_markdown: str,
        source_transcript: str = "",
        structured_evidence: str = "",
        reference_memoir_markdown: str = "",
        evidence_notes: str = "",
    ) -> JudgeCallResult[MemoirJudgeOutput]:
        """Judge a memoir against its evidence and report failures explicitly.

        The prompt is assembled by ``_build_memoir_judge_prompt`` from the
        given memoir, evidence, reference and notes.

        Returns:
            A ``JudgeCallResult`` with ``output`` set on success, or with
            ``error`` set when no model is configured or the call fails with
            ``LLMCallError``.
        """
        if not self._llm:
            return JudgeCallResult(output=None, error="评审模型未配置")
        prompt = _build_memoir_judge_prompt(
            memoir_markdown=memoir_markdown,
            source_transcript=source_transcript,
            structured_evidence=structured_evidence,
            reference_memoir_markdown=reference_memoir_markdown,
            evidence_notes=evidence_notes,
        )
        try:
            out = await allm_json_call(
                self._llm,
                prompt,
                MemoirJudgeOutput,
                max_tokens=_MEMOIR_JSON_MAX,
                agent="EvalJudgeService.judge_memoir",
            )
            return JudgeCallResult(output=out)
        except LLMCallError as e:
            error = _judge_error_message(e)
            logger.warning("memoir judge failed: {}", error)
            return JudgeCallResult(output=None, error=error)