Files
life-echo/api/app/features/evaluation/judge_service.py
Kevin 064ad2161d refactor(eval+memoir):精简内部评测路由与服务,composite/对话摘要与 judge 能力补强
- 访谈:新增 interview_state_hints,联动 orchestrator 与提示词
- 回忆录:story_pipeline_sync/state/memory/post_commit 与 Celery 任务调整
- 基建:开发用 celery broker、compose/development 脚本、依赖注入
- eval-web:移除数据集/实验/版本等页面与流式轮询,突出 Playground
- 文档与单测同步
2026-04-08 21:36:12 +08:00

402 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""评测台评审:智谱 / DeepSeek 等 OpenAI 兼容端点(结构化 JSON"""
from __future__ import annotations
from collections.abc import AsyncIterator
from dataclasses import dataclass
from typing import Any, Generic, TypeVar
from app.core.config import settings
from app.core.llm_call import LLMCallError, allm_json_call
from app.core.logging import get_logger
from app.features.evaluation.judge_schemas import (
ConversationJudgeOutput,
MemoirJudgeOutput,
TurnJudgeOutput,
)
from app.features.evaluation.rubrics.conversation_v1 import (
COMPARE_CONV_STREAM_HINT,
CONV_JUDGE_INSTRUCTIONS,
TURN_JUDGE_INSTRUCTIONS,
)
from app.features.evaluation.rubrics.memoir_v1 import MEMOIR_JUDGE_INSTRUCTIONS
# Module-level logger shared by every judge entry point in this file.
logger = get_logger(__name__)

# Constrained type variable: a judge result carries exactly one of these schemas.
TJudgeOutput = TypeVar(
    "TJudgeOutput",
    TurnJudgeOutput,
    ConversationJudgeOutput,
    MemoirJudgeOutput,
)

# Completion limits handed to the judge model as ``max_tokens``.
_TURN_MAX = 768
_CONV_JUDGE_JSON_MAX = 2048
_MEMOIR_JSON_MAX = 1536
_COMPARE_STREAM_MAX = 6144

# Transcript header whose length is counted as fixed overhead when the
# whole-conversation transcript cap is derived.
_CONV_HEADER = "【完整对话】(每轮以 `[Turn k]` 开头)\n\n"

# Character caps used when slicing memoir inputs into the memoir prompt:
# the memoir body itself, and each evidence / reference section.
_MEMOIR_MAX = 12000
_MEMOIR_EVIDENCE_MAX = 12000
def _eval_judge_prompt_char_pool_for_context(context_window_tokens: int) -> int:
    """Return the character budget for one whole judge request.

    The budget is derived from the judge model's context window: the
    completion reserve and the safety margin (both read from settings) are
    subtracted, and what remains is converted to characters using the
    configured tokens-per-character ratio. Both the token count and the
    resulting character count are floored at 1.
    """
    remaining_tokens = int(context_window_tokens)
    remaining_tokens -= settings.eval_judge_completion_reserve_tokens
    remaining_tokens -= settings.eval_judge_prompt_budget_safety_tokens
    if remaining_tokens < 1:
        # Keep the budget positive even when the reserves exceed the window.
        remaining_tokens = 1
    char_budget = int(remaining_tokens / settings.eval_judge_approx_tokens_per_char)
    return char_budget if char_budget > 1 else 1
def _eval_judge_prompt_char_pool() -> int:
    """Return the request character budget for the context window configured in settings."""
    default_window = settings.eval_judge_context_window_tokens
    return _eval_judge_prompt_char_pool_for_context(default_window)
def eval_judge_conversation_transcript_max_chars() -> int:
    """Return the max characters of the full-conversation transcript.

    Uses the default context window from settings. The computation lives in
    ``eval_judge_conversation_transcript_max_chars_for_context``; delegating
    to it keeps the explicit-override check and the overhead arithmetic in a
    single place instead of two copies that could drift apart.
    """
    return eval_judge_conversation_transcript_max_chars_for_context(
        settings.eval_judge_context_window_tokens
    )
def eval_judge_conversation_transcript_max_chars_for_context(
    context_window_tokens: int,
) -> int:
    """Return the max characters of the full-conversation transcript for a given context window.

    A positive ``eval_judge_max_transcript_chars`` setting is returned as-is.
    Otherwise the cap is the request character budget minus the fixed prompt
    overhead (rubric instructions, transcript header and 32 extra characters),
    floored at 1.
    """
    explicit_cap = settings.eval_judge_max_transcript_chars
    if explicit_cap > 0:
        return explicit_cap
    fixed_cost = len(CONV_JUDGE_INSTRUCTIONS) + len(_CONV_HEADER) + 32
    budget = _eval_judge_prompt_char_pool_for_context(context_window_tokens)
    return max(1, budget - fixed_cost)
def eval_judge_turn_prior_transcript_max_chars() -> int:
    """Return the cap on the prior-transcript excerpt used for per-turn judging.

    Uses the default context window from settings. The computation lives in
    ``eval_judge_turn_prior_transcript_max_chars_for_context``; delegating to
    it avoids repeating the override check and the static-overhead constant in
    two places.
    """
    return eval_judge_turn_prior_transcript_max_chars_for_context(
        settings.eval_judge_context_window_tokens
    )
def eval_judge_turn_prior_transcript_max_chars_for_context(
    context_window_tokens: int,
) -> int:
    """Return the cap on the prior-transcript excerpt for a given context window.

    A positive ``eval_judge_max_transcript_chars`` setting is returned as-is.
    Otherwise the cap is the request character budget minus the turn rubric
    length and a fixed 8800-character allowance, floored at 1.
    """
    explicit_cap = settings.eval_judge_max_transcript_chars
    if explicit_cap > 0:
        return explicit_cap
    # NOTE(review): 8800 presumably covers the two 4000-char utterance slices
    # plus prompt labels used by the per-turn prompt — confirm.
    fixed_cost = len(TURN_JUDGE_INSTRUCTIONS) + 8800
    budget = _eval_judge_prompt_char_pool_for_context(context_window_tokens)
    return max(1, budget - fixed_cost)
def eval_judge_compare_transcript_each_max_chars() -> int:
    """Return the per-transcript cap when two transcripts share one prompt.

    Uses the default context window from settings. The computation lives in
    ``eval_judge_compare_transcript_each_max_chars_for_context``; delegating
    to it keeps the override check and the halving rule in a single place.
    """
    return eval_judge_compare_transcript_each_max_chars_for_context(
        settings.eval_judge_context_window_tokens
    )
def eval_judge_compare_transcript_each_max_chars_for_context(
    context_window_tokens: int,
) -> int:
    """Return the per-transcript cap for an A/B comparison prompt and a given context window.

    A positive ``eval_judge_max_compare_transcript_chars_each`` setting is
    returned as-is. Otherwise the request character budget, less the
    configured comparison-prompt overhead, is split evenly between the two
    transcripts and floored at 1.
    """
    explicit_cap = settings.eval_judge_max_compare_transcript_chars_each
    if explicit_cap > 0:
        return explicit_cap
    budget = _eval_judge_prompt_char_pool_for_context(context_window_tokens)
    shared = budget - settings.eval_judge_compare_prompt_overhead_chars
    return max(1, shared // 2)
@dataclass(slots=True)
class JudgeCallResult(Generic[TJudgeOutput]):
    """Outcome of one judge call: a parsed output, or an error message."""

    # Parsed judge output; the service passes None here when the model is not
    # configured or the call fails.
    output: TJudgeOutput | None
    # Human-readable failure reason; stays None when the call succeeded.
    error: str | None = None
def _judge_error_message(e: LLMCallError) -> str:
    """Turn a failed judge call into a short message.

    The prefix is selected by ``e.kind`` (with a generic fallback for unknown
    kinds); the stripped exception text is appended after a colon when it is
    non-empty.
    """
    labels = dict(
        invoke="模型调用失败",
        decode="JSON 解析失败",
        validation="结果校验失败",
    )
    message = labels.get(e.kind, "评审失败")
    reason = str(e).strip()
    if reason:
        message = f"{message}: {reason}"
    return message
def _build_memoir_judge_prompt(
    *,
    memoir_markdown: str,
    source_transcript: str = "",
    structured_evidence: str = "",
    reference_memoir_markdown: str = "",
    evidence_notes: str = "",
) -> str:
    """Assemble an evidence-aware memoir judging prompt.

    Sections are emitted in a fixed order: rubric instructions, a preamble
    describing the evidence ordering, optional reviewer notes, the source
    transcript evidence, the structured memory evidence, an optional reference
    memoir, and finally the memoir under review. When the transcript or the
    structured evidence is empty, a placeholder sentence is emitted in its
    place so the section is always present.

    Args:
        memoir_markdown: Memoir text to be judged; truncated to ``_MEMOIR_MAX``
            characters. ``None`` is treated as an empty string.
        source_transcript: Interview / dialogue evidence; stripped and
            truncated to ``_MEMOIR_EVIDENCE_MAX`` characters.
        structured_evidence: Structured memory evidence; stripped and
            truncated to ``_MEMOIR_EVIDENCE_MAX`` characters.
        reference_memoir_markdown: Optional baseline memoir; stripped and
            truncated to ``_MEMOIR_EVIDENCE_MAX`` characters.
        evidence_notes: Optional reviewer notes; stripped and truncated to
            1200 characters.

    Returns:
        The prompt, with sections joined by newlines.
    """
    source = (source_transcript or "").strip()
    struct = (structured_evidence or "").strip()
    reference = (reference_memoir_markdown or "").strip()
    notes = (evidence_notes or "").strip()
    # Tolerate None like the other inputs do; the text is deliberately not
    # stripped so output for ordinary strings is unchanged.
    memoir = memoir_markdown or ""

    sections = [
        MEMOIR_JUDGE_INSTRUCTIONS,
        "",
        "【证据与输入顺序】以下区块按优先级给出:"
        "评审说明(若有)→ 原始访谈/对话证据segment 绑定)→ 结构化记忆证据chunk/fact/timeline/summary"
        "→ 参考基线(若有)→ 待评成稿。**真实性、覆盖率、可追溯性以「artifact 绑定证据闭包」为准**"
        "若存在 `lineage_tier=fallback` 或证据不足,须保守打分并写 `insufficient_evidence`。",
        "",
    ]

    if notes:
        sections.extend(["【评审说明】", notes[:1200], ""])

    if source:
        sections.extend(["【原始访谈/对话证据】", source[:_MEMOIR_EVIDENCE_MAX], ""])
    else:
        # No transcript evidence: tell the judge explicitly to score conservatively.
        sections.extend(
            [
                "【原始访谈/对话证据】",
                "无可用局部对话证据。对于记忆忠实度、事实准确性、事实覆盖率、记忆可追溯性,必须保守打分,不得凭空高分。",
                "",
            ]
        )

    if struct:
        sections.extend(["【结构化记忆证据】", struct[:_MEMOIR_EVIDENCE_MAX], ""])
    else:
        sections.extend(
            [
                "【结构化记忆证据】",
                "(本 artifact 未绑定或未解析到 chunk/fact/timeline/summary 证据。)",
                "",
            ]
        )

    if reference:
        sections.extend(["【参考基线/导出成稿】", reference[:_MEMOIR_EVIDENCE_MAX], ""])

    sections.extend(["【当前回忆录正文】", memoir[:_MEMOIR_MAX]])
    return "\n".join(sections)
class EvalJudgeService:
    """Run turn, conversation and memoir judging against an injected judge model.

    Every public method short-circuits when no judge model was provided.
    Prompt inputs are truncated to character caps derived from the context
    window stored at construction time.
    """

    def __init__(
        self,
        judge_llm: Any | None,
        *,
        context_window_tokens: int | None = None,
    ) -> None:
        """Store the judge model and the context window used for prompt budgets.

        Args:
            judge_llm: Judge model handle; a falsy value disables judging.
            context_window_tokens: Context window override; when falsy the
                value configured in settings is used instead.
        """
        self._llm = judge_llm
        window = context_window_tokens or settings.eval_judge_context_window_tokens
        self._ctx_tokens = int(window)

    def _conv_transcript_cap(self) -> int:
        """Return the whole-conversation transcript cap for this service's context window."""
        window = self._ctx_tokens
        return eval_judge_conversation_transcript_max_chars_for_context(window)

    def _turn_prior_cap(self) -> int:
        """Return the prior-transcript excerpt cap for this service's context window."""
        window = self._ctx_tokens
        return eval_judge_turn_prior_transcript_max_chars_for_context(window)

    def _compare_each_cap(self) -> int:
        """Return the per-transcript comparison cap for this service's context window."""
        window = self._ctx_tokens
        return eval_judge_compare_transcript_each_max_chars_for_context(window)

    async def judge_turn(
        self,
        *,
        prior_transcript: str,
        user_utterance: str,
        assistant_reply: str,
        turn_index_0: int = 0,
    ) -> TurnJudgeOutput | None:
        """Judge a single turn.

        Args:
            prior_transcript: Transcript up to the previous turn; truncated to
                the prior-transcript cap.
            user_utterance: User text of this turn; truncated to 4000 characters.
            assistant_reply: Assistant text of this turn; truncated to 4000
                characters.
            turn_index_0: Zero-based turn index; negative values are clamped
                to 0 and the prompt shows it one-based.

        Returns:
            The parsed turn verdict, or ``None`` when no judge model is set or
            the call raises ``LLMCallError`` (which is logged as a warning).
        """
        if not self._llm:
            return None

        t = max(0, int(turn_index_0))
        prompt = f"""{TURN_JUDGE_INSTRUCTIONS}
【本轮位置】完整对话中当前轮次为 Turn {t + 1}(与下方节选及全量 transcript 的 `[Turn ...]` 编号一致。evidence_refs.turn_index 请使用该编号。
【截至上一轮的对话节选】(含 `[Turn k]` 标签)
{prior_transcript[: self._turn_prior_cap()]}
【本轮用户】
{user_utterance[:4000]}
【本轮 AI】
{assistant_reply[:4000]}
"""
        try:
            verdict = await allm_json_call(
                self._llm,
                prompt,
                TurnJudgeOutput,
                max_tokens=_TURN_MAX,
                agent="EvalJudgeService.judge_turn",
            )
        except LLMCallError as e:
            logger.warning("turn judge failed: {}", e)
            return None
        return verdict

    async def judge_conversation_result(
        self, *, full_transcript: str
    ) -> JudgeCallResult[ConversationJudgeOutput]:
        """Judge a whole conversation and report either the verdict or an error.

        Args:
            full_transcript: Complete transcript; truncated to the
                whole-conversation cap before it is placed in the prompt.

        Returns:
            A ``JudgeCallResult`` holding the parsed verdict, or an error
            message when no judge model is set or the call raises
            ``LLMCallError``.
        """
        if not self._llm:
            return JudgeCallResult(
                output=None,
                error="评审模型未配置(智谱或 DeepSeek 密钥)",
            )

        prompt = f"""{CONV_JUDGE_INSTRUCTIONS}
【完整对话】(每轮以 `[Turn k]` 开头)
{full_transcript[: self._conv_transcript_cap()]}
"""
        try:
            verdict = await allm_json_call(
                self._llm,
                prompt,
                ConversationJudgeOutput,
                max_tokens=_CONV_JUDGE_JSON_MAX,
                agent="EvalJudgeService.judge_conversation",
            )
        except LLMCallError as e:
            error = _judge_error_message(e)
            logger.warning("conversation judge failed: {}", error)
            return JudgeCallResult(output=None, error=error)
        return JudgeCallResult(output=verdict)

    async def judge_conversation(
        self, *, full_transcript: str
    ) -> ConversationJudgeOutput | None:
        """Judge a whole conversation and return only the verdict (``None`` on any failure)."""
        outcome = await self.judge_conversation_result(full_transcript=full_transcript)
        return outcome.output

    async def stream_conversation_compare(
        self,
        *,
        baseline_transcript: str,
        replay_transcript: str,
        baseline_judge: ConversationJudgeOutput | None,
        replay_judge: ConversationJudgeOutput | None,
    ) -> AsyncIterator[str]:
        """Stream a Chinese-language comparison and suggestions as plain text (not JSON).

        With both verdicts present the prompt compares baseline (A) against
        replay (B); with only the replay verdict it asks for suggestions on
        the replay alone; without a replay verdict a single error chunk is
        yielded. When the model object exposes ``bind``, it is bound with
        ``max_tokens=_COMPARE_STREAM_MAX`` before streaming. An exception
        raised while streaming is logged and reported as a final chunk rather
        than propagated.

        Yields:
            Non-empty ``content`` pieces of the streamed chunks, or a single
            bracketed error message.
        """
        if not self._llm:
            yield "[错误] 未配置评审模型 API Key智谱eval_judge_api_key / zhipu_api_keyDeepSeekdeepseek_api_key"
            return

        cap_each = self._compare_each_cap()
        cap_single = self._conv_transcript_cap()
        b_tr = (baseline_transcript or "").strip()[:cap_each]
        r_tr = (replay_transcript or "").strip()[:cap_each]

        # Missing verdicts are rendered as the JSON literal ``null``.
        b_json = "null"
        if baseline_judge:
            b_json = baseline_judge.model_dump_json(ensure_ascii=False)
        r_json = "null"
        if replay_judge:
            r_json = replay_judge.model_dump_json(ensure_ascii=False)

        if not replay_judge:
            yield "[错误] 缺少回放对话评分,无法生成建议"
            return

        if baseline_judge:
            prompt = f"""你是访谈对话评测专家。下面给出两份完整对话 transcript 及各自的整体打分JSON。请用中文直接写正文不要用 JSON、不要用 Markdown 代码块):
【A导出基准对话】历史快照用户与当时导出的线上 AI多轮合并为一篇
{b_tr}
【B本次回放/新测对话】用户句与基准对齐AI 为当前后端重新生成)
{r_tr}
【A 的整体评分 JSON】
{b_json}
【B 的整体评分 JSON】
{r_json}
请依次撰写:
1) 两段对话在整体体验上的主要差异(情绪承接、信息挖掘、人物建模、访谈结构、提问质量、上下文与重复盘问等);
2) B 相对 A 的优点与不足;
3) 若 B 在关键维度明显弱于 A给出可操作的改进方向系统提示、访谈策略、模型或温度等
笔调简洁、偏执行清单。"""
        else:
            # Only the replay was judged: a single transcript gets the larger cap.
            r_one = (replay_transcript or "").strip()[:cap_single]
            prompt = f"""{COMPARE_CONV_STREAM_HINT}
【回放/新测 transcript】
{r_one}
【整体评分 JSON】
{r_json}
"""

        llm = self._llm
        if hasattr(llm, "bind"):
            llm = llm.bind(max_tokens=_COMPARE_STREAM_MAX)

        try:
            async for chunk in llm.astream(prompt):
                piece = getattr(chunk, "content", None)
                if piece:
                    yield piece
        except Exception as e:
            # Best-effort stream: report the interruption in-band instead of raising.
            logger.warning("conversation compare stream failed: {}", e)
            yield f"\n\n[流式输出中断:{e}]"

    async def judge_memoir(
        self,
        *,
        memoir_markdown: str,
        source_transcript: str = "",
        structured_evidence: str = "",
        reference_memoir_markdown: str = "",
        evidence_notes: str = "",
    ) -> MemoirJudgeOutput | None:
        """Judge a memoir and return only the verdict (``None`` on any failure).

        Arguments are forwarded unchanged to ``judge_memoir_result``.
        """
        outcome = await self.judge_memoir_result(
            memoir_markdown=memoir_markdown,
            source_transcript=source_transcript,
            structured_evidence=structured_evidence,
            reference_memoir_markdown=reference_memoir_markdown,
            evidence_notes=evidence_notes,
        )
        return outcome.output

    async def judge_memoir_result(
        self,
        *,
        memoir_markdown: str,
        source_transcript: str = "",
        structured_evidence: str = "",
        reference_memoir_markdown: str = "",
        evidence_notes: str = "",
    ) -> JudgeCallResult[MemoirJudgeOutput]:
        """Judge a memoir against its evidence and report the verdict or an error.

        The prompt is assembled by ``_build_memoir_judge_prompt`` from the
        given memoir, evidence, reference and notes.

        Returns:
            A ``JudgeCallResult`` holding the parsed verdict, or an error
            message when no judge model is set or the call raises
            ``LLMCallError``.
        """
        if not self._llm:
            return JudgeCallResult(
                output=None,
                error="评审模型未配置(智谱或 DeepSeek 密钥)",
            )

        prompt = _build_memoir_judge_prompt(
            memoir_markdown=memoir_markdown,
            source_transcript=source_transcript,
            structured_evidence=structured_evidence,
            reference_memoir_markdown=reference_memoir_markdown,
            evidence_notes=evidence_notes,
        )
        try:
            verdict = await allm_json_call(
                self._llm,
                prompt,
                MemoirJudgeOutput,
                max_tokens=_MEMOIR_JSON_MAX,
                agent="EvalJudgeService.judge_memoir",
            )
        except LLMCallError as e:
            error = _judge_error_message(e)
            logger.warning("memoir judge failed: {}", error)
            return JudgeCallResult(output=None, error=error)
        return JudgeCallResult(output=verdict)