2026-04-08 21:36:12 +08:00
|
|
|
|
"""评测台评审:智谱 / DeepSeek 等 OpenAI 兼容端点(结构化 JSON)。"""
|
2026-04-03 14:44:46 +08:00
|
|
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
2026-04-06 23:19:20 +08:00
|
|
|
|
from collections.abc import AsyncIterator
|
2026-04-07 17:15:01 +08:00
|
|
|
|
from dataclasses import dataclass
|
|
|
|
|
|
from typing import Any, Generic, TypeVar
|
2026-04-03 14:44:46 +08:00
|
|
|
|
|
2026-04-08 09:38:07 +08:00
|
|
|
|
from app.core.config import settings
|
2026-04-27 14:34:30 +08:00
|
|
|
|
from app.core.eval_judge_spec import EvalJudgeProvider
|
2026-04-03 14:44:46 +08:00
|
|
|
|
from app.core.llm_call import LLMCallError, allm_json_call
|
|
|
|
|
|
from app.core.logging import get_logger
|
|
|
|
|
|
from app.features.evaluation.judge_schemas import (
|
|
|
|
|
|
ConversationJudgeOutput,
|
|
|
|
|
|
MemoirJudgeOutput,
|
|
|
|
|
|
TurnJudgeOutput,
|
|
|
|
|
|
)
|
|
|
|
|
|
from app.features.evaluation.rubrics.conversation_v1 import (
|
2026-04-06 23:19:20 +08:00
|
|
|
|
COMPARE_CONV_STREAM_HINT,
|
2026-04-03 14:44:46 +08:00
|
|
|
|
CONV_JUDGE_INSTRUCTIONS,
|
|
|
|
|
|
TURN_JUDGE_INSTRUCTIONS,
|
|
|
|
|
|
)
|
|
|
|
|
|
from app.features.evaluation.rubrics.memoir_v1 import MEMOIR_JUDGE_INSTRUCTIONS
|
|
|
|
|
|
|
|
|
|
|
|
logger = get_logger(__name__)
|
|
|
|
|
|
|
2026-04-07 17:15:01 +08:00
|
|
|
|
TJudgeOutput = TypeVar(
|
|
|
|
|
|
"TJudgeOutput", TurnJudgeOutput, ConversationJudgeOutput, MemoirJudgeOutput
|
|
|
|
|
|
)
|
|
|
|
|
|
|
2026-04-03 14:44:46 +08:00
|
|
|
|
_TURN_MAX = 768
|
2026-04-06 23:19:20 +08:00
|
|
|
|
_CONV_JUDGE_JSON_MAX = 2048
|
2026-04-08 09:38:07 +08:00
|
|
|
|
_CONV_HEADER = "【完整对话】(每轮以 `[Turn k]` 开头)\n\n"
|
2026-04-06 23:19:20 +08:00
|
|
|
|
_COMPARE_STREAM_MAX = 6144
|
2026-04-07 10:34:59 +08:00
|
|
|
|
|
|
|
|
|
|
|
2026-04-08 21:36:12 +08:00
|
|
|
|
def _eval_judge_prompt_char_pool_for_context(context_window_tokens: int) -> int:
|
2026-04-08 09:38:07 +08:00
|
|
|
|
"""整段请求的字符预算(由评审模型 context window 推导,保守)。"""
|
|
|
|
|
|
toks = (
|
2026-04-08 21:36:12 +08:00
|
|
|
|
int(context_window_tokens)
|
2026-04-08 09:38:07 +08:00
|
|
|
|
- settings.eval_judge_completion_reserve_tokens
|
|
|
|
|
|
- settings.eval_judge_prompt_budget_safety_tokens
|
|
|
|
|
|
)
|
|
|
|
|
|
toks = max(1, toks)
|
|
|
|
|
|
return max(1, int(toks / settings.eval_judge_approx_tokens_per_char))
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-08 21:36:12 +08:00
|
|
|
|
def _eval_judge_prompt_char_pool() -> int:
|
|
|
|
|
|
return _eval_judge_prompt_char_pool_for_context(
|
|
|
|
|
|
settings.eval_judge_context_window_tokens
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-08 09:38:07 +08:00
|
|
|
|
def eval_judge_conversation_transcript_max_chars() -> int:
|
2026-04-08 21:36:12 +08:00
|
|
|
|
"""整段对话评审:【完整对话】transcript 最大字符数(默认 GLM 上下文)。"""
|
2026-04-08 09:38:07 +08:00
|
|
|
|
if settings.eval_judge_max_transcript_chars > 0:
|
|
|
|
|
|
return settings.eval_judge_max_transcript_chars
|
|
|
|
|
|
overhead = len(CONV_JUDGE_INSTRUCTIONS) + len(_CONV_HEADER) + 32
|
|
|
|
|
|
return max(1, _eval_judge_prompt_char_pool() - overhead)
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-08 21:36:12 +08:00
|
|
|
|
def eval_judge_conversation_transcript_max_chars_for_context(
|
|
|
|
|
|
context_window_tokens: int,
|
|
|
|
|
|
) -> int:
|
|
|
|
|
|
if settings.eval_judge_max_transcript_chars > 0:
|
|
|
|
|
|
return settings.eval_judge_max_transcript_chars
|
|
|
|
|
|
overhead = len(CONV_JUDGE_INSTRUCTIONS) + len(_CONV_HEADER) + 32
|
|
|
|
|
|
pool = _eval_judge_prompt_char_pool_for_context(context_window_tokens)
|
|
|
|
|
|
return max(1, pool - overhead)
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-08 09:38:07 +08:00
|
|
|
|
def eval_judge_turn_prior_transcript_max_chars() -> int:
|
2026-04-08 21:36:12 +08:00
|
|
|
|
"""逐轮评审:截至上一轮的 transcript 节选上限(默认 GLM 上下文)。"""
|
2026-04-08 09:38:07 +08:00
|
|
|
|
if settings.eval_judge_max_transcript_chars > 0:
|
|
|
|
|
|
return settings.eval_judge_max_transcript_chars
|
|
|
|
|
|
static = len(TURN_JUDGE_INSTRUCTIONS) + 8800
|
|
|
|
|
|
return max(1, _eval_judge_prompt_char_pool() - static)
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-08 21:36:12 +08:00
|
|
|
|
def eval_judge_turn_prior_transcript_max_chars_for_context(
|
|
|
|
|
|
context_window_tokens: int,
|
|
|
|
|
|
) -> int:
|
|
|
|
|
|
if settings.eval_judge_max_transcript_chars > 0:
|
|
|
|
|
|
return settings.eval_judge_max_transcript_chars
|
|
|
|
|
|
static = len(TURN_JUDGE_INSTRUCTIONS) + 8800
|
|
|
|
|
|
pool = _eval_judge_prompt_char_pool_for_context(context_window_tokens)
|
|
|
|
|
|
return max(1, pool - static)
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-08 09:38:07 +08:00
|
|
|
|
def eval_judge_compare_transcript_each_max_chars() -> int:
|
2026-04-09 15:32:35 +08:00
|
|
|
|
"""单侧对称参考上限(默认与 settings.eval_judge_context_window_tokens 一致)。"""
|
|
|
|
|
|
return eval_judge_compare_transcript_each_max_chars_for_context(
|
|
|
|
|
|
settings.eval_judge_context_window_tokens
|
2026-04-08 21:36:12 +08:00
|
|
|
|
)
|
2026-04-09 15:32:35 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def eval_judge_compare_transcript_pair_total_budget_for_context(
|
|
|
|
|
|
context_window_tokens: int,
|
|
|
|
|
|
) -> int:
|
|
|
|
|
|
"""A/B 同 prompt 时,两份 transcript 合计最大字符数(已扣对比模板与双份 JSON 等开销)。"""
|
|
|
|
|
|
if settings.eval_judge_max_compare_transcript_chars_each > 0:
|
|
|
|
|
|
return max(1, 2 * int(settings.eval_judge_max_compare_transcript_chars_each))
|
|
|
|
|
|
pool = _eval_judge_prompt_char_pool_for_context(context_window_tokens)
|
|
|
|
|
|
return max(1, pool - int(settings.eval_judge_compare_prompt_overhead_chars))
|
2026-04-08 21:36:12 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def eval_judge_compare_transcript_each_max_chars_for_context(
|
|
|
|
|
|
context_window_tokens: int,
|
|
|
|
|
|
) -> int:
|
2026-04-09 15:32:35 +08:00
|
|
|
|
"""单侧对称上限的参考值(auto 模式下约为合计预算的一半;供兼容与展示)。"""
|
2026-04-08 21:36:12 +08:00
|
|
|
|
if settings.eval_judge_max_compare_transcript_chars_each > 0:
|
2026-04-09 15:32:35 +08:00
|
|
|
|
return int(settings.eval_judge_max_compare_transcript_chars_each)
|
|
|
|
|
|
total = eval_judge_compare_transcript_pair_total_budget_for_context(
|
|
|
|
|
|
context_window_tokens
|
2026-04-08 21:36:12 +08:00
|
|
|
|
)
|
2026-04-09 15:32:35 +08:00
|
|
|
|
return max(1, total // 2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def eval_judge_compare_bundle_caps(
|
|
|
|
|
|
context_window_tokens: int,
|
|
|
|
|
|
) -> tuple[int, int | None]:
|
|
|
|
|
|
"""返回 (compare_cap_total, per_side_cap|None),供 Playground 摘要与流式对比共用。"""
|
|
|
|
|
|
per = int(settings.eval_judge_max_compare_transcript_chars_each or 0)
|
|
|
|
|
|
if per > 0:
|
|
|
|
|
|
return max(1, 2 * per), per
|
|
|
|
|
|
return eval_judge_compare_transcript_pair_total_budget_for_context(
|
|
|
|
|
|
context_window_tokens
|
|
|
|
|
|
), None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def trim_compare_transcript_pair(
|
|
|
|
|
|
baseline: str,
|
|
|
|
|
|
replay: str,
|
|
|
|
|
|
*,
|
|
|
|
|
|
total_max_chars: int,
|
|
|
|
|
|
per_side_max_chars: int | None = None,
|
|
|
|
|
|
) -> tuple[str, str, bool, bool]:
|
|
|
|
|
|
"""A/B 对比 prompt 用:在合计预算内尽量保留全文;仅超长时优先从较长的一侧裁尾部。
|
|
|
|
|
|
|
|
|
|
|
|
若配置了 eval_judge_max_compare_transcript_chars_each,则仍按单侧硬顶(与旧行为一致)。
|
|
|
|
|
|
"""
|
|
|
|
|
|
b = (baseline or "").strip()
|
|
|
|
|
|
r = (replay or "").strip()
|
|
|
|
|
|
if per_side_max_chars is not None and int(per_side_max_chars) > 0:
|
|
|
|
|
|
cap = int(per_side_max_chars)
|
|
|
|
|
|
return b[:cap], r[:cap], len(b) > cap, len(r) > cap
|
|
|
|
|
|
|
|
|
|
|
|
cap_total = max(1, int(total_max_chars))
|
|
|
|
|
|
if len(b) + len(r) <= cap_total:
|
|
|
|
|
|
return b, r, False, False
|
|
|
|
|
|
|
|
|
|
|
|
need_drop = len(b) + len(r) - cap_total
|
|
|
|
|
|
b2, r2 = b, r
|
|
|
|
|
|
while need_drop > 0 and (b2 or r2):
|
|
|
|
|
|
if len(b2) >= len(r2):
|
|
|
|
|
|
if b2:
|
|
|
|
|
|
b2 = b2[:-1]
|
|
|
|
|
|
need_drop -= 1
|
|
|
|
|
|
elif r2:
|
|
|
|
|
|
r2 = r2[:-1]
|
|
|
|
|
|
need_drop -= 1
|
|
|
|
|
|
else:
|
|
|
|
|
|
break
|
|
|
|
|
|
else:
|
|
|
|
|
|
if r2:
|
|
|
|
|
|
r2 = r2[:-1]
|
|
|
|
|
|
need_drop -= 1
|
|
|
|
|
|
elif b2:
|
|
|
|
|
|
b2 = b2[:-1]
|
|
|
|
|
|
need_drop -= 1
|
|
|
|
|
|
else:
|
|
|
|
|
|
break
|
|
|
|
|
|
return b2, r2, len(b) > len(b2), len(r) > len(r2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_CONV_JUDGE_TRANSCRIPT_TRUNCATION_TAIL = (
|
|
|
|
|
|
"\n\n【评审边界——输入已为截断稿】\n"
|
|
|
|
|
|
"以上仅为全文前 {n} 个字符,其后未提供给模型。"
|
|
|
|
|
|
"对依赖长程多轮轨迹的细项(尤其 context_memory、interview_structure、跨轮重复盘问)"
|
|
|
|
|
|
"必须保守给分(倾向区间中低),并在 insufficient_evidence 写明「输入为截断稿,长程证据不足」;"
|
|
|
|
|
|
"不得臆断未展示轮次中的行为;confidence 须显著降低;禁止因未见问题而默认高分或推断后半段无缺陷。\n"
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
_TURN_PRIOR_TRUNCATION_TAIL = (
|
|
|
|
|
|
"\n\n【评审边界——上文节选已截断】\n"
|
|
|
|
|
|
"「截至上一轮」节选可能仅为更长对话的前 {n} 字;跨轮重复、长程结构若无法从节选核实,"
|
|
|
|
|
|
"须在 insufficient_evidence 说明,并对相关细项保守给分。\n"
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
_COMPARE_STREAM_PAIR_TRUNCATION_NOTE = (
|
|
|
|
|
|
"\n【评审边界】以下 A/B transcript 至少一侧为截断稿,请仅就**已展示片段**比较;"
|
|
|
|
|
|
"不得断言未展示轮次的优劣;涉及跨轮重复盘问等须明确证据范围或说不足以判断。\n"
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def conversation_judge_transcript_excerpt(full_transcript: str, cap: int) -> str:
|
|
|
|
|
|
"""整段评审:在 cap 内截断时在正文后附加边界说明,减少「假装看了全文」的幻觉打分。"""
|
|
|
|
|
|
raw = (full_transcript or "").strip()
|
|
|
|
|
|
c = max(0, int(cap))
|
|
|
|
|
|
if len(raw) <= c:
|
|
|
|
|
|
return raw
|
|
|
|
|
|
return raw[:c] + _CONV_JUDGE_TRANSCRIPT_TRUNCATION_TAIL.format(n=c)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def turn_judge_prior_excerpt(prior_transcript: str, cap: int) -> str:
|
|
|
|
|
|
"""逐轮评审里「截至上一轮」节选;截断时附加边界说明。"""
|
|
|
|
|
|
raw = (prior_transcript or "").strip()
|
|
|
|
|
|
c = max(0, int(cap))
|
|
|
|
|
|
if len(raw) <= c:
|
|
|
|
|
|
return raw
|
|
|
|
|
|
return raw[:c] + _TURN_PRIOR_TRUNCATION_TAIL.format(n=c)
|
2026-04-08 09:38:07 +08:00
|
|
|
|
|
|
|
|
|
|
|
2026-04-07 17:15:01 +08:00
|
|
|
|
@dataclass(slots=True)
|
|
|
|
|
|
class JudgeCallResult(Generic[TJudgeOutput]):
|
|
|
|
|
|
output: TJudgeOutput | None
|
|
|
|
|
|
error: str | None = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _judge_error_message(e: LLMCallError) -> str:
|
|
|
|
|
|
prefix = {
|
|
|
|
|
|
"invoke": "模型调用失败",
|
|
|
|
|
|
"decode": "JSON 解析失败",
|
|
|
|
|
|
"validation": "结果校验失败",
|
|
|
|
|
|
}.get(e.kind, "评审失败")
|
|
|
|
|
|
detail = str(e).strip()
|
|
|
|
|
|
return f"{prefix}: {detail}" if detail else prefix
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-07 10:34:59 +08:00
|
|
|
|
def _build_memoir_judge_prompt(
|
|
|
|
|
|
*,
|
|
|
|
|
|
memoir_markdown: str,
|
|
|
|
|
|
source_transcript: str = "",
|
2026-04-08 15:37:09 +08:00
|
|
|
|
structured_evidence: str = "",
|
2026-04-07 10:34:59 +08:00
|
|
|
|
reference_memoir_markdown: str = "",
|
|
|
|
|
|
evidence_notes: str = "",
|
|
|
|
|
|
) -> str:
|
|
|
|
|
|
"""Assemble an evidence-aware memoir judging prompt."""
|
|
|
|
|
|
source = (source_transcript or "").strip()
|
2026-04-08 15:37:09 +08:00
|
|
|
|
struct = (structured_evidence or "").strip()
|
2026-04-07 10:34:59 +08:00
|
|
|
|
reference = (reference_memoir_markdown or "").strip()
|
|
|
|
|
|
notes = (evidence_notes or "").strip()
|
2026-04-07 17:15:01 +08:00
|
|
|
|
sections = [
|
|
|
|
|
|
MEMOIR_JUDGE_INSTRUCTIONS,
|
|
|
|
|
|
"",
|
2026-04-08 15:37:09 +08:00
|
|
|
|
"【证据与输入顺序】以下区块按优先级给出:"
|
|
|
|
|
|
"评审说明(若有)→ 原始访谈/对话证据(segment 绑定)→ 结构化记忆证据(chunk/fact/timeline/summary)"
|
|
|
|
|
|
"→ 参考基线(若有)→ 待评成稿。**真实性、覆盖率、可追溯性以「artifact 绑定证据闭包」为准**;"
|
2026-04-30 14:11:46 +08:00
|
|
|
|
"若证据不足,须保守打分并写 `insufficient_evidence`。",
|
2026-04-07 17:15:01 +08:00
|
|
|
|
"",
|
|
|
|
|
|
]
|
feat(eval): memoir A/B chapter judging and eval-web parity with dialogue
- Judge baseline excerpt and library chapter separately; build_memoir_compare_summary for gate, nine-dim and leaf deltas.
- Memoir SSE chapter payload: baseline_judge, compare_summary, baseline_judge_error.
- MemoirJudgeOutput: loose score coercion and post-validate clamp; memoir judge prompt caps from settings.
- app-eval-web: two-column MemoirScoreCard layout, MemoirCompareSummary, chapter blocks and CSS.
- Add memoir_compare_summary, log_events, celery_log_context, memoir_pipeline_progress; tests and migration 0014.
- Misc: memory/evidence and enrichment paths, task/orchestrator updates, internal-eval docs, env examples.
2026-04-10 10:23:43 +08:00
|
|
|
|
ev_cap = max(1, int(settings.eval_judge_memoir_evidence_max_chars))
|
|
|
|
|
|
body_cap = max(1, int(settings.eval_judge_memoir_body_max_chars))
|
2026-04-07 10:34:59 +08:00
|
|
|
|
if notes:
|
|
|
|
|
|
sections.extend(["【评审说明】", notes[:1200], ""])
|
|
|
|
|
|
if source:
|
feat(eval): memoir A/B chapter judging and eval-web parity with dialogue
- Judge baseline excerpt and library chapter separately; build_memoir_compare_summary for gate, nine-dim and leaf deltas.
- Memoir SSE chapter payload: baseline_judge, compare_summary, baseline_judge_error.
- MemoirJudgeOutput: loose score coercion and post-validate clamp; memoir judge prompt caps from settings.
- app-eval-web: two-column MemoirScoreCard layout, MemoirCompareSummary, chapter blocks and CSS.
- Add memoir_compare_summary, log_events, celery_log_context, memoir_pipeline_progress; tests and migration 0014.
- Misc: memory/evidence and enrichment paths, task/orchestrator updates, internal-eval docs, env examples.
2026-04-10 10:23:43 +08:00
|
|
|
|
sections.extend(["【原始访谈/对话证据】", source[:ev_cap], ""])
|
2026-04-07 10:34:59 +08:00
|
|
|
|
else:
|
|
|
|
|
|
sections.extend(
|
|
|
|
|
|
[
|
2026-04-08 15:37:09 +08:00
|
|
|
|
"【原始访谈/对话证据】",
|
|
|
|
|
|
"无可用局部对话证据。对于记忆忠实度、事实准确性、事实覆盖率、记忆可追溯性,必须保守打分,不得凭空高分。",
|
|
|
|
|
|
"",
|
|
|
|
|
|
]
|
|
|
|
|
|
)
|
|
|
|
|
|
if struct:
|
2026-04-10 10:28:41 +08:00
|
|
|
|
sections.extend(["【结构化记忆证据】", struct[:ev_cap], ""])
|
2026-04-08 15:37:09 +08:00
|
|
|
|
else:
|
|
|
|
|
|
sections.extend(
|
|
|
|
|
|
[
|
|
|
|
|
|
"【结构化记忆证据】",
|
|
|
|
|
|
"(本 artifact 未绑定或未解析到 chunk/fact/timeline/summary 证据。)",
|
2026-04-07 10:34:59 +08:00
|
|
|
|
"",
|
|
|
|
|
|
]
|
|
|
|
|
|
)
|
|
|
|
|
|
if reference:
|
feat(eval): memoir A/B chapter judging and eval-web parity with dialogue
- Judge baseline excerpt and library chapter separately; build_memoir_compare_summary for gate, nine-dim and leaf deltas.
- Memoir SSE chapter payload: baseline_judge, compare_summary, baseline_judge_error.
- MemoirJudgeOutput: loose score coercion and post-validate clamp; memoir judge prompt caps from settings.
- app-eval-web: two-column MemoirScoreCard layout, MemoirCompareSummary, chapter blocks and CSS.
- Add memoir_compare_summary, log_events, celery_log_context, memoir_pipeline_progress; tests and migration 0014.
- Misc: memory/evidence and enrichment paths, task/orchestrator updates, internal-eval docs, env examples.
2026-04-10 10:23:43 +08:00
|
|
|
|
sections.extend(["【参考基线/导出成稿】", reference[:ev_cap], ""])
|
|
|
|
|
|
sections.extend(["【当前回忆录正文】", memoir_markdown[:body_cap]])
|
2026-04-07 10:34:59 +08:00
|
|
|
|
return "\n".join(sections)
|
2026-04-03 14:44:46 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class EvalJudgeService:
|
2026-04-08 21:36:12 +08:00
|
|
|
|
def __init__(
|
|
|
|
|
|
self,
|
|
|
|
|
|
judge_llm: Any | None,
|
|
|
|
|
|
*,
|
|
|
|
|
|
context_window_tokens: int | None = None,
|
2026-04-27 14:34:30 +08:00
|
|
|
|
http_error_vendor: EvalJudgeProvider = "deepseek",
|
2026-04-08 21:36:12 +08:00
|
|
|
|
) -> None:
|
2026-04-03 14:44:46 +08:00
|
|
|
|
self._llm = judge_llm
|
2026-04-27 14:34:30 +08:00
|
|
|
|
self._http_error_vendor: EvalJudgeProvider = http_error_vendor
|
2026-04-08 21:36:12 +08:00
|
|
|
|
self._ctx_tokens = int(
|
|
|
|
|
|
context_window_tokens or settings.eval_judge_context_window_tokens
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
def _conv_transcript_cap(self) -> int:
|
|
|
|
|
|
return eval_judge_conversation_transcript_max_chars_for_context(
|
|
|
|
|
|
self._ctx_tokens
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
def _turn_prior_cap(self) -> int:
|
2026-04-09 15:32:35 +08:00
|
|
|
|
return eval_judge_turn_prior_transcript_max_chars_for_context(self._ctx_tokens)
|
2026-04-03 14:44:46 +08:00
|
|
|
|
|
|
|
|
|
|
async def judge_turn(
|
|
|
|
|
|
self,
|
|
|
|
|
|
*,
|
|
|
|
|
|
prior_transcript: str,
|
|
|
|
|
|
user_utterance: str,
|
|
|
|
|
|
assistant_reply: str,
|
2026-04-07 17:15:01 +08:00
|
|
|
|
turn_index_0: int = 0,
|
2026-04-03 14:44:46 +08:00
|
|
|
|
) -> TurnJudgeOutput | None:
|
|
|
|
|
|
if not self._llm:
|
|
|
|
|
|
return None
|
2026-04-07 17:15:01 +08:00
|
|
|
|
t = max(0, int(turn_index_0))
|
2026-04-03 14:44:46 +08:00
|
|
|
|
prompt = f"""{TURN_JUDGE_INSTRUCTIONS}
|
|
|
|
|
|
|
2026-04-07 17:15:01 +08:00
|
|
|
|
【本轮位置】完整对话中当前轮次为 Turn {t + 1}(与下方节选及全量 transcript 的 `[Turn ...]` 编号一致)。evidence_refs.turn_index 请使用该编号。
|
|
|
|
|
|
|
|
|
|
|
|
【截至上一轮的对话节选】(含 `[Turn k]` 标签)
|
2026-04-09 15:32:35 +08:00
|
|
|
|
{turn_judge_prior_excerpt(prior_transcript, self._turn_prior_cap())}
|
2026-04-03 14:44:46 +08:00
|
|
|
|
|
|
|
|
|
|
【本轮用户】
|
|
|
|
|
|
{user_utterance[:4000]}
|
|
|
|
|
|
|
|
|
|
|
|
【本轮 AI】
|
|
|
|
|
|
{assistant_reply[:4000]}
|
|
|
|
|
|
"""
|
|
|
|
|
|
try:
|
|
|
|
|
|
return await allm_json_call(
|
|
|
|
|
|
self._llm,
|
|
|
|
|
|
prompt,
|
|
|
|
|
|
TurnJudgeOutput,
|
|
|
|
|
|
max_tokens=_TURN_MAX,
|
|
|
|
|
|
agent="EvalJudgeService.judge_turn",
|
2026-04-27 14:34:30 +08:00
|
|
|
|
http_error_vendor=self._http_error_vendor,
|
2026-04-03 14:44:46 +08:00
|
|
|
|
)
|
|
|
|
|
|
except LLMCallError as e:
|
|
|
|
|
|
logger.warning("turn judge failed: {}", e)
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
2026-04-07 17:15:01 +08:00
|
|
|
|
async def judge_conversation_result(
|
2026-04-03 14:44:46 +08:00
|
|
|
|
self, *, full_transcript: str
|
2026-04-07 17:15:01 +08:00
|
|
|
|
) -> JudgeCallResult[ConversationJudgeOutput]:
|
2026-04-03 14:44:46 +08:00
|
|
|
|
if not self._llm:
|
2026-04-08 21:36:12 +08:00
|
|
|
|
return JudgeCallResult(
|
|
|
|
|
|
output=None,
|
|
|
|
|
|
error="评审模型未配置(智谱或 DeepSeek 密钥)",
|
|
|
|
|
|
)
|
2026-04-03 14:44:46 +08:00
|
|
|
|
prompt = f"""{CONV_JUDGE_INSTRUCTIONS}
|
|
|
|
|
|
|
2026-04-07 17:15:01 +08:00
|
|
|
|
【完整对话】(每轮以 `[Turn k]` 开头)
|
2026-04-09 15:32:35 +08:00
|
|
|
|
{conversation_judge_transcript_excerpt(full_transcript, self._conv_transcript_cap())}
|
2026-04-03 14:44:46 +08:00
|
|
|
|
"""
|
|
|
|
|
|
try:
|
2026-04-07 17:15:01 +08:00
|
|
|
|
out = await allm_json_call(
|
2026-04-03 14:44:46 +08:00
|
|
|
|
self._llm,
|
|
|
|
|
|
prompt,
|
|
|
|
|
|
ConversationJudgeOutput,
|
2026-04-06 23:19:20 +08:00
|
|
|
|
max_tokens=_CONV_JUDGE_JSON_MAX,
|
2026-04-03 14:44:46 +08:00
|
|
|
|
agent="EvalJudgeService.judge_conversation",
|
2026-04-27 14:34:30 +08:00
|
|
|
|
http_error_vendor=self._http_error_vendor,
|
2026-04-03 14:44:46 +08:00
|
|
|
|
)
|
2026-04-07 17:15:01 +08:00
|
|
|
|
return JudgeCallResult(output=out)
|
2026-04-03 14:44:46 +08:00
|
|
|
|
except LLMCallError as e:
|
2026-04-07 17:15:01 +08:00
|
|
|
|
error = _judge_error_message(e)
|
|
|
|
|
|
logger.warning("conversation judge failed: {}", error)
|
|
|
|
|
|
return JudgeCallResult(output=None, error=error)
|
|
|
|
|
|
|
|
|
|
|
|
async def judge_conversation(
|
|
|
|
|
|
self, *, full_transcript: str
|
|
|
|
|
|
) -> ConversationJudgeOutput | None:
|
|
|
|
|
|
result = await self.judge_conversation_result(full_transcript=full_transcript)
|
|
|
|
|
|
return result.output
|
2026-04-03 14:44:46 +08:00
|
|
|
|
|
2026-04-06 23:19:20 +08:00
|
|
|
|
async def stream_conversation_compare(
|
|
|
|
|
|
self,
|
|
|
|
|
|
*,
|
|
|
|
|
|
baseline_transcript: str,
|
|
|
|
|
|
replay_transcript: str,
|
|
|
|
|
|
baseline_judge: ConversationJudgeOutput | None,
|
|
|
|
|
|
replay_judge: ConversationJudgeOutput | None,
|
|
|
|
|
|
) -> AsyncIterator[str]:
|
|
|
|
|
|
"""流式输出中文对比与建议(非 JSON)。"""
|
|
|
|
|
|
if not self._llm:
|
2026-04-08 21:36:12 +08:00
|
|
|
|
yield "[错误] 未配置评审模型 API Key(智谱:eval_judge_api_key / zhipu_api_key;DeepSeek:deepseek_api_key)"
|
2026-04-06 23:19:20 +08:00
|
|
|
|
return
|
2026-04-09 15:32:35 +08:00
|
|
|
|
cap_total, per_side = eval_judge_compare_bundle_caps(self._ctx_tokens)
|
2026-04-08 21:36:12 +08:00
|
|
|
|
cap_single = self._conv_transcript_cap()
|
2026-04-09 15:32:35 +08:00
|
|
|
|
b_tr, r_tr, b_cmp_trunc, r_cmp_trunc = trim_compare_transcript_pair(
|
|
|
|
|
|
baseline_transcript or "",
|
|
|
|
|
|
replay_transcript or "",
|
|
|
|
|
|
total_max_chars=cap_total,
|
|
|
|
|
|
per_side_max_chars=per_side,
|
|
|
|
|
|
)
|
|
|
|
|
|
compare_pair_truncated = b_cmp_trunc or r_cmp_trunc
|
2026-04-06 23:19:20 +08:00
|
|
|
|
b_json = (
|
|
|
|
|
|
baseline_judge.model_dump_json(ensure_ascii=False)
|
|
|
|
|
|
if baseline_judge
|
|
|
|
|
|
else "null"
|
|
|
|
|
|
)
|
|
|
|
|
|
r_json = (
|
|
|
|
|
|
replay_judge.model_dump_json(ensure_ascii=False) if replay_judge else "null"
|
|
|
|
|
|
)
|
|
|
|
|
|
if baseline_judge and replay_judge:
|
2026-04-09 15:32:35 +08:00
|
|
|
|
trunc_line = (
|
|
|
|
|
|
_COMPARE_STREAM_PAIR_TRUNCATION_NOTE if compare_pair_truncated else ""
|
|
|
|
|
|
)
|
|
|
|
|
|
prompt = f"""你是访谈对话评测专家。下面给出两份对话 transcript 及各自的整体打分(JSON)。请用中文直接写正文(不要用 JSON、不要用 Markdown 代码块):
|
2026-04-06 23:19:20 +08:00
|
|
|
|
|
|
|
|
|
|
【A:导出基准对话】(历史快照:用户与当时导出的线上 AI,多轮合并为一篇)
|
|
|
|
|
|
{b_tr}
|
|
|
|
|
|
|
|
|
|
|
|
【B:本次回放/新测对话】(用户句与基准对齐,AI 为当前后端重新生成)
|
|
|
|
|
|
{r_tr}
|
2026-04-09 15:32:35 +08:00
|
|
|
|
{trunc_line}
|
2026-04-06 23:19:20 +08:00
|
|
|
|
【A 的整体评分 JSON】
|
|
|
|
|
|
{b_json}
|
|
|
|
|
|
|
|
|
|
|
|
【B 的整体评分 JSON】
|
|
|
|
|
|
{r_json}
|
|
|
|
|
|
|
|
|
|
|
|
请依次撰写:
|
2026-04-07 10:34:59 +08:00
|
|
|
|
1) 两段对话在整体体验上的主要差异(情绪承接、信息挖掘、人物建模、访谈结构、提问质量、上下文与重复盘问等);
|
2026-04-06 23:19:20 +08:00
|
|
|
|
2) B 相对 A 的优点与不足;
|
|
|
|
|
|
3) 若 B 在关键维度明显弱于 A,给出可操作的改进方向(系统提示、访谈策略、模型或温度等)。
|
|
|
|
|
|
|
|
|
|
|
|
笔调简洁、偏执行清单。"""
|
|
|
|
|
|
elif replay_judge:
|
2026-04-09 15:32:35 +08:00
|
|
|
|
r_one = conversation_judge_transcript_excerpt(
|
|
|
|
|
|
replay_transcript or "", cap_single
|
|
|
|
|
|
)
|
2026-04-06 23:19:20 +08:00
|
|
|
|
prompt = f"""{COMPARE_CONV_STREAM_HINT}
|
|
|
|
|
|
|
|
|
|
|
|
【回放/新测 transcript】
|
2026-04-08 09:38:07 +08:00
|
|
|
|
{r_one}
|
2026-04-06 23:19:20 +08:00
|
|
|
|
|
|
|
|
|
|
【整体评分 JSON】
|
|
|
|
|
|
{r_json}
|
|
|
|
|
|
"""
|
|
|
|
|
|
else:
|
|
|
|
|
|
yield "[错误] 缺少回放对话评分,无法生成建议"
|
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
|
|
llm = self._llm
|
|
|
|
|
|
if hasattr(llm, "bind"):
|
|
|
|
|
|
llm = llm.bind(max_tokens=_COMPARE_STREAM_MAX)
|
|
|
|
|
|
try:
|
|
|
|
|
|
async for chunk in llm.astream(prompt):
|
|
|
|
|
|
piece = getattr(chunk, "content", None)
|
|
|
|
|
|
if piece:
|
|
|
|
|
|
yield piece
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
|
logger.warning("conversation compare stream failed: {}", e)
|
|
|
|
|
|
yield f"\n\n[流式输出中断:{e}]"
|
|
|
|
|
|
|
2026-04-07 10:34:59 +08:00
|
|
|
|
async def judge_memoir(
|
|
|
|
|
|
self,
|
|
|
|
|
|
*,
|
|
|
|
|
|
memoir_markdown: str,
|
|
|
|
|
|
source_transcript: str = "",
|
2026-04-08 15:37:09 +08:00
|
|
|
|
structured_evidence: str = "",
|
2026-04-07 10:34:59 +08:00
|
|
|
|
reference_memoir_markdown: str = "",
|
|
|
|
|
|
evidence_notes: str = "",
|
|
|
|
|
|
) -> MemoirJudgeOutput | None:
|
2026-04-07 17:15:01 +08:00
|
|
|
|
result = await self.judge_memoir_result(
|
|
|
|
|
|
memoir_markdown=memoir_markdown,
|
|
|
|
|
|
source_transcript=source_transcript,
|
2026-04-08 15:37:09 +08:00
|
|
|
|
structured_evidence=structured_evidence,
|
2026-04-07 17:15:01 +08:00
|
|
|
|
reference_memoir_markdown=reference_memoir_markdown,
|
|
|
|
|
|
evidence_notes=evidence_notes,
|
|
|
|
|
|
)
|
|
|
|
|
|
return result.output
|
|
|
|
|
|
|
|
|
|
|
|
async def judge_memoir_result(
|
|
|
|
|
|
self,
|
|
|
|
|
|
*,
|
|
|
|
|
|
memoir_markdown: str,
|
|
|
|
|
|
source_transcript: str = "",
|
2026-04-08 15:37:09 +08:00
|
|
|
|
structured_evidence: str = "",
|
2026-04-07 17:15:01 +08:00
|
|
|
|
reference_memoir_markdown: str = "",
|
|
|
|
|
|
evidence_notes: str = "",
|
|
|
|
|
|
) -> JudgeCallResult[MemoirJudgeOutput]:
|
2026-04-03 14:44:46 +08:00
|
|
|
|
if not self._llm:
|
2026-04-08 21:36:12 +08:00
|
|
|
|
return JudgeCallResult(
|
|
|
|
|
|
output=None,
|
|
|
|
|
|
error="评审模型未配置(智谱或 DeepSeek 密钥)",
|
|
|
|
|
|
)
|
2026-04-07 10:34:59 +08:00
|
|
|
|
prompt = _build_memoir_judge_prompt(
|
|
|
|
|
|
memoir_markdown=memoir_markdown,
|
|
|
|
|
|
source_transcript=source_transcript,
|
2026-04-08 15:37:09 +08:00
|
|
|
|
structured_evidence=structured_evidence,
|
2026-04-07 10:34:59 +08:00
|
|
|
|
reference_memoir_markdown=reference_memoir_markdown,
|
|
|
|
|
|
evidence_notes=evidence_notes,
|
|
|
|
|
|
)
|
2026-04-03 14:44:46 +08:00
|
|
|
|
try:
|
2026-04-07 17:15:01 +08:00
|
|
|
|
out = await allm_json_call(
|
2026-04-03 14:44:46 +08:00
|
|
|
|
self._llm,
|
|
|
|
|
|
prompt,
|
|
|
|
|
|
MemoirJudgeOutput,
|
2026-04-10 20:35:57 +08:00
|
|
|
|
max_tokens=max(
|
|
|
|
|
|
512, int(settings.eval_judge_memoir_completion_max_tokens)
|
|
|
|
|
|
),
|
2026-04-03 14:44:46 +08:00
|
|
|
|
agent="EvalJudgeService.judge_memoir",
|
2026-04-27 14:34:30 +08:00
|
|
|
|
http_error_vendor=self._http_error_vendor,
|
2026-04-03 14:44:46 +08:00
|
|
|
|
)
|
2026-04-07 17:15:01 +08:00
|
|
|
|
return JudgeCallResult(output=out)
|
2026-04-03 14:44:46 +08:00
|
|
|
|
except LLMCallError as e:
|
2026-04-07 17:15:01 +08:00
|
|
|
|
error = _judge_error_message(e)
|
feat(eval): memoir A/B chapter judging and eval-web parity with dialogue
- Judge baseline excerpt and library chapter separately; build_memoir_compare_summary for gate, nine-dim and leaf deltas.
- Memoir SSE chapter payload: baseline_judge, compare_summary, baseline_judge_error.
- MemoirJudgeOutput: loose score coercion and post-validate clamp; memoir judge prompt caps from settings.
- app-eval-web: two-column MemoirScoreCard layout, MemoirCompareSummary, chapter blocks and CSS.
- Add memoir_compare_summary, log_events, celery_log_context, memoir_pipeline_progress; tests and migration 0014.
- Misc: memory/evidence and enrichment paths, task/orchestrator updates, internal-eval docs, env examples.
2026-04-10 10:23:43 +08:00
|
|
|
|
# 回忆录评审在 INFO 也要可见(eval-web 排障);非异常路径、不刷堆栈
|
|
|
|
|
|
logger.info(
|
|
|
|
|
|
"event=eval_memoir_judge_llm_call_failed agent=EvalJudgeService.judge_memoir msg={}",
|
|
|
|
|
|
error,
|
|
|
|
|
|
)
|
2026-04-07 17:15:01 +08:00
|
|
|
|
return JudgeCallResult(output=None, error=error)
|