feat:
1. 建立问题库大纲,对应每个人生阶段槽位 2. 鼓励使用更生活化的交流语言共情与总结 3. 降低评审模型可能发生截断的概率 4. 成稿质量维度强化情感表达和上下文连贯性
This commit is contained in:
@@ -90,26 +90,127 @@ def eval_judge_turn_prior_transcript_max_chars_for_context(
|
||||
|
||||
|
||||
def eval_judge_compare_transcript_each_max_chars() -> int:
    """Return the symmetric per-side transcript cap for A/B compare prompts.

    Uses the default context window from
    ``settings.eval_judge_context_window_tokens`` and delegates the actual
    computation to ``eval_judge_compare_transcript_each_max_chars_for_context``
    so that the default and the explicit-context variants cannot drift apart.
    """
    return eval_judge_compare_transcript_each_max_chars_for_context(
        settings.eval_judge_context_window_tokens
    )
|
||||
|
||||
|
||||
def eval_judge_compare_transcript_pair_total_budget_for_context(
    context_window_tokens: int,
) -> int:
    """Return the combined character budget for the A and B transcripts.

    Applies when both transcripts share one compare prompt.

    * If ``settings.eval_judge_max_compare_transcript_chars_each`` is
      positive, the budget is twice that per-side value.
    * Otherwise it is the prompt character pool for ``context_window_tokens``
      minus ``settings.eval_judge_compare_prompt_overhead_chars``.

    The result is never smaller than 1.
    """
    per_side_override = settings.eval_judge_max_compare_transcript_chars_each
    if per_side_override > 0:
        return max(1, 2 * int(per_side_override))

    remaining = _eval_judge_prompt_char_pool_for_context(
        context_window_tokens
    ) - int(settings.eval_judge_compare_prompt_overhead_chars)
    return max(1, remaining)
|
||||
|
||||
|
||||
def eval_judge_compare_transcript_each_max_chars_for_context(
    context_window_tokens: int,
) -> int:
    """Return a reference per-side transcript cap for A/B compare prompts.

    A positive ``settings.eval_judge_max_compare_transcript_chars_each`` is
    returned as-is (converted to ``int``). Otherwise the value is half of the
    pair budget reported by
    ``eval_judge_compare_transcript_pair_total_budget_for_context``, floored
    and never smaller than 1. Kept for compatibility and display purposes.
    """
    per_side_override = settings.eval_judge_max_compare_transcript_chars_each
    if per_side_override > 0:
        return int(per_side_override)

    total = eval_judge_compare_transcript_pair_total_budget_for_context(
        context_window_tokens
    )
    return max(1, total // 2)
|
||||
|
||||
|
||||
def eval_judge_compare_bundle_caps(
    context_window_tokens: int,
) -> tuple[int, int | None]:
    """Return ``(compare_cap_total, per_side_cap_or_None)`` for compare prompts.

    When ``settings.eval_judge_max_compare_transcript_chars_each`` is a
    positive number, that value is the per-side cap and the total is twice
    it (at least 1). Otherwise the total comes from
    ``eval_judge_compare_transcript_pair_total_budget_for_context`` and the
    per-side cap is ``None``.
    """
    configured_cap = int(
        settings.eval_judge_max_compare_transcript_chars_each or 0
    )
    if configured_cap <= 0:
        pair_budget = eval_judge_compare_transcript_pair_total_budget_for_context(
            context_window_tokens
        )
        return pair_budget, None
    return max(1, 2 * configured_cap), configured_cap
|
||||
|
||||
|
||||
def trim_compare_transcript_pair(
    baseline: str,
    replay: str,
    *,
    total_max_chars: int,
    per_side_max_chars: int | None = None,
) -> tuple[str, str, bool, bool]:
    """Fit the A/B transcripts of a compare prompt into a character budget.

    Both inputs are stripped first; ``None`` or empty input counts as ``""``.

    * If ``per_side_max_chars`` is a positive number, each side is cut to that
      hard per-side cap independently and ``total_max_chars`` is ignored.
    * Otherwise both texts are kept whole when their combined length fits in
      ``total_max_chars`` (clamped to at least 1). When they do not fit, the
      tail of the longer side is dropped first. Once both sides would be
      equally long, the budget is split evenly, with the baseline giving up
      the extra character when the budget is odd.

    Args:
        baseline: Transcript A.
        replay: Transcript B.
        total_max_chars: Combined budget for both transcripts.
        per_side_max_chars: Optional hard cap applied to each side.

    Returns:
        ``(baseline_text, replay_text, baseline_truncated, replay_truncated)``.
    """
    b = (baseline or "").strip()
    r = (replay or "").strip()
    if per_side_max_chars is not None and int(per_side_max_chars) > 0:
        cap = int(per_side_max_chars)
        return b[:cap], r[:cap], len(b) > cap, len(r) > cap

    cap_total = max(1, int(total_max_chars))
    if len(b) + len(r) <= cap_total:
        return b, r, False, False

    # Closed form of "repeatedly drop the last character of the longer side
    # (baseline on ties)": computes the kept lengths directly instead of
    # re-slicing the strings once per dropped character.
    shorter = min(len(b), len(r))
    if 2 * shorter <= cap_total:
        # The shorter side fits untouched; only the longer side is cut.
        if len(b) >= len(r):
            keep_b, keep_r = cap_total - len(r), len(r)
        else:
            keep_b, keep_r = len(b), cap_total - len(b)
    else:
        # Both sides must shrink; the baseline takes the floor half.
        keep_b = cap_total // 2
        keep_r = cap_total - keep_b

    b2, r2 = b[:keep_b], r[:keep_r]
    return b2, r2, len(b) > len(b2), len(r) > len(r2)
|
||||
|
||||
|
||||
_CONV_JUDGE_TRANSCRIPT_TRUNCATION_TAIL = (
|
||||
"\n\n【评审边界——输入已为截断稿】\n"
|
||||
"以上仅为全文前 {n} 个字符,其后未提供给模型。"
|
||||
"对依赖长程多轮轨迹的细项(尤其 context_memory、interview_structure、跨轮重复盘问)"
|
||||
"必须保守给分(倾向区间中低),并在 insufficient_evidence 写明「输入为截断稿,长程证据不足」;"
|
||||
"不得臆断未展示轮次中的行为;confidence 须显著降低;禁止因未见问题而默认高分或推断后半段无缺陷。\n"
|
||||
)
|
||||
|
||||
_TURN_PRIOR_TRUNCATION_TAIL = (
|
||||
"\n\n【评审边界——上文节选已截断】\n"
|
||||
"「截至上一轮」节选可能仅为更长对话的前 {n} 字;跨轮重复、长程结构若无法从节选核实,"
|
||||
"须在 insufficient_evidence 说明,并对相关细项保守给分。\n"
|
||||
)
|
||||
|
||||
_COMPARE_STREAM_PAIR_TRUNCATION_NOTE = (
|
||||
"\n【评审边界】以下 A/B transcript 至少一侧为截断稿,请仅就**已展示片段**比较;"
|
||||
"不得断言未展示轮次的优劣;涉及跨轮重复盘问等须明确证据范围或说不足以判断。\n"
|
||||
)
|
||||
|
||||
|
||||
def conversation_judge_transcript_excerpt(full_transcript: str, cap: int) -> str:
    """Return the whole-conversation transcript limited to ``cap`` characters.

    The transcript is stripped first (``None``/empty counts as ``""``) and a
    negative ``cap`` counts as 0. Text that fits is returned unchanged.
    Longer text is cut to its first ``cap`` characters and followed by
    ``_CONV_JUDGE_TRANSCRIPT_TRUNCATION_TAIL`` formatted with that length, so
    the prompt itself states that the judge only sees a truncated transcript.
    """
    text = (full_transcript or "").strip()
    limit = max(0, int(cap))
    if len(text) > limit:
        return text[:limit] + _CONV_JUDGE_TRANSCRIPT_TRUNCATION_TAIL.format(n=limit)
    return text
|
||||
|
||||
|
||||
def turn_judge_prior_excerpt(prior_transcript: str, cap: int) -> str:
    """Return the "up to the previous turn" excerpt limited to ``cap`` characters.

    The transcript is stripped first (``None``/empty counts as ``""``) and a
    negative ``cap`` counts as 0. Text that fits is returned unchanged.
    Longer text is cut to its first ``cap`` characters and followed by
    ``_TURN_PRIOR_TRUNCATION_TAIL`` formatted with that length, which marks
    the excerpt as truncated for the per-turn judge.
    """
    text = (prior_transcript or "").strip()
    limit = max(0, int(cap))
    if len(text) > limit:
        return text[:limit] + _TURN_PRIOR_TRUNCATION_TAIL.format(n=limit)
    return text
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
@@ -163,9 +264,7 @@ def _build_memoir_judge_prompt(
|
||||
]
|
||||
)
|
||||
if struct:
|
||||
sections.extend(
|
||||
["【结构化记忆证据】", struct[:_MEMOIR_EVIDENCE_MAX], ""]
|
||||
)
|
||||
sections.extend(["【结构化记忆证据】", struct[:_MEMOIR_EVIDENCE_MAX], ""])
|
||||
else:
|
||||
sections.extend(
|
||||
[
|
||||
@@ -198,14 +297,7 @@ class EvalJudgeService:
|
||||
)
|
||||
|
||||
def _turn_prior_cap(self) -> int:
|
||||
return eval_judge_turn_prior_transcript_max_chars_for_context(
|
||||
self._ctx_tokens
|
||||
)
|
||||
|
||||
def _compare_each_cap(self) -> int:
|
||||
return eval_judge_compare_transcript_each_max_chars_for_context(
|
||||
self._ctx_tokens
|
||||
)
|
||||
return eval_judge_turn_prior_transcript_max_chars_for_context(self._ctx_tokens)
|
||||
|
||||
async def judge_turn(
|
||||
self,
|
||||
@@ -223,7 +315,7 @@ class EvalJudgeService:
|
||||
【本轮位置】完整对话中当前轮次为 Turn {t + 1}(与下方节选及全量 transcript 的 `[Turn ...]` 编号一致)。evidence_refs.turn_index 请使用该编号。
|
||||
|
||||
【截至上一轮的对话节选】(含 `[Turn k]` 标签)
|
||||
{prior_transcript[: self._turn_prior_cap()]}
|
||||
{turn_judge_prior_excerpt(prior_transcript, self._turn_prior_cap())}
|
||||
|
||||
【本轮用户】
|
||||
{user_utterance[:4000]}
|
||||
@@ -254,7 +346,7 @@ class EvalJudgeService:
|
||||
prompt = f"""{CONV_JUDGE_INSTRUCTIONS}
|
||||
|
||||
【完整对话】(每轮以 `[Turn k]` 开头)
|
||||
{full_transcript[: self._conv_transcript_cap()]}
|
||||
{conversation_judge_transcript_excerpt(full_transcript, self._conv_transcript_cap())}
|
||||
"""
|
||||
try:
|
||||
out = await allm_json_call(
|
||||
@@ -288,10 +380,15 @@ class EvalJudgeService:
|
||||
if not self._llm:
|
||||
yield "[错误] 未配置评审模型 API Key(智谱:eval_judge_api_key / zhipu_api_key;DeepSeek:deepseek_api_key)"
|
||||
return
|
||||
cap_each = self._compare_each_cap()
|
||||
cap_total, per_side = eval_judge_compare_bundle_caps(self._ctx_tokens)
|
||||
cap_single = self._conv_transcript_cap()
|
||||
b_tr = (baseline_transcript or "").strip()[:cap_each]
|
||||
r_tr = (replay_transcript or "").strip()[:cap_each]
|
||||
b_tr, r_tr, b_cmp_trunc, r_cmp_trunc = trim_compare_transcript_pair(
|
||||
baseline_transcript or "",
|
||||
replay_transcript or "",
|
||||
total_max_chars=cap_total,
|
||||
per_side_max_chars=per_side,
|
||||
)
|
||||
compare_pair_truncated = b_cmp_trunc or r_cmp_trunc
|
||||
b_json = (
|
||||
baseline_judge.model_dump_json(ensure_ascii=False)
|
||||
if baseline_judge
|
||||
@@ -301,14 +398,17 @@ class EvalJudgeService:
|
||||
replay_judge.model_dump_json(ensure_ascii=False) if replay_judge else "null"
|
||||
)
|
||||
if baseline_judge and replay_judge:
|
||||
prompt = f"""你是访谈对话评测专家。下面给出两份完整对话 transcript 及各自的整体打分(JSON)。请用中文直接写正文(不要用 JSON、不要用 Markdown 代码块):
|
||||
trunc_line = (
|
||||
_COMPARE_STREAM_PAIR_TRUNCATION_NOTE if compare_pair_truncated else ""
|
||||
)
|
||||
prompt = f"""你是访谈对话评测专家。下面给出两份对话 transcript 及各自的整体打分(JSON)。请用中文直接写正文(不要用 JSON、不要用 Markdown 代码块):
|
||||
|
||||
【A:导出基准对话】(历史快照:用户与当时导出的线上 AI,多轮合并为一篇)
|
||||
{b_tr}
|
||||
|
||||
【B:本次回放/新测对话】(用户句与基准对齐,AI 为当前后端重新生成)
|
||||
{r_tr}
|
||||
|
||||
{trunc_line}
|
||||
【A 的整体评分 JSON】
|
||||
{b_json}
|
||||
|
||||
@@ -322,7 +422,9 @@ class EvalJudgeService:
|
||||
|
||||
笔调简洁、偏执行清单。"""
|
||||
elif replay_judge:
|
||||
r_one = (replay_transcript or "").strip()[:cap_single]
|
||||
r_one = conversation_judge_transcript_excerpt(
|
||||
replay_transcript or "", cap_single
|
||||
)
|
||||
prompt = f"""{COMPARE_CONV_STREAM_HINT}
|
||||
|
||||
【回放/新测 transcript】
|
||||
|
||||
Reference in New Issue
Block a user