feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整
- 回忆录细项上限收紧为合计 100 分,去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线;无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据(会话/用户聚合、截断)
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD;移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试
This commit is contained in:
@@ -26,6 +26,8 @@ logger = get_logger(__name__)
|
||||
_MAX_JUDGE_MARKDOWN_CHARS = 20_000
|
||||
_MAX_EVAL_CHAPTERS = 30
|
||||
_MAX_EVAL_STORIES = 40
|
||||
_MAX_EVIDENCE_CONVERSATIONS = 8
|
||||
_MAX_EVIDENCE_TRANSCRIPT_CHARS = 16_000
|
||||
|
||||
|
||||
def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) -> str:
|
||||
@@ -56,6 +58,50 @@ def _assistant_text_for_eval_display(raw: str) -> str:
|
||||
return (raw or "").replace("[SPLIT]", "\n")
|
||||
|
||||
|
||||
def _trim_evidence_text(text: str, max_chars: int = _MAX_EVIDENCE_TRANSCRIPT_CHARS) -> str:
    """Strip *text* and cap it at *max_chars* characters.

    Falsy input is treated as an empty string. When the stripped text is
    longer than ``max_chars``, only its first ``max_chars`` characters are
    kept and a truncation notice is appended, so the returned string can be
    longer than ``max_chars`` by the length of that notice.

    Args:
        text: Raw evidence text; may be empty or ``None``.
        max_chars: Maximum number of characters of the stripped text to keep.

    Returns:
        The stripped text, truncated and suffixed with a notice if needed.
    """
    cleaned = text.strip() if text else ""
    if len(cleaned) > max_chars:
        # Keep the head of the evidence and tell the reader it was cut.
        return cleaned[:max_chars] + "\n\n…(访谈证据已截断)"
    return cleaned
|
||||
|
||||
|
||||
def _dialogue_transcript_from_pairs(pairs: list[tuple[str, str]]) -> str:
    """Render ``(role, content)`` pairs as a labelled plain-text transcript.

    Each non-empty message becomes one ``"<label>: <text>"`` entry; entries
    are joined with a blank line. Messages whose content is empty or only
    whitespace are skipped.

    The role is normalised here (``None``-safe, surrounding whitespace
    removed, lower-cased) so the result does not depend on the caller having
    pre-normalised it: only the role ``"human"`` is labelled as the user,
    every other role is labelled as the AI and has its text passed through
    ``_assistant_text_for_eval_display``.

    Args:
        pairs: Message pairs in the order they should appear.

    Returns:
        The transcript text, or ``""`` when no pair has usable content.
    """
    parts: list[str] = []
    for role, content in pairs:
        body = (content or "").strip()
        if not body:
            continue
        # Normalise locally: a raw "Human" / " human " must not be
        # mislabelled as an AI turn.
        is_user = (role or "").strip().lower() == "human"
        if is_user:
            label, out = "用户", body
        else:
            label, out = "AI", _assistant_text_for_eval_display(body)
        parts.append(f"{label}: {out}")
    return "\n\n".join(parts)
|
||||
|
||||
|
||||
async def _conversation_transcript_for_eval(
    db: AsyncSession, conversation_id: str
) -> str:
    """Load one conversation's messages and render them as a transcript.

    Messages are fetched through the conversation repo, each row is reduced
    to a ``(lower-cased role, content)`` string pair (missing values become
    ``""``), and the pairs are formatted by
    ``_dialogue_transcript_from_pairs``.

    Args:
        db: Async database session handed to the repo.
        conversation_id: Identifier of the conversation to load.

    Returns:
        The rendered transcript text.
    """
    # Imported at call time, as in the original block.
    from app.features.conversation import repo as conversation_repo

    messages = await conversation_repo.get_conversation_messages(conversation_id, db)
    pairs: list[tuple[str, str]] = []
    for message in messages:
        role = str(message.role or "").lower()
        content = str(message.content or "")
        pairs.append((role, content))
    return _dialogue_transcript_from_pairs(pairs)
|
||||
|
||||
|
||||
async def _user_transcript_evidence(db: AsyncSession, user_id: str) -> str:
    """Aggregate a user's conversation transcripts into one evidence text.

    Takes the first ``_MAX_EVIDENCE_CONVERSATIONS`` conversations returned by
    the repo, walks them in reverse order, renders each one, and joins the
    non-empty transcripts under a ``## 会话 <id>`` heading. The combined text
    is capped by ``_trim_evidence_text``.

    NOTE(review): the slice-then-reverse presumably turns a newest-first repo
    ordering into chronological order; confirm against the repo.

    Args:
        db: Async database session handed to the repo.
        user_id: Identifier of the user whose conversations are collected.

    Returns:
        The trimmed evidence text, or ``""`` if the user has no conversations.
    """
    # Imported at call time, as in the original block.
    from app.features.conversation import repo as conversation_repo

    conversations = await conversation_repo.get_user_conversations(user_id, db)
    if not conversations:
        return ""

    selected = conversations[:_MAX_EVIDENCE_CONVERSATIONS]
    sections: list[str] = []
    for conversation in reversed(selected):
        conv_id = str(conversation.id)
        body = await _conversation_transcript_for_eval(db, conv_id)
        if not body:
            continue
        sections.append(f"## 会话 {conv_id}\n{body}")
    return _trim_evidence_text("\n\n".join(sections))
|
||||
|
||||
|
||||
async def execute_eval_run(
|
||||
db: AsyncSession,
|
||||
*,
|
||||
@@ -150,7 +196,7 @@ async def execute_eval_run(
|
||||
rationale = tj.rationale if tj else None
|
||||
await eval_repo.add_turn(
|
||||
db,
|
||||
run_id=run.id,
|
||||
run_id=str(run.id),
|
||||
turn_index=idx,
|
||||
user_utterance=u,
|
||||
assistant_reply=replies[idx],
|
||||
@@ -166,11 +212,36 @@ async def execute_eval_run(
|
||||
conv_total = conv_out.total_score if conv_out else None
|
||||
|
||||
memoir_md = simple_memoir_from_transcript(utterances, replies)
|
||||
mem_out = await judge.judge_memoir(memoir_markdown=memoir_md)
|
||||
source_transcript = _trim_evidence_text(full_transcript)
|
||||
reference_memoir = (case.reference_memoir_markdown or "").strip()
|
||||
mem_out = await judge.judge_memoir(
|
||||
memoir_markdown=memoir_md,
|
||||
source_transcript=source_transcript,
|
||||
reference_memoir_markdown=reference_memoir,
|
||||
evidence_notes="严格按文档核对真实性、覆盖率、可追溯性;以原始访谈为主,参考基线仅作辅助。",
|
||||
)
|
||||
|
||||
chapter_entries: list[dict[str, Any]] = []
|
||||
story_entries: list[dict[str, Any]] = []
|
||||
uid = (case.source_user_id or "").strip()
|
||||
source_conversation_id = (case.source_conversation_id or "").strip()
|
||||
evidence_transcript = source_transcript
|
||||
if source_conversation_id:
|
||||
try:
|
||||
conversation_evidence = await _conversation_transcript_for_eval(
|
||||
db, source_conversation_id
|
||||
)
|
||||
if conversation_evidence:
|
||||
evidence_transcript = _trim_evidence_text(conversation_evidence)
|
||||
except Exception as e:
|
||||
logger.warning("eval source conversation evidence skipped: {}", e)
|
||||
elif uid:
|
||||
try:
|
||||
user_evidence = await _user_transcript_evidence(db, uid)
|
||||
if user_evidence:
|
||||
evidence_transcript = user_evidence
|
||||
except Exception as e:
|
||||
logger.warning("eval user transcript evidence skipped: {}", e)
|
||||
if uid:
|
||||
from app.features.memoir.repo import get_chapters_for_memoir_list
|
||||
from app.features.story.repo import get_stories_for_user
|
||||
@@ -184,7 +255,14 @@ async def execute_eval_run(
|
||||
if not body:
|
||||
continue
|
||||
md = f"# 章节:{ch.title}\n\n{_clip_md_for_judge(body)}"
|
||||
cj = await judge.judge_memoir(memoir_markdown=md)
|
||||
cj = await judge.judge_memoir(
|
||||
memoir_markdown=md,
|
||||
source_transcript=evidence_transcript,
|
||||
reference_memoir_markdown=reference_memoir,
|
||||
evidence_notes=(
|
||||
"这是用户现有章节的严格评审;真实性、覆盖率、可追溯性必须对照原始访谈证据。"
|
||||
),
|
||||
)
|
||||
chapter_entries.append(
|
||||
{
|
||||
"id": ch.id,
|
||||
@@ -203,7 +281,14 @@ async def execute_eval_run(
|
||||
if not body:
|
||||
continue
|
||||
md = f"# 故事:{st.title}\n\n{_clip_md_for_judge(body)}"
|
||||
sj = await judge.judge_memoir(memoir_markdown=md)
|
||||
sj = await judge.judge_memoir(
|
||||
memoir_markdown=md,
|
||||
source_transcript=evidence_transcript,
|
||||
reference_memoir_markdown=reference_memoir,
|
||||
evidence_notes=(
|
||||
"这是用户现有故事的严格评审;真实性、覆盖率、可追溯性必须对照原始访谈证据。"
|
||||
),
|
||||
)
|
||||
story_entries.append(
|
||||
{
|
||||
"id": st.id,
|
||||
@@ -228,8 +313,12 @@ async def execute_eval_run(
|
||||
mem_parts.append(float(j["total_score"]))
|
||||
mem_total = sum(mem_parts) / len(mem_parts) if mem_parts else None
|
||||
|
||||
exp = await eval_repo.get_experiment(db, run.experiment_id)
|
||||
weights = exp.composite_weights_json if exp else None
|
||||
exp = await eval_repo.get_experiment(db, str(run.experiment_id))
|
||||
weights = (
|
||||
exp.composite_weights_json
|
||||
if exp and isinstance(exp.composite_weights_json, dict)
|
||||
else None
|
||||
)
|
||||
comp = _composite(conv_total, mem_total, weights)
|
||||
|
||||
bundle: dict[str, Any] = {
|
||||
@@ -257,13 +346,13 @@ async def _finalize_experiment_gate(db: AsyncSession, experiment_id: str) -> Non
|
||||
exp = await eval_repo.get_experiment(db, experiment_id)
|
||||
if not exp:
|
||||
return
|
||||
cases = await eval_repo.list_cases(db, exp.regression_set_id)
|
||||
cases = await eval_repo.list_cases(db, str(exp.regression_set_id))
|
||||
|
||||
incomplete = [r for r in runs if r.status not in ("completed", "failed")]
|
||||
incomplete = [r for r in runs if str(r.status) not in ("completed", "failed")]
|
||||
if incomplete:
|
||||
return
|
||||
|
||||
failed = [r for r in runs if r.status == "failed"]
|
||||
failed = [r for r in runs if str(r.status) == "failed"]
|
||||
if failed:
|
||||
await eval_repo.update_experiment(
|
||||
db,
|
||||
@@ -301,10 +390,10 @@ async def execute_experiment_full(experiment_id: str) -> None:
|
||||
await eval_repo.update_experiment(db, exp, status="running")
|
||||
await db.commit()
|
||||
|
||||
cases = await eval_repo.list_cases(db, exp.regression_set_id)
|
||||
base_v = await eval_repo.get_version(db, exp.baseline_version_id)
|
||||
cand_v = await eval_repo.get_version(db, exp.candidate_version_id)
|
||||
if not base_v or not cand_v:
|
||||
cases = await eval_repo.list_cases(db, str(exp.regression_set_id))
|
||||
base_v = await eval_repo.get_version(db, str(exp.baseline_version_id))
|
||||
cand_v = await eval_repo.get_version(db, str(exp.candidate_version_id))
|
||||
if base_v is None or cand_v is None:
|
||||
await eval_repo.update_experiment(
|
||||
db,
|
||||
exp,
|
||||
@@ -317,12 +406,12 @@ async def execute_experiment_full(experiment_id: str) -> None:
|
||||
|
||||
for case in cases:
|
||||
for side, ver in ("baseline", base_v), ("candidate", cand_v):
|
||||
run = await eval_repo.get_run(db, experiment_id, case.id, side)
|
||||
run = await eval_repo.get_run(db, experiment_id, str(case.id), side)
|
||||
if not run:
|
||||
run = await eval_repo.create_run(
|
||||
db,
|
||||
experiment_id=experiment_id,
|
||||
case_id=case.id,
|
||||
case_id=str(case.id),
|
||||
side=side,
|
||||
)
|
||||
await db.commit()
|
||||
|
||||
@@ -10,6 +10,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.core.dependencies import get_eval_judge_langchain_llm
|
||||
from app.core.logging import get_logger
|
||||
from app.features.conversation import repo as conversation_repo
|
||||
from app.features.evaluation.errors import (
|
||||
EvaluationBadRequestError,
|
||||
EvaluationNotFoundError,
|
||||
@@ -27,6 +28,8 @@ logger = get_logger(__name__)
|
||||
_MAX_JUDGE_MARKDOWN_CHARS = 20_000
|
||||
_MAX_EVAL_CHAPTERS = 30
|
||||
_MAX_EVAL_STORIES = 40
|
||||
_MAX_EVIDENCE_CONVERSATIONS = 8
|
||||
_MAX_EVIDENCE_TRANSCRIPT_CHARS = 16_000
|
||||
|
||||
|
||||
def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) -> str:
|
||||
@@ -48,6 +51,41 @@ def _transcript_from_export_turns(turns: list[tuple[str, str]]) -> str:
|
||||
return "\n\n".join(parts)
|
||||
|
||||
|
||||
def _trim_evidence_text(text: str, max_chars: int = _MAX_EVIDENCE_TRANSCRIPT_CHARS) -> str:
    """Return *text* stripped and limited to *max_chars* characters.

    A falsy *text* yields ``""``. If the stripped text exceeds ``max_chars``,
    its first ``max_chars`` characters are returned followed by a truncation
    notice; the notice is added on top of the limit.

    Args:
        text: Raw evidence text; may be empty or ``None``.
        max_chars: Maximum number of characters of the stripped text to keep.

    Returns:
        The stripped, possibly truncated evidence text.
    """
    stripped = text.strip() if text else ""
    needs_cut = len(stripped) > max_chars
    if not needs_cut:
        return stripped
    head = stripped[:max_chars]
    return f"{head}\n\n…(访谈证据已截断)"
|
||||
|
||||
|
||||
async def _conversation_transcript_for_manual(
    db: AsyncSession, conversation_id: str
) -> str:
    """Load one conversation and render it as a labelled transcript.

    Rows come from the conversation repo. Rows whose content is empty after
    stripping are skipped. A row whose lower-cased role is ``"human"`` is
    labelled ``用户`` and kept verbatim; any other row is labelled ``AI`` and
    passed through ``_assistant_text_for_eval_display``. Entries are joined
    with a blank line.

    Args:
        db: Async database session handed to the repo.
        conversation_id: Identifier of the conversation to load.

    Returns:
        The rendered transcript, ``""`` when there is nothing to show.
    """
    messages = await conversation_repo.get_conversation_messages(conversation_id, db)
    lines: list[str] = []
    for message in messages:
        role = (message.role or "").lower()
        text = (message.content or "").strip()
        if text:
            if role == "human":
                speaker, rendered = "用户", text
            else:
                speaker, rendered = "AI", _assistant_text_for_eval_display(text)
            lines.append(f"{speaker}: {rendered}")
    return "\n\n".join(lines)
|
||||
|
||||
|
||||
async def _user_transcript_evidence(db: AsyncSession, user_id: str) -> str:
    """Build the interview-evidence text for a user from their conversations.

    The first ``_MAX_EVIDENCE_CONVERSATIONS`` conversations returned by the
    repo are visited in reverse order; each non-empty transcript is emitted
    under a ``## 会话 <id>`` heading, and the joined result is capped by
    ``_trim_evidence_text``.

    NOTE(review): presumably the repo returns newest conversations first, so
    the reversal yields chronological order; confirm against the repo.

    Args:
        db: Async database session handed to the repo.
        user_id: Identifier of the user whose conversations are collected.

    Returns:
        The trimmed evidence text, or ``""`` if the user has no conversations.
    """
    conversations = await conversation_repo.get_user_conversations(user_id, db)
    if not conversations:
        return ""

    window = conversations[:_MAX_EVIDENCE_CONVERSATIONS]
    blocks: list[str] = []
    for item in reversed(window):
        conv_id = str(item.id)
        rendered = await _conversation_transcript_for_manual(db, conv_id)
        if rendered:
            blocks.append(f"## 会话 {conv_id}\n{rendered}")
    return _trim_evidence_text("\n\n".join(blocks))
|
||||
|
||||
|
||||
def _normalize_title_key(title: str) -> str:
|
||||
t = (title or "").strip().lower()
|
||||
t = re.sub(r"^#+\s*", "", t)
|
||||
@@ -271,6 +309,7 @@ class EvalJudgeManualService:
|
||||
judge_llm = get_eval_judge_langchain_llm()
|
||||
judge = EvalJudgeService(judge_llm)
|
||||
baselines = list(baseline_sections or [])
|
||||
evidence_transcript = await _user_transcript_evidence(self._db, uid)
|
||||
|
||||
chapter_results: list[dict[str, Any]] = []
|
||||
try:
|
||||
@@ -281,7 +320,7 @@ class EvalJudgeManualService:
|
||||
body = (ch.canonical_markdown or "").strip()
|
||||
if not body:
|
||||
continue
|
||||
bl = _baseline_for_chapter_title(baselines, ch.title or "", i)
|
||||
bl = _baseline_for_chapter_title(baselines, str(ch.title or ""), i)
|
||||
baseline_excerpt = ""
|
||||
if bl and (bl.body or "").strip():
|
||||
baseline_excerpt = _clip_md_for_judge(bl.body, max_chars=6000)
|
||||
@@ -289,7 +328,14 @@ class EvalJudgeManualService:
|
||||
if baseline_excerpt:
|
||||
md += f"## 导出基线(节选)\n\n{baseline_excerpt}\n\n"
|
||||
md += f"## 当前成稿\n\n{_clip_md_for_judge(body)}"
|
||||
cj = await judge.judge_memoir(memoir_markdown=md)
|
||||
cj = await judge.judge_memoir(
|
||||
memoir_markdown=md,
|
||||
source_transcript=evidence_transcript,
|
||||
reference_memoir_markdown=baseline_excerpt,
|
||||
evidence_notes=(
|
||||
"严格按文档打分;真实性、事实覆盖率、可追溯性必须优先对照该用户历史访谈证据。"
|
||||
),
|
||||
)
|
||||
chapter_results.append(
|
||||
{
|
||||
"id": ch.id,
|
||||
@@ -310,7 +356,13 @@ class EvalJudgeManualService:
|
||||
if not body:
|
||||
continue
|
||||
md = f"# 故事:{st.title}\n\n{_clip_md_for_judge(body)}"
|
||||
sj = await judge.judge_memoir(memoir_markdown=md)
|
||||
sj = await judge.judge_memoir(
|
||||
memoir_markdown=md,
|
||||
source_transcript=evidence_transcript,
|
||||
evidence_notes=(
|
||||
"严格按文档打分;真实性、事实覆盖率、可追溯性必须优先对照该用户历史访谈证据。"
|
||||
),
|
||||
)
|
||||
story_results.append(
|
||||
{
|
||||
"id": st.id,
|
||||
|
||||
@@ -1,43 +1,199 @@
|
||||
"""评审 LLM 结构化输出(json_object)。"""
|
||||
"""评审 LLM 结构化输出(json_object)。
|
||||
|
||||
成稿(回忆录)子项上限已自洽为 **总分 100**(由原 110 分表等比例收紧整数档,见附件 rubric)。
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import Self
|
||||
|
||||
from pydantic import BaseModel, Field, model_validator
|
||||
|
||||
|
||||
class TurnJudgeOutput(BaseModel):
|
||||
"""单轮对话质量(情绪 + 流畅度/重复抑制 + 总分)。"""
|
||||
"""单轮 / 整段对话质量(情绪强化版 100 分,15 个细项)。"""
|
||||
|
||||
# 一、情绪价值与陪伴感(30)
|
||||
emotion_carry: float = Field(ge=0, le=10, description="情绪承接能力")
|
||||
empathy_depth: float = Field(ge=0, le=8, description="共情深度")
|
||||
emotion_safety: float = Field(ge=0, le=6, description="情绪安全感")
|
||||
emotion_guidance: float = Field(ge=0, le=6, description="情绪引导能力")
|
||||
|
||||
# 二、信息获取能力(25)
|
||||
fact_mining: float = Field(ge=0, le=8, description="关键事实挖掘")
|
||||
info_completeness_guide: float = Field(ge=0, le=8, description="信息完整性引导")
|
||||
info_depth_mining: float = Field(ge=0, le=9, description="信息深度挖掘")
|
||||
|
||||
# 三、人物建模能力(15)
|
||||
persona_understanding: float = Field(ge=0, le=7, description="人物理解")
|
||||
persona_consistency_verify: float = Field(ge=0, le=4, description="人物一致性验证")
|
||||
persona_expression_guide: float = Field(ge=0, le=4, description="人物表达引导")
|
||||
|
||||
# 四、结构化引导(15)
|
||||
interview_structure: float = Field(ge=0, le=6, description="访谈结构")
|
||||
context_memory: float = Field(ge=0, le=5, description="上下文记忆")
|
||||
rhythm_control: float = Field(ge=0, le=4, description="节奏控制")
|
||||
|
||||
# 五、提问质量(15)
|
||||
question_quality: float = Field(ge=0, le=7, description="问题质量")
|
||||
follow_up_depth: float = Field(ge=0, le=5, description="追问能力")
|
||||
non_leading: float = Field(ge=0, le=3, description="非引导性")
|
||||
|
||||
total_score: float = Field(ge=0, le=100)
|
||||
rationale: str = ""
|
||||
|
||||
# 与历史 JSON 对齐的一级聚合分(由细项派生,可缺省由模型填写)
|
||||
emotion_score: float = Field(default=0, ge=0, le=30)
|
||||
information_score: float = Field(default=0, ge=0, le=20)
|
||||
structure_score: float = Field(default=0, ge=0, le=10)
|
||||
question_score: float = Field(default=0, ge=0, le=10)
|
||||
persona_score: float = Field(default=0, ge=0, le=10)
|
||||
repetition_score: float = Field(default=0, ge=0, le=10)
|
||||
naturalness_score: float = Field(default=0, ge=0, le=10)
|
||||
rationale: str = ""
|
||||
information_score: float = Field(default=0, ge=0, le=25)
|
||||
persona_score: float = Field(default=0, ge=0, le=15)
|
||||
structure_score: float = Field(default=0, ge=0, le=15)
|
||||
question_score: float = Field(default=0, ge=0, le=15)
|
||||
|
||||
@model_validator(mode="after")
|
||||
def _sync_aggregates_and_total(self) -> Self:
|
||||
emotion = (
|
||||
self.emotion_carry
|
||||
+ self.empathy_depth
|
||||
+ self.emotion_safety
|
||||
+ self.emotion_guidance
|
||||
)
|
||||
information = (
|
||||
self.fact_mining + self.info_completeness_guide + self.info_depth_mining
|
||||
)
|
||||
persona = (
|
||||
self.persona_understanding
|
||||
+ self.persona_consistency_verify
|
||||
+ self.persona_expression_guide
|
||||
)
|
||||
structure = self.interview_structure + self.context_memory + self.rhythm_control
|
||||
question = self.question_quality + self.follow_up_depth + self.non_leading
|
||||
expected = emotion + information + persona + structure + question
|
||||
if abs(expected - self.total_score) > 0.51:
|
||||
raise ValueError(
|
||||
f"total_score ({self.total_score}) 与细项合计 ({expected:.2f}) 不一致"
|
||||
)
|
||||
object.__setattr__(self, "emotion_score", emotion)
|
||||
object.__setattr__(self, "information_score", information)
|
||||
object.__setattr__(self, "persona_score", persona)
|
||||
object.__setattr__(self, "structure_score", structure)
|
||||
object.__setattr__(self, "question_score", question)
|
||||
return self
|
||||
|
||||
|
||||
class ConversationJudgeOutput(BaseModel):
|
||||
"""整条对话 transcript 的综合分。"""
|
||||
|
||||
total_score: float = Field(ge=0, le=100)
|
||||
dimension_scores: dict[str, float] = Field(default_factory=dict)
|
||||
rationale: str = ""
|
||||
# 整条 transcript 与单轮使用同一套细项
|
||||
ConversationJudgeOutput = TurnJudgeOutput
|
||||
|
||||
|
||||
class MemoirJudgeOutput(BaseModel):
|
||||
"""成稿回忆录评分。"""
|
||||
"""成稿回忆录评分(总分 100,子项上限见 rubric)。"""
|
||||
|
||||
# 一、真实性与覆盖(小计最高 23;由原 25 收紧)
|
||||
mem_fidelity: float = Field(ge=0, le=9, description="记忆忠实度")
|
||||
mem_factual_accuracy: float = Field(ge=0, le=5, description="事实准确性")
|
||||
mem_factual_coverage: float = Field(ge=0, le=5, description="事实覆盖率")
|
||||
mem_traceability: float = Field(ge=0, le=4, description="记忆可追溯性")
|
||||
|
||||
# 二、信息质量(小计最高 14;由原 15 收紧)
|
||||
info_slot_coverage: float = Field(ge=0, le=6, description="槽位覆盖度")
|
||||
info_sufficiency: float = Field(ge=0, le=4, description="信息充分性")
|
||||
info_density: float = Field(ge=0, le=4, description="信息密度")
|
||||
|
||||
# 三、叙事结构(小计最高 14;由原 15 收紧)
|
||||
narr_structure: float = Field(ge=0, le=6, description="故事结构")
|
||||
narr_paragraphs: float = Field(ge=0, le=5, description="段落组织")
|
||||
narr_pacing: float = Field(ge=0, le=3, description="节奏控制")
|
||||
|
||||
# 四、语言与文笔(小计最高 18;由原 20 及六项上限一并收紧)
|
||||
lang_fluency: float = Field(ge=0, le=3, description="语言流畅度")
|
||||
lang_conciseness: float = Field(ge=0, le=3, description="表达精炼度")
|
||||
lang_literary: float = Field(ge=0, le=4, description="文笔质量")
|
||||
lang_controlled_expansion: float = Field(ge=0, le=4, description="控制性扩写能力")
|
||||
lang_detail: float = Field(ge=0, le=2, description="细节还原与强化")
|
||||
lang_style: float = Field(ge=0, le=2, description="风格一致性")
|
||||
|
||||
# 五、情感表达(小计最高 9;由原 10 收紧)
|
||||
emo_authenticity: float = Field(ge=0, le=5, description="情感真实度")
|
||||
emo_depth: float = Field(ge=0, le=4, description="情感深度")
|
||||
|
||||
# 六、人物建模(小计最高 9;由原 10 收紧)
|
||||
char_understanding: float = Field(ge=0, le=4, description="人物理解")
|
||||
char_consistency: float = Field(ge=0, le=3, description="人物一致性")
|
||||
char_integration: float = Field(ge=0, le=2, description="人物融入度")
|
||||
|
||||
# 七、连贯性(小计最高 4;由原 5 收紧)
|
||||
coh_timeline: float = Field(ge=0, le=2, description="时间线一致性")
|
||||
coh_cross_chapter: float = Field(ge=0, le=2, description="跨章节关联")
|
||||
|
||||
# 八、表达丰富度(小计最高 5)
|
||||
rich_analogy: float = Field(ge=0, le=3, description="类比与引用")
|
||||
rich_diversity: float = Field(ge=0, le=2, description="表达多样性")
|
||||
|
||||
# 九、出版就绪度(小计最高 4;由原 5 收紧)
|
||||
pub_editorial_cost: float = Field(ge=0, le=2, description="编辑成本")
|
||||
pub_completeness: float = Field(ge=0, le=2, description="完整度")
|
||||
|
||||
total_score: float = Field(ge=0, le=100)
|
||||
authenticity_score: float = Field(default=0, ge=0, le=25)
|
||||
information_score: float = Field(default=0, ge=0, le=15)
|
||||
narrative_score: float = Field(default=0, ge=0, le=15)
|
||||
language_score: float = Field(default=0, ge=0, le=20)
|
||||
emotion_score: float = Field(default=0, ge=0, le=10)
|
||||
character_score: float = Field(default=0, ge=0, le=10)
|
||||
coherence_score: float = Field(default=0, ge=0, le=5)
|
||||
richness_score: float = Field(default=0, ge=0, le=5)
|
||||
publish_ready_score: float = Field(default=0, ge=0, le=5)
|
||||
rationale: str = ""
|
||||
|
||||
authenticity_score: float = Field(default=0, ge=0, le=23)
|
||||
information_score: float = Field(default=0, ge=0, le=14)
|
||||
narrative_score: float = Field(default=0, ge=0, le=14)
|
||||
language_score: float = Field(default=0, ge=0, le=18)
|
||||
emotion_score: float = Field(default=0, ge=0, le=9)
|
||||
character_score: float = Field(default=0, ge=0, le=9)
|
||||
coherence_score: float = Field(default=0, ge=0, le=4)
|
||||
richness_score: float = Field(default=0, ge=0, le=5)
|
||||
publish_ready_score: float = Field(default=0, ge=0, le=4)
|
||||
|
||||
@model_validator(mode="after")
|
||||
def _sync_aggregates_and_total(self) -> Self:
|
||||
authenticity = (
|
||||
self.mem_fidelity
|
||||
+ self.mem_factual_accuracy
|
||||
+ self.mem_factual_coverage
|
||||
+ self.mem_traceability
|
||||
)
|
||||
information = (
|
||||
self.info_slot_coverage + self.info_sufficiency + self.info_density
|
||||
)
|
||||
narrative = self.narr_structure + self.narr_paragraphs + self.narr_pacing
|
||||
language = (
|
||||
self.lang_fluency
|
||||
+ self.lang_conciseness
|
||||
+ self.lang_literary
|
||||
+ self.lang_controlled_expansion
|
||||
+ self.lang_detail
|
||||
+ self.lang_style
|
||||
)
|
||||
emotion = self.emo_authenticity + self.emo_depth
|
||||
character = (
|
||||
self.char_understanding + self.char_consistency + self.char_integration
|
||||
)
|
||||
coherence = self.coh_timeline + self.coh_cross_chapter
|
||||
richness = self.rich_analogy + self.rich_diversity
|
||||
publish = self.pub_editorial_cost + self.pub_completeness
|
||||
expected = (
|
||||
authenticity
|
||||
+ information
|
||||
+ narrative
|
||||
+ language
|
||||
+ emotion
|
||||
+ character
|
||||
+ coherence
|
||||
+ richness
|
||||
+ publish
|
||||
)
|
||||
if abs(expected - self.total_score) > 0.51:
|
||||
raise ValueError(
|
||||
f"total_score ({self.total_score}) 与分项合计 ({expected:.2f}) 不一致"
|
||||
)
|
||||
object.__setattr__(self, "authenticity_score", authenticity)
|
||||
object.__setattr__(self, "information_score", information)
|
||||
object.__setattr__(self, "narrative_score", narrative)
|
||||
object.__setattr__(self, "language_score", language)
|
||||
object.__setattr__(self, "emotion_score", emotion)
|
||||
object.__setattr__(self, "character_score", character)
|
||||
object.__setattr__(self, "coherence_score", coherence)
|
||||
object.__setattr__(self, "richness_score", richness)
|
||||
object.__setattr__(self, "publish_ready_score", publish)
|
||||
return self
|
||||
|
||||
@@ -26,6 +26,37 @@ _CONV_MAX = 8192
|
||||
_CONV_JUDGE_JSON_MAX = 2048
|
||||
_MEMOIR_MAX = 12000
|
||||
_COMPARE_STREAM_MAX = 6144
|
||||
_MEMOIR_EVIDENCE_MAX = 12000
|
||||
|
||||
|
||||
def _build_memoir_judge_prompt(
    *,
    memoir_markdown: str,
    source_transcript: str = "",
    reference_memoir_markdown: str = "",
    evidence_notes: str = "",
) -> str:
    """Assemble an evidence-aware memoir judging prompt.

    Sections are emitted in this order, separated by newlines: the rubric
    (``MEMOIR_JUDGE_INSTRUCTIONS``), optional reviewer notes, the source
    interview evidence (or an explicit "no evidence, score conservatively"
    notice when none is given), an optional reference/baseline memoir, and
    finally the memoir under review.

    Evidence and reference are capped at ``_MEMOIR_EVIDENCE_MAX`` characters
    and the memoir at ``_MEMOIR_MAX``. Whenever a section is actually cut, a
    truncation notice is appended so the judge knows the material is
    incomplete; a plain slice would silently drop any notice an upstream
    trimmer had placed at the end of the text.

    Args:
        memoir_markdown: Memoir text to be judged; ``None`` is treated as "".
        source_transcript: Raw interview transcript used as evidence.
        reference_memoir_markdown: Optional baseline/exported memoir.
        evidence_notes: Optional extra instructions for the judge
            (limited to 1200 characters).

    Returns:
        The full prompt text.
    """

    def _clip(text: str, limit: int) -> str:
        # Cut to *limit* characters and flag the cut explicitly.
        if len(text) <= limit:
            return text
        return f"{text[:limit]}\n…(内容已截断)"

    source = (source_transcript or "").strip()
    reference = (reference_memoir_markdown or "").strip()
    notes = (evidence_notes or "").strip()
    memoir = memoir_markdown or ""  # tolerate None instead of failing on slicing

    sections = [MEMOIR_JUDGE_INSTRUCTIONS, ""]
    if notes:
        sections.extend(["【评审说明】", notes[:1200], ""])
    if source:
        sections.extend(["【原始访谈/证据】", _clip(source, _MEMOIR_EVIDENCE_MAX), ""])
    else:
        # No evidence: instruct the judge to score fidelity items conservatively.
        sections.extend(
            [
                "【原始访谈/证据】",
                "无可用原始访谈证据。对于记忆忠实度、事实准确性、事实覆盖率、记忆可追溯性,必须保守打分,不得凭空高分。",
                "",
            ]
        )
    if reference:
        sections.extend(
            ["【参考基线/导出成稿】", _clip(reference, _MEMOIR_EVIDENCE_MAX), ""]
        )
    sections.extend(["【当前回忆录正文】", _clip(memoir, _MEMOIR_MAX)])
    return "\n".join(sections)
|
||||
|
||||
|
||||
class EvalJudgeService:
|
||||
@@ -124,7 +155,7 @@ class EvalJudgeService:
|
||||
{r_json}
|
||||
|
||||
请依次撰写:
|
||||
1) 两段对话在整体体验上的主要差异(共情、追问、重复感、自然度等);
|
||||
1) 两段对话在整体体验上的主要差异(情绪承接、信息挖掘、人物建模、访谈结构、提问质量、上下文与重复盘问等);
|
||||
2) B 相对 A 的优点与不足;
|
||||
3) 若 B 在关键维度明显弱于 A,给出可操作的改进方向(系统提示、访谈策略、模型或温度等)。
|
||||
|
||||
@@ -154,14 +185,22 @@ class EvalJudgeService:
|
||||
logger.warning("conversation compare stream failed: {}", e)
|
||||
yield f"\n\n[流式输出中断:{e}]"
|
||||
|
||||
async def judge_memoir(self, *, memoir_markdown: str) -> MemoirJudgeOutput | None:
|
||||
async def judge_memoir(
|
||||
self,
|
||||
*,
|
||||
memoir_markdown: str,
|
||||
source_transcript: str = "",
|
||||
reference_memoir_markdown: str = "",
|
||||
evidence_notes: str = "",
|
||||
) -> MemoirJudgeOutput | None:
|
||||
if not self._llm:
|
||||
return None
|
||||
prompt = f"""{MEMOIR_JUDGE_INSTRUCTIONS}
|
||||
|
||||
【回忆录正文】
|
||||
{memoir_markdown[:_MEMOIR_MAX]}
|
||||
"""
|
||||
prompt = _build_memoir_judge_prompt(
|
||||
memoir_markdown=memoir_markdown,
|
||||
source_transcript=source_transcript,
|
||||
reference_memoir_markdown=reference_memoir_markdown,
|
||||
evidence_notes=evidence_notes,
|
||||
)
|
||||
try:
|
||||
return await allm_json_call(
|
||||
self._llm,
|
||||
|
||||
@@ -1,31 +1,60 @@
|
||||
"""对话评审 rubric 文本(v1)。"""
|
||||
"""对话评审 rubric 文本(v1 · 访谈「情绪强化版」100 分)。"""
|
||||
|
||||
TURN_JUDGE_INSTRUCTIONS = """你是「岁月留书」访谈对话质量评审。根据下面维度给本轮 AI 回复打分(0-100 为 total_score,各子分上限已注明,子分之和应与 total_score 大体一致)。
|
||||
_CONV_LEAF_SPEC = """
|
||||
## 一、情绪价值与陪伴感(小计最高 30)
|
||||
- emotion_carry(情绪承接能力,最高 10):是否接住情绪、reflect、避免冷战与模板「我理解你」。
|
||||
- empathy_depth(共情深度,最高 8):情绪类型是否准、语境贴合、非空洞安慰。
|
||||
- emotion_safety(情绪安全感,最高 6):非评判、尊重、敏感话题语气、可跳过。
|
||||
- emotion_guidance(情绪引导能力,最高 6):引导感受、关键节点追问情绪、表达更具体。
|
||||
|
||||
维度(参考):
|
||||
- 情绪承接与共情(emotion_score,最高 30)
|
||||
- 信息获取与追问(information_score,最高 20)
|
||||
- 结构化访谈推进(structure_score,最高 10)
|
||||
- 提问质量(question_score,最高 10)
|
||||
- 人物理解与一致性(persona_score,最高 10)
|
||||
- 重复抑制(repetition_score,最高 10):是否重复了上 1~2 轮已问过的问题或同一资料槽;高度重复则低分
|
||||
- 自然流畅(naturalness_score,最高 10):是否像朋友聊天;有无不必要采访腔、总结腔、流程感
|
||||
## 二、信息获取能力(小计最高 25)
|
||||
- fact_mining(关键事实挖掘,最高 8)
|
||||
- info_completeness_guide(信息完整性引导,最高 8)
|
||||
- info_depth_mining(信息深度挖掘,最高 9):为何、动机与影响。
|
||||
|
||||
输出 JSON:**json** 字段名如下:
|
||||
total_score, emotion_score, information_score, structure_score, question_score, persona_score, repetition_score, naturalness_score, rationale
|
||||
## 三、人物建模能力(小计最高 15)
|
||||
- persona_understanding(人物理解,最高 7)
|
||||
- persona_consistency_verify(人物一致性验证,最高 4):矛盾澄清。
|
||||
- persona_expression_guide(人物表达引导,最高 4):「你是谁」层面。
|
||||
|
||||
只输出 JSON。"""
|
||||
## 四、结构化引导(小计最高 15)
|
||||
- interview_structure(访谈结构,最高 6):阶段与逻辑。
|
||||
- context_memory(上下文记忆,最高 5):关联前文;**重复盘问、同一槽位反复**在此项扣分。
|
||||
- rhythm_control(节奏控制,最高 4):自然;**采访腔、总结腔、流程感**在此项与情绪项综合体现。
|
||||
|
||||
## 五、提问质量(小计最高 15)
|
||||
- question_quality(问题质量,最高 7):开放、具体。
|
||||
- follow_up_depth(追问能力,最高 5)
|
||||
- non_leading(非引导性,最高 3)
|
||||
|
||||
输出 JSON 字段(仅这些键;务必含 rationale):
|
||||
emotion_carry, empathy_depth, emotion_safety, emotion_guidance,
|
||||
fact_mining, info_completeness_guide, info_depth_mining,
|
||||
persona_understanding, persona_consistency_verify, persona_expression_guide,
|
||||
interview_structure, context_memory, rhythm_control,
|
||||
question_quality, follow_up_depth, non_leading,
|
||||
total_score, rationale
|
||||
|
||||
`total_score` 必须等于上述 15 个细项之和(满分 100)。
|
||||
聚合分 emotion_score、information_score、persona_score、structure_score、question_score 可不填(服务端会重算)。
|
||||
只输出 JSON。
|
||||
"""
|
||||
|
||||
|
||||
CONV_JUDGE_INSTRUCTIONS = """你是访谈整段对话评审。给定完整 transcript(用户与 AI 多轮),打一个综合 total_score(0-100)。
|
||||
TURN_JUDGE_INSTRUCTIONS = (
|
||||
"你是「岁月留书」访谈对话质量评审,按下列 **情绪强化版** rubric 为本轮 AI 回复打分。\n"
|
||||
+ _CONV_LEAF_SPEC
|
||||
)
|
||||
|
||||
dimension_scores 建议至少包含:emotion, information, structure, repetition, naturalness(各 0-100 相对分量即可),用于反映整段是否重复盘问、是否自然;另可有 rationale。
|
||||
|
||||
只输出 JSON:total_score, dimension_scores, rationale。"""
|
||||
CONV_JUDGE_INSTRUCTIONS = (
|
||||
"你是访谈整段对话评审。给定完整 transcript(用户与 AI 多轮),按与单轮**相同**的 15 项细项与满分上限,"
|
||||
"对**整段对话表现**打一次分;`total_score` 为细项之和(100)。\n" + _CONV_LEAF_SPEC
|
||||
)
|
||||
|
||||
|
||||
COMPARE_CONV_STREAM_HINT = """你是访谈对话评测专家。下面给出一份「回放/新测」完整对话 transcript 及其整体评分(JSON)。请用中文直接写正文(不要用 JSON):
|
||||
1) 对这段对话的整体评价与风险点;
|
||||
1) 对这段对话的整体评价与风险点(对照情绪承接、信息挖掘、人物、结构、提问质量等);
|
||||
2) 可操作的改进建议(提示词、流程、模型参数等)。
|
||||
|
||||
笔调简洁、可执行。"""
|
||||
|
||||
@@ -1,11 +1,73 @@
|
||||
"""回忆录成稿评审 rubric 文本(v1)。"""
|
||||
"""回忆录成稿评审 rubric 文本(v1 · 子项上限合计 100 分制)。
|
||||
|
||||
MEMOIR_JUDGE_INSTRUCTIONS = """你是「岁月留书」回忆录成稿评审。根据真实性与覆盖、信息质量、叙事结构、语言文笔、情感、人物、连贯性、表达丰富度、出版就绪等,给出分项分(上限与 total_score 满分 100 一致)。
|
||||
说明:原产品表九个大类上限之和为 110;本 rubric 将各细项上限整档收紧,使九类小计之和为 100,
|
||||
便于与 `total_score` 直接一致,无需再折算。
|
||||
"""
|
||||
|
||||
输出 JSON 字段:
|
||||
total_score,
|
||||
authenticity_score, information_score, narrative_score, language_score,
|
||||
emotion_score, character_score, coherence_score, richness_score, publish_ready_score,
|
||||
rationale
|
||||
_MEMOIR_RUBRIC_BODY = """
|
||||
你必须按下列一级维度与子项及其**满分上限**打分;**全部细项得分之和须等于 `total_score`,且满分合计为 100**。
|
||||
|
||||
## 一、真实性与覆盖(小计最高 23)
|
||||
1. mem_fidelity(记忆忠实度,最高 9):hallucination、夸大/弱化/改写、因果关系、未证实推测、AI 补全编造。
|
||||
2. mem_factual_accuracy(事实准确性,最高 5):时间、人物关系、顺序、内部矛盾、数值细节。
|
||||
3. mem_factual_coverage(事实覆盖率,最高 5):关键/高情感事件、关键人物与细节是否遗漏。
|
||||
4. mem_traceability(记忆可追溯性,最高 4):与原始对话映射、来源模糊、证据与语义保持。
|
||||
|
||||
## 二、信息质量(小计最高 14)
|
||||
5. info_slot_coverage(槽位覆盖度,最高 6)
|
||||
6. info_sufficiency(信息充分性,最高 4)
|
||||
7. info_density(信息密度,最高 4)
|
||||
|
||||
## 三、叙事结构(小计最高 14)
|
||||
8. narr_structure(故事结构,最高 6)
|
||||
9. narr_paragraphs(段落组织,最高 5)
|
||||
10. narr_pacing(节奏控制,最高 3)
|
||||
|
||||
## 四、语言与文笔(小计最高 18)
|
||||
11. lang_fluency(语言流畅度,最高 3)
|
||||
12. lang_conciseness(表达精炼度,最高 3)
|
||||
13. lang_literary(文笔质量,最高 4)
|
||||
14. lang_controlled_expansion(控制性扩写,最高 4)
|
||||
15. lang_detail(细节还原与强化,最高 2)
|
||||
16. lang_style(风格一致性,最高 2)
|
||||
|
||||
## 五、情感表达(小计最高 9)
|
||||
17. emo_authenticity(情感真实度,最高 5)
|
||||
18. emo_depth(情感深度,最高 4)
|
||||
|
||||
## 六、人物建模(小计最高 9)
|
||||
19. char_understanding(人物理解,最高 4)
|
||||
20. char_consistency(人物一致性,最高 3)
|
||||
21. char_integration(人物融入度,最高 2)
|
||||
|
||||
## 七、连贯性(小计最高 4)
|
||||
22. coh_timeline(时间线一致性,最高 2)
|
||||
23. coh_cross_chapter(跨章节关联,最高 2)
|
||||
|
||||
## 八、表达丰富度(小计最高 5)
|
||||
24. rich_analogy(类比与引用,最高 3)
|
||||
25. rich_diversity(表达多样性,最高 2)
|
||||
|
||||
## 九、出版就绪度(小计最高 4)
|
||||
26. pub_editorial_cost(编辑成本,最高 2)
|
||||
27. pub_completeness(完整度,最高 2)
|
||||
|
||||
输出 JSON 字段(仅这些键;分值浮点;务必含 rationale 中文简述):
|
||||
mem_fidelity, mem_factual_accuracy, mem_factual_coverage, mem_traceability,
|
||||
info_slot_coverage, info_sufficiency, info_density,
|
||||
narr_structure, narr_paragraphs, narr_pacing,
|
||||
lang_fluency, lang_conciseness, lang_literary, lang_controlled_expansion, lang_detail, lang_style,
|
||||
emo_authenticity, emo_depth,
|
||||
char_understanding, char_consistency, char_integration,
|
||||
coh_timeline, coh_cross_chapter,
|
||||
rich_analogy, rich_diversity,
|
||||
pub_editorial_cost, pub_completeness,
|
||||
total_score, rationale
|
||||
|
||||
一级聚合分 authenticity_score、information_score、narrative_score、language_score、emotion_score、character_score、coherence_score、richness_score、publish_ready_score 可不填(服务端会按细项重算)。
|
||||
只输出 JSON。"""
|
||||
|
||||
MEMOIR_JUDGE_INSTRUCTIONS = (
|
||||
"你是「岁月留书」回忆录成稿评审,必须严格按照下列 rubric 打分。\n"
|
||||
+ _MEMOIR_RUBRIC_BODY
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user