Files
life-echo/api/app/features/evaluation/judge_schemas.py
Kevin 5972b0e721 feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整
- 回忆录细项上限收紧为合计 100 分,去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线;无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据(会话/用户聚合、截断)
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD;移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试
2026-04-07 10:36:22 +08:00

200 lines
9.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""评审 LLM 结构化输出json_object
成稿(回忆录)子项上限已自洽为 **总分 100**(由原 110 分表等比例收紧整数档,见附件 rubric)。
"""
from __future__ import annotations
from typing import Self
from pydantic import BaseModel, Field, model_validator
class TurnJudgeOutput(BaseModel):
    """Quality scores for a single turn or a whole conversation.

    Emotion-enhanced rubric worth 100 points, split over five groups whose
    item caps add up to 30 + 25 + 15 + 15 + 15.  Every fine-grained item is
    bounded by its own ``ge``/``le`` limits.  After field validation the
    five ``*_score`` group aggregates are recomputed from the items, so any
    values supplied for them in the input are overwritten.

    NOTE(review): the rubric wording refers to 15 fine-grained items, while
    16 item fields are declared below -- confirm which is intended.
    """

    # 1. Emotional value and companionship (max 30)
    emotion_carry: float = Field(ge=0, le=10, description="情绪承接能力")
    empathy_depth: float = Field(ge=0, le=8, description="共情深度")
    emotion_safety: float = Field(ge=0, le=6, description="情绪安全感")
    emotion_guidance: float = Field(ge=0, le=6, description="情绪引导能力")

    # 2. Information gathering (max 25)
    fact_mining: float = Field(ge=0, le=8, description="关键事实挖掘")
    info_completeness_guide: float = Field(
        ge=0, le=8, description="信息完整性引导"
    )
    info_depth_mining: float = Field(ge=0, le=9, description="信息深度挖掘")

    # 3. Persona modelling (max 15)
    persona_understanding: float = Field(ge=0, le=7, description="人物理解")
    persona_consistency_verify: float = Field(
        ge=0, le=4, description="人物一致性验证"
    )
    persona_expression_guide: float = Field(
        ge=0, le=4, description="人物表达引导"
    )

    # 4. Structured guidance (max 15)
    interview_structure: float = Field(ge=0, le=6, description="访谈结构")
    context_memory: float = Field(ge=0, le=5, description="上下文记忆")
    rhythm_control: float = Field(ge=0, le=4, description="节奏控制")

    # 5. Question quality (max 15)
    question_quality: float = Field(ge=0, le=7, description="问题质量")
    follow_up_depth: float = Field(ge=0, le=5, description="追问能力")
    non_leading: float = Field(ge=0, le=3, description="非引导性")

    total_score: float = Field(ge=0, le=100)
    rationale: str = ""

    # Group-level aggregates kept for alignment with historical JSON.  They
    # default to 0 when absent from the input and are always replaced by the
    # values derived from the items in the validator below.
    emotion_score: float = Field(default=0, ge=0, le=30)
    information_score: float = Field(default=0, ge=0, le=25)
    persona_score: float = Field(default=0, ge=0, le=15)
    structure_score: float = Field(default=0, ge=0, le=15)
    question_score: float = Field(default=0, ge=0, le=15)

    @model_validator(mode="after")
    def _sync_aggregates_and_total(self) -> Self:
        """Check ``total_score`` against the items, then refresh the aggregates.

        Returns:
            The same instance, with the five ``*_score`` aggregates set to
            the sums of their items.

        Raises:
            ValueError: if ``total_score`` differs from the sum of all items
                by more than 0.51.
        """
        emotion_sum = (
            self.emotion_carry
            + self.empathy_depth
            + self.emotion_safety
            + self.emotion_guidance
        )
        information_sum = (
            self.fact_mining + self.info_completeness_guide + self.info_depth_mining
        )
        persona_sum = (
            self.persona_understanding
            + self.persona_consistency_verify
            + self.persona_expression_guide
        )
        structure_sum = (
            self.interview_structure + self.context_memory + self.rhythm_control
        )
        question_sum = self.question_quality + self.follow_up_depth + self.non_leading

        expected = (
            emotion_sum + information_sum + persona_sum + structure_sum + question_sum
        )
        # Tolerance of 0.51 -- presumably leaves room for a reported total
        # that was rounded to the nearest point; confirm against the rubric.
        deviation = abs(expected - self.total_score)
        if deviation > 0.51:
            raise ValueError(
                f"total_score ({self.total_score}) 与细项合计 ({expected:.2f}) 不一致"
            )

        derived_aggregates = (
            ("emotion_score", emotion_sum),
            ("information_score", information_sum),
            ("persona_score", persona_sum),
            ("structure_score", structure_sum),
            ("question_score", question_sum),
        )
        # Written with object.__setattr__, i.e. not through the model's own
        # attribute-assignment path.
        for field_name, derived_value in derived_aggregates:
            object.__setattr__(self, field_name, derived_value)
        return self
# A whole transcript is scored with the same fine-grained items as a single turn,
# so the conversation-level schema is simply an alias of the turn-level one.
ConversationJudgeOutput = TurnJudgeOutput
class MemoirJudgeOutput(BaseModel):
    """Scores for a finished memoir manuscript (100 points in total).

    The rubric has nine groups whose item caps add up to
    23 + 14 + 14 + 18 + 9 + 9 + 4 + 5 + 4 = 100.  Every item is bounded by
    its own ``ge``/``le`` limits.  After field validation the nine group
    aggregates are recomputed from the items, so any values supplied for
    them in the input are overwritten.
    """

    # 1. Authenticity and coverage (subtotal max 23, tightened from 25)
    mem_fidelity: float = Field(ge=0, le=9, description="记忆忠实度")
    mem_factual_accuracy: float = Field(ge=0, le=5, description="事实准确性")
    mem_factual_coverage: float = Field(ge=0, le=5, description="事实覆盖率")
    mem_traceability: float = Field(ge=0, le=4, description="记忆可追溯性")

    # 2. Information quality (subtotal max 14, tightened from 15)
    info_slot_coverage: float = Field(ge=0, le=6, description="槽位覆盖度")
    info_sufficiency: float = Field(ge=0, le=4, description="信息充分性")
    info_density: float = Field(ge=0, le=4, description="信息密度")

    # 3. Narrative structure (subtotal max 14, tightened from 15)
    narr_structure: float = Field(ge=0, le=6, description="故事结构")
    narr_paragraphs: float = Field(ge=0, le=5, description="段落组织")
    narr_pacing: float = Field(ge=0, le=3, description="节奏控制")

    # 4. Language and prose (subtotal max 18; the original 20 and the six
    #    item caps were tightened together)
    lang_fluency: float = Field(ge=0, le=3, description="语言流畅度")
    lang_conciseness: float = Field(ge=0, le=3, description="表达精炼度")
    lang_literary: float = Field(ge=0, le=4, description="文笔质量")
    lang_controlled_expansion: float = Field(
        ge=0, le=4, description="控制性扩写能力"
    )
    lang_detail: float = Field(ge=0, le=2, description="细节还原与强化")
    lang_style: float = Field(ge=0, le=2, description="风格一致性")

    # 5. Emotional expression (subtotal max 9, tightened from 10)
    emo_authenticity: float = Field(ge=0, le=5, description="情感真实度")
    emo_depth: float = Field(ge=0, le=4, description="情感深度")

    # 6. Character modelling (subtotal max 9, tightened from 10)
    char_understanding: float = Field(ge=0, le=4, description="人物理解")
    char_consistency: float = Field(ge=0, le=3, description="人物一致性")
    char_integration: float = Field(ge=0, le=2, description="人物融入度")

    # 7. Coherence (subtotal max 4, tightened from 5)
    coh_timeline: float = Field(ge=0, le=2, description="时间线一致性")
    coh_cross_chapter: float = Field(ge=0, le=2, description="跨章节关联")

    # 8. Richness of expression (subtotal max 5)
    rich_analogy: float = Field(ge=0, le=3, description="类比与引用")
    rich_diversity: float = Field(ge=0, le=2, description="表达多样性")

    # 9. Publication readiness (subtotal max 4, tightened from 5)
    pub_editorial_cost: float = Field(ge=0, le=2, description="编辑成本")
    pub_completeness: float = Field(ge=0, le=2, description="完整度")

    total_score: float = Field(ge=0, le=100)
    rationale: str = ""

    # Group-level aggregates.  They default to 0 when absent from the input
    # and are always replaced by the item sums in the validator below.
    authenticity_score: float = Field(default=0, ge=0, le=23)
    information_score: float = Field(default=0, ge=0, le=14)
    narrative_score: float = Field(default=0, ge=0, le=14)
    language_score: float = Field(default=0, ge=0, le=18)
    emotion_score: float = Field(default=0, ge=0, le=9)
    character_score: float = Field(default=0, ge=0, le=9)
    coherence_score: float = Field(default=0, ge=0, le=4)
    richness_score: float = Field(default=0, ge=0, le=5)
    publish_ready_score: float = Field(default=0, ge=0, le=4)

    @model_validator(mode="after")
    def _sync_aggregates_and_total(self) -> Self:
        """Check ``total_score`` against the items, then refresh the aggregates.

        Returns:
            The same instance, with the nine group aggregates set to the
            sums of their items.

        Raises:
            ValueError: if ``total_score`` differs from the sum of all items
                by more than 0.51.
        """
        authenticity_sum = (
            self.mem_fidelity
            + self.mem_factual_accuracy
            + self.mem_factual_coverage
            + self.mem_traceability
        )
        information_sum = (
            self.info_slot_coverage + self.info_sufficiency + self.info_density
        )
        narrative_sum = self.narr_structure + self.narr_paragraphs + self.narr_pacing
        language_sum = (
            self.lang_fluency
            + self.lang_conciseness
            + self.lang_literary
            + self.lang_controlled_expansion
            + self.lang_detail
            + self.lang_style
        )
        emotion_sum = self.emo_authenticity + self.emo_depth
        character_sum = (
            self.char_understanding + self.char_consistency + self.char_integration
        )
        coherence_sum = self.coh_timeline + self.coh_cross_chapter
        richness_sum = self.rich_analogy + self.rich_diversity
        publish_sum = self.pub_editorial_cost + self.pub_completeness

        expected = (
            authenticity_sum
            + information_sum
            + narrative_sum
            + language_sum
            + emotion_sum
            + character_sum
            + coherence_sum
            + richness_sum
            + publish_sum
        )
        # Tolerance of 0.51 -- presumably leaves room for a reported total
        # that was rounded to the nearest point; confirm against the rubric.
        deviation = abs(expected - self.total_score)
        if deviation > 0.51:
            raise ValueError(
                f"total_score ({self.total_score}) 与分项合计 ({expected:.2f}) 不一致"
            )

        derived_aggregates = (
            ("authenticity_score", authenticity_sum),
            ("information_score", information_sum),
            ("narrative_score", narrative_sum),
            ("language_score", language_sum),
            ("emotion_score", emotion_sum),
            ("character_score", character_sum),
            ("coherence_score", coherence_sum),
            ("richness_score", richness_sum),
            ("publish_ready_score", publish_sum),
        )
        # Written with object.__setattr__, i.e. not through the model's own
        # attribute-assignment path.
        for field_name, derived_value in derived_aggregates:
            object.__setattr__(self, field_name, derived_value)
        return self