Files
life-echo/api/app/features/evaluation/judge_schemas.py

334 lines
14 KiB
Python
Raw Normal View History

"""评审 LLM 结构化输出json_object
成稿回忆录子项上限已自洽为 **总分 100**(由原 110 分表等比例收紧为整数档,见附件 rubric)。
"""
from __future__ import annotations
from typing import Any, Self
from pydantic import BaseModel, ConfigDict, Field, model_validator
class JudgeEvidenceRef(BaseModel):
    """Evidence citation attached to a judge verdict so a human can re-check it.

    A reference locates either a dialogue turn or a passage of the finished draft.
    Keys not declared here are ignored on input (``extra="ignore"``).
    """

    model_config = ConfigDict(extra="ignore")

    # Name of the scoring dimension this evidence supports; empty when not given.
    dimension: str = Field(default="")
    # Index of the referenced turn. -1 is both the default and the lowest accepted
    # value -- presumably "not tied to a specific turn"; confirm against the prompt.
    turn_index: int = Field(default=-1, ge=-1)
    # Quoted excerpt; values longer than 400 characters fail validation.
    snippet: str = Field(default="", max_length=400)
def _is_judge_list_placeholder_empty(s: str) -> bool:
"""LLM 有时输出单句占位(如 'None identified.')而非 JSON 数组,按空列表处理。"""
t = s.strip()
if not t:
return True
tl = t.lower().rstrip(".")
if tl in (
"none",
"none identified",
"n/a",
"na",
"-",
"nil",
"null",
"no issues",
"no issue",
"not applicable",
):
return True
tc = t.rstrip("")
if tc in ("", "暂无", "未发现", "没有"):
return True
return False
def _coerce_judge_str_list(value: Any) -> list[Any]:
"""将评审 JSON 中的 list[str] 字段从 str / null 规范为列表(兼容 GLM-5 等输出的非数组形态)。"""
if value is None:
return []
if isinstance(value, list):
return value
if isinstance(value, str):
s = value.strip()
if _is_judge_list_placeholder_empty(s):
return []
return [s]
return []
class TurnJudgeOutput(BaseModel):
    """Judge output for dialogue quality, for a single turn or a whole conversation.

    Emotion-weighted 100-point rubric made of 16 sub-items in five groups. The
    sub-items are the single source of truth: after validation ``total_score`` and
    the five group scores are recomputed from them, replacing whatever was supplied.
    Keys not declared here are ignored on input (``extra="ignore"``).
    """

    model_config = ConfigDict(extra="ignore")

    # 1. Emotional value and companionship (group max 30)
    emotion_carry: float = Field(ge=0, le=10, description="情绪承接能力")
    empathy_depth: float = Field(ge=0, le=8, description="共情深度")
    emotion_safety: float = Field(ge=0, le=6, description="情绪安全感")
    emotion_guidance: float = Field(ge=0, le=6, description="情绪引导能力")

    # 2. Information gathering (group max 25)
    fact_mining: float = Field(ge=0, le=8, description="关键事实挖掘")
    info_completeness_guide: float = Field(ge=0, le=8, description="信息完整性引导")
    info_depth_mining: float = Field(ge=0, le=9, description="信息深度挖掘")

    # 3. Persona modelling (group max 15)
    persona_understanding: float = Field(ge=0, le=7, description="人物理解")
    persona_consistency_verify: float = Field(ge=0, le=4, description="人物一致性验证")
    persona_expression_guide: float = Field(ge=0, le=4, description="人物表达引导")

    # 4. Structured guidance (group max 15)
    interview_structure: float = Field(ge=0, le=6, description="访谈结构")
    context_memory: float = Field(ge=0, le=5, description="上下文记忆")
    rhythm_control: float = Field(ge=0, le=4, description="节奏控制")

    # 5. Question quality (group max 15)
    question_quality: float = Field(ge=0, le=7, description="问题质量")
    follow_up_depth: float = Field(ge=0, le=5, description="追问能力")
    non_leading: float = Field(ge=0, le=3, description="非引导性")

    # Required on input and range-checked, then overwritten by the after-validator.
    total_score: float = Field(ge=0, le=100)
    rationale: str = ""
    major_strengths: list[str] = Field(default_factory=list)
    major_issues: list[str] = Field(default_factory=list)
    insufficient_evidence: list[str] = Field(default_factory=list)
    evidence_refs: list[JudgeEvidenceRef] = Field(default_factory=list)
    confidence: float = Field(default=0.75, ge=0.0, le=1.0)

    # Top-level group scores kept for alignment with historical JSON. They are
    # derived from the sub-items, so the model may leave them out.
    emotion_score: float = Field(default=0, ge=0, le=30)
    information_score: float = Field(default=0, ge=0, le=25)
    persona_score: float = Field(default=0, ge=0, le=15)
    structure_score: float = Field(default=0, ge=0, le=15)
    question_score: float = Field(default=0, ge=0, le=15)

    @model_validator(mode="before")
    @classmethod
    def _coerce_null_lists(cls, data: Any) -> Any:
        """Normalise the list-typed meta fields before field validation runs.

        The three ``list[str]`` fields go through ``_coerce_judge_str_list`` and a
        missing or null ``evidence_refs`` becomes an empty list. Non-dict input is
        returned unchanged.

        Returns:
            A shallow copy of ``data`` with the normalised values, so the dict the
            caller passed in is never modified.
        """
        if not isinstance(data, dict):
            return data
        # Work on a copy: writing into ``data`` would alter the caller's payload.
        normalised = dict(data)
        for key in ("major_strengths", "major_issues", "insufficient_evidence"):
            normalised[key] = _coerce_judge_str_list(normalised.get(key))
        if normalised.get("evidence_refs") is None:
            normalised["evidence_refs"] = []
        return normalised

    @model_validator(mode="after")
    def _cap_meta_fields_and_sync_totals(self) -> Self:
        """Trim the meta lists and recompute every derived score from the sub-items.

        Returns:
            The same instance with capped lists, ``total_score`` set to the sub-item
            sum (rounded to 2 decimals and clamped to 0..100) and the five group
            scores set to their group sums.
        """

        def _cap_str_list(xs: list[str], *, max_items: int, max_chars: int) -> list[str]:
            """Strip entries, drop blank ones, then keep at most ``max_items`` of them."""
            stripped = (str(x).strip() for x in xs)
            # Filter before slicing so blank entries do not use up the item quota.
            return [s[:max_chars] for s in stripped if s][:max_items]

        # object.__setattr__ stores each value directly instead of going through the
        # model's own attribute-assignment hook.
        for name, max_items in (
            ("major_strengths", 8),
            ("major_issues", 10),
            ("insufficient_evidence", 10),
        ):
            capped = _cap_str_list(getattr(self, name), max_items=max_items, max_chars=200)
            object.__setattr__(self, name, capped)
        object.__setattr__(self, "evidence_refs", list(self.evidence_refs)[:12])

        emotion = (
            self.emotion_carry
            + self.empathy_depth
            + self.emotion_safety
            + self.emotion_guidance
        )
        information = (
            self.fact_mining + self.info_completeness_guide + self.info_depth_mining
        )
        persona = (
            self.persona_understanding
            + self.persona_consistency_verify
            + self.persona_expression_guide
        )
        structure = self.interview_structure + self.context_memory + self.rhythm_control
        question = self.question_quality + self.follow_up_depth + self.non_leading
        expected = emotion + information + persona + structure + question

        # The sub-items are the single source of truth: LLMs often report a
        # total_score (e.g. 100) that disagrees with the items they scored, such as
        # four groups at full marks while the emotion group is one point short.
        synced = max(0.0, min(100.0, round(float(expected), 2)))
        object.__setattr__(self, "total_score", synced)
        object.__setattr__(self, "emotion_score", emotion)
        object.__setattr__(self, "information_score", information)
        object.__setattr__(self, "persona_score", persona)
        object.__setattr__(self, "structure_score", structure)
        object.__setattr__(self, "question_score", question)
        return self
# A whole transcript is judged with the same sub-items as a single turn.
ConversationJudgeOutput = TurnJudgeOutput
class MemoirJudgeOutput(BaseModel):
    """Judge output for a finished memoir draft (100 points in total).

    Sub-item caps follow the rubric and are grouped into nine categories. The
    sub-items are the single source of truth: after validation ``total_score`` and
    the nine group scores are recomputed from them, replacing whatever was supplied.
    Keys not declared here are ignored on input (``extra="ignore"``).
    """

    model_config = ConfigDict(extra="ignore")

    # 1. Authenticity and coverage (group max 23, tightened from the original 25)
    mem_fidelity: float = Field(ge=0, le=9, description="记忆忠实度")
    mem_factual_accuracy: float = Field(ge=0, le=5, description="事实准确性")
    mem_factual_coverage: float = Field(ge=0, le=5, description="事实覆盖率")
    mem_traceability: float = Field(ge=0, le=4, description="记忆可追溯性")

    # 2. Information quality (group max 14, tightened from the original 15)
    info_slot_coverage: float = Field(ge=0, le=6, description="槽位覆盖度")
    info_sufficiency: float = Field(ge=0, le=4, description="信息充分性")
    info_density: float = Field(ge=0, le=4, description="信息密度")

    # 3. Narrative structure (group max 14, tightened from the original 15)
    narr_structure: float = Field(ge=0, le=6, description="故事结构")
    narr_paragraphs: float = Field(ge=0, le=5, description="段落组织")
    narr_pacing: float = Field(ge=0, le=3, description="节奏控制")

    # 4. Language and writing (group max 18; the original 20 and the six item caps
    #    were tightened together)
    lang_fluency: float = Field(ge=0, le=3, description="语言流畅度")
    lang_conciseness: float = Field(ge=0, le=3, description="表达精炼度")
    lang_literary: float = Field(ge=0, le=4, description="文笔质量")
    lang_controlled_expansion: float = Field(ge=0, le=4, description="控制性扩写能力")
    lang_detail: float = Field(ge=0, le=2, description="细节还原与强化")
    lang_style: float = Field(ge=0, le=2, description="风格一致性")

    # 5. Emotional expression (group max 9, tightened from the original 10)
    emo_authenticity: float = Field(ge=0, le=5, description="情感真实度")
    emo_depth: float = Field(ge=0, le=4, description="情感深度")

    # 6. Character modelling (group max 9, tightened from the original 10)
    char_understanding: float = Field(ge=0, le=4, description="人物理解")
    char_consistency: float = Field(ge=0, le=3, description="人物一致性")
    char_integration: float = Field(ge=0, le=2, description="人物融入度")

    # 7. Coherence (group max 4, tightened from the original 5)
    coh_timeline: float = Field(ge=0, le=2, description="时间线一致性")
    coh_cross_chapter: float = Field(ge=0, le=2, description="跨章节关联")

    # 8. Expressive richness (group max 5)
    rich_analogy: float = Field(ge=0, le=3, description="类比与引用")
    rich_diversity: float = Field(ge=0, le=2, description="表达多样性")

    # 9. Publication readiness (group max 4, tightened from the original 5)
    pub_editorial_cost: float = Field(ge=0, le=2, description="编辑成本")
    pub_completeness: float = Field(ge=0, le=2, description="完整度")

    # Required on input and range-checked, then overwritten by the after-validator.
    total_score: float = Field(ge=0, le=100)
    rationale: str = ""
    major_strengths: list[str] = Field(default_factory=list)
    major_issues: list[str] = Field(default_factory=list)
    insufficient_evidence: list[str] = Field(default_factory=list)
    evidence_refs: list[JudgeEvidenceRef] = Field(default_factory=list)
    confidence: float = Field(default=0.75, ge=0.0, le=1.0)

    # Group scores; each is overwritten with the sum of its sub-items after validation.
    authenticity_score: float = Field(default=0, ge=0, le=23)
    information_score: float = Field(default=0, ge=0, le=14)
    narrative_score: float = Field(default=0, ge=0, le=14)
    language_score: float = Field(default=0, ge=0, le=18)
    emotion_score: float = Field(default=0, ge=0, le=9)
    character_score: float = Field(default=0, ge=0, le=9)
    coherence_score: float = Field(default=0, ge=0, le=4)
    richness_score: float = Field(default=0, ge=0, le=5)
    publish_ready_score: float = Field(default=0, ge=0, le=4)

    @model_validator(mode="before")
    @classmethod
    def _coerce_null_lists(cls, data: Any) -> Any:
        """Normalise the list-typed meta fields before field validation runs.

        The three ``list[str]`` fields go through ``_coerce_judge_str_list`` and a
        missing or null ``evidence_refs`` becomes an empty list. Non-dict input is
        returned unchanged.

        Returns:
            A shallow copy of ``data`` with the normalised values, so the dict the
            caller passed in is never modified.
        """
        if not isinstance(data, dict):
            return data
        # Work on a copy: writing into ``data`` would alter the caller's payload.
        normalised = dict(data)
        for key in ("major_strengths", "major_issues", "insufficient_evidence"):
            normalised[key] = _coerce_judge_str_list(normalised.get(key))
        if normalised.get("evidence_refs") is None:
            normalised["evidence_refs"] = []
        return normalised

    @model_validator(mode="after")
    def _cap_meta_fields_and_sync_totals(self) -> Self:
        """Trim the meta lists and recompute every derived score from the sub-items.

        Returns:
            The same instance with capped lists, ``total_score`` set to the sub-item
            sum (rounded to 2 decimals and clamped to 0..100) and the nine group
            scores set to their group sums.
        """

        def _cap_str_list(xs: list[str], *, max_items: int, max_chars: int) -> list[str]:
            """Strip entries, drop blank ones, then keep at most ``max_items`` of them."""
            stripped = (str(x).strip() for x in xs)
            # Filter before slicing so blank entries do not use up the item quota.
            return [s[:max_chars] for s in stripped if s][:max_items]

        # object.__setattr__ stores each value directly instead of going through the
        # model's own attribute-assignment hook.
        for name, max_items in (
            ("major_strengths", 8),
            ("major_issues", 10),
            ("insufficient_evidence", 12),
        ):
            capped = _cap_str_list(getattr(self, name), max_items=max_items, max_chars=200)
            object.__setattr__(self, name, capped)
        object.__setattr__(self, "evidence_refs", list(self.evidence_refs)[:12])

        authenticity = (
            self.mem_fidelity
            + self.mem_factual_accuracy
            + self.mem_factual_coverage
            + self.mem_traceability
        )
        information = (
            self.info_slot_coverage + self.info_sufficiency + self.info_density
        )
        narrative = self.narr_structure + self.narr_paragraphs + self.narr_pacing
        language = (
            self.lang_fluency
            + self.lang_conciseness
            + self.lang_literary
            + self.lang_controlled_expansion
            + self.lang_detail
            + self.lang_style
        )
        emotion = self.emo_authenticity + self.emo_depth
        character = (
            self.char_understanding + self.char_consistency + self.char_integration
        )
        coherence = self.coh_timeline + self.coh_cross_chapter
        richness = self.rich_analogy + self.rich_diversity
        publish = self.pub_editorial_cost + self.pub_completeness
        expected = (
            authenticity
            + information
            + narrative
            + language
            + emotion
            + character
            + coherence
            + richness
            + publish
        )

        # The reported total is replaced by the sub-item sum.
        synced = max(0.0, min(100.0, round(float(expected), 2)))
        object.__setattr__(self, "total_score", synced)
        object.__setattr__(self, "authenticity_score", authenticity)
        object.__setattr__(self, "information_score", information)
        object.__setattr__(self, "narrative_score", narrative)
        object.__setattr__(self, "language_score", language)
        object.__setattr__(self, "emotion_score", emotion)
        object.__setattr__(self, "character_score", character)
        object.__setattr__(self, "coherence_score", coherence)
        object.__setattr__(self, "richness_score", richness)
        object.__setattr__(self, "publish_ready_score", publish)
        return self