Files
life-echo/api/app/features/evaluation/judge_schemas.py
Kevin 6772e1269c feat(evaluation): memoir readiness, judge/replay updates, eval web playground
Add memoir_readiness_service and router tests; extend judge schemas/services, replay_service, and conversation rubric; align story route agent, payload, prompts, and story_pipeline_sync; update agent logging, config, and DI. Document internal-eval; add replayDraft util and PlaygroundPage changes in app-eval-web.
2026-04-08 09:43:34 +08:00

334 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""评审 LLM 结构化输出json_object
成稿(回忆录)子项上限已自洽为 **总分 100**(由原 110 分表等比例收紧整数档,见附件 rubric)。
"""
from __future__ import annotations
from typing import Any, Self
from pydantic import BaseModel, ConfigDict, Field, model_validator
class JudgeEvidenceRef(BaseModel):
    """Judge citation kept so a human can re-check a verdict.

    Locates either a conversation turn or a passage of the finished memoir.
    Keys other than the three fields below are ignored on input.
    """

    model_config = ConfigDict(extra="ignore")

    # Name of the rubric dimension the citation supports; empty when omitted.
    dimension: str = ""
    # Turn position in the conversation; the default -1 is also the lowest
    # value accepted.
    turn_index: int = Field(-1, ge=-1)
    # Quoted excerpt; inputs longer than 400 characters fail validation.
    snippet: str = Field("", max_length=400)
def _is_judge_list_placeholder_empty(s: str) -> bool:
"""LLM 有时输出单句占位(如 'None identified.')而非 JSON 数组,按空列表处理。"""
t = s.strip()
if not t:
return True
tl = t.lower().rstrip(".")
if tl in (
"none",
"none identified",
"n/a",
"na",
"-",
"nil",
"null",
"no issues",
"no issue",
"not applicable",
):
return True
tc = t.rstrip("")
if tc in ("", "暂无", "未发现", "没有"):
return True
return False
def _coerce_judge_str_list(value: Any) -> list[Any]:
"""将评审 JSON 中的 list[str] 字段从 str / null 规范为列表(兼容 GLM-5 等输出的非数组形态)。"""
if value is None:
return []
if isinstance(value, list):
return value
if isinstance(value, str):
s = value.strip()
if _is_judge_list_placeholder_empty(s):
return []
return [s]
return []
class TurnJudgeOutput(BaseModel):
    """Judge verdict on conversation quality for one turn or a whole transcript.

    The rubric is an emotion-weighted 100-point scale split into five groups
    (emotion 30, information 25, persona 15, structure 15, questioning 15)
    made of sixteen sub-items. The sub-items are authoritative:
    ``total_score`` and the five group aggregates are recomputed from them
    after validation, whatever the judge supplied. Unknown keys in the input
    are ignored.
    """

    model_config = ConfigDict(extra="ignore")

    # 1. Emotional value and companionship (30 points)
    emotion_carry: float = Field(ge=0, le=10, description="情绪承接能力")
    empathy_depth: float = Field(ge=0, le=8, description="共情深度")
    emotion_safety: float = Field(ge=0, le=6, description="情绪安全感")
    emotion_guidance: float = Field(ge=0, le=6, description="情绪引导能力")

    # 2. Information gathering (25 points)
    fact_mining: float = Field(ge=0, le=8, description="关键事实挖掘")
    info_completeness_guide: float = Field(ge=0, le=8, description="信息完整性引导")
    info_depth_mining: float = Field(ge=0, le=9, description="信息深度挖掘")

    # 3. Persona modelling (15 points)
    persona_understanding: float = Field(ge=0, le=7, description="人物理解")
    persona_consistency_verify: float = Field(ge=0, le=4, description="人物一致性验证")
    persona_expression_guide: float = Field(ge=0, le=4, description="人物表达引导")

    # 4. Structured guidance (15 points)
    interview_structure: float = Field(ge=0, le=6, description="访谈结构")
    context_memory: float = Field(ge=0, le=5, description="上下文记忆")
    rhythm_control: float = Field(ge=0, le=4, description="节奏控制")

    # 5. Question quality (15 points)
    question_quality: float = Field(ge=0, le=7, description="问题质量")
    follow_up_depth: float = Field(ge=0, le=5, description="追问能力")
    non_leading: float = Field(ge=0, le=3, description="非引导性")

    total_score: float = Field(ge=0, le=100)
    rationale: str = ""
    major_strengths: list[str] = Field(default_factory=list)
    major_issues: list[str] = Field(default_factory=list)
    insufficient_evidence: list[str] = Field(default_factory=list)
    evidence_refs: list[JudgeEvidenceRef] = Field(default_factory=list)
    confidence: float = Field(default=0.75, ge=0.0, le=1.0)

    # Group aggregates kept in line with historical JSON. They are derived
    # from the sub-items by the validator below, so the judge may omit them.
    emotion_score: float = Field(default=0, ge=0, le=30)
    information_score: float = Field(default=0, ge=0, le=25)
    persona_score: float = Field(default=0, ge=0, le=15)
    structure_score: float = Field(default=0, ge=0, le=15)
    question_score: float = Field(default=0, ge=0, le=15)

    @model_validator(mode="before")
    @classmethod
    def _coerce_null_lists(cls, data: Any) -> Any:
        """Normalise list-typed fields before field validation runs.

        The three string-list fields go through ``_coerce_judge_str_list``
        and a missing or null ``evidence_refs`` becomes an empty list.

        Args:
            data: Raw validator input; anything that is not a dict is
                returned untouched.

        Returns:
            A patched shallow copy of ``data`` when it is a dict, otherwise
            ``data`` itself.
        """
        if not isinstance(data, dict):
            return data
        # Patch a shallow copy so the caller's payload is not rewritten as a
        # side effect of validation.
        patched = dict(data)
        for key in ("major_strengths", "major_issues", "insufficient_evidence"):
            patched[key] = _coerce_judge_str_list(patched.get(key))
        if patched.get("evidence_refs") is None:
            patched["evidence_refs"] = []
        return patched

    @model_validator(mode="after")
    def _cap_meta_fields_and_sync_totals(self) -> Self:
        """Trim the free-text lists and recompute every derived score.

        Returns:
            The same instance, with the string lists capped in length, at
            most 12 evidence references, and ``total_score`` plus the five
            group aggregates overwritten from the sub-items.
        """

        def _cap_str_list(xs: list[str], *, max_items: int, max_chars: int) -> list[str]:
            # Only the first ``max_items`` entries are considered; blank ones
            # among them are dropped and the rest cut to ``max_chars``.
            out: list[str] = []
            for x in xs[:max_items]:
                s = str(x).strip()
                if s:
                    out.append(s[:max_chars])
            return out

        # object.__setattr__ writes the attribute directly, bypassing
        # BaseModel.__setattr__.
        for field_name, max_items in (
            ("major_strengths", 8),
            ("major_issues", 10),
            ("insufficient_evidence", 10),
        ):
            object.__setattr__(
                self,
                field_name,
                _cap_str_list(getattr(self, field_name), max_items=max_items, max_chars=200),
            )
        refs = list(self.evidence_refs)[:12]
        object.__setattr__(self, "evidence_refs", refs)

        emotion = (
            self.emotion_carry
            + self.empathy_depth
            + self.emotion_safety
            + self.emotion_guidance
        )
        information = (
            self.fact_mining + self.info_completeness_guide + self.info_depth_mining
        )
        persona = (
            self.persona_understanding
            + self.persona_consistency_verify
            + self.persona_expression_guide
        )
        structure = self.interview_structure + self.context_memory + self.rhythm_control
        question = self.question_quality + self.follow_up_depth + self.non_leading
        expected = emotion + information + persona + structure + question

        # The sub-items are the single source of truth: judges often report a
        # total_score (for example 100) that disagrees with the items they
        # actually scored, such as an emotion group one point short.
        synced = max(0.0, min(100.0, round(float(expected), 2)))
        object.__setattr__(self, "total_score", synced)
        object.__setattr__(self, "emotion_score", emotion)
        object.__setattr__(self, "information_score", information)
        object.__setattr__(self, "persona_score", persona)
        object.__setattr__(self, "structure_score", structure)
        object.__setattr__(self, "question_score", question)
        return self
# A whole transcript is judged with the same sub-items as a single turn.
ConversationJudgeOutput = TurnJudgeOutput
class MemoirJudgeOutput(BaseModel):
    """Judge score sheet for a finished memoir (100 points in total).

    Per-item caps follow the rubric; the section comments below record each
    group's subtotal. The sub-items are authoritative: ``total_score`` and the
    nine group aggregates are recomputed from them after validation, whatever
    the judge supplied. Unknown keys in the input are ignored.
    """

    model_config = ConfigDict(extra="ignore")

    # 1. Authenticity and coverage (subtotal max 23, tightened from 25)
    mem_fidelity: float = Field(ge=0, le=9, description="记忆忠实度")
    mem_factual_accuracy: float = Field(ge=0, le=5, description="事实准确性")
    mem_factual_coverage: float = Field(ge=0, le=5, description="事实覆盖率")
    mem_traceability: float = Field(ge=0, le=4, description="记忆可追溯性")

    # 2. Information quality (subtotal max 14, tightened from 15)
    info_slot_coverage: float = Field(ge=0, le=6, description="槽位覆盖度")
    info_sufficiency: float = Field(ge=0, le=4, description="信息充分性")
    info_density: float = Field(ge=0, le=4, description="信息密度")

    # 3. Narrative structure (subtotal max 14, tightened from 15)
    narr_structure: float = Field(ge=0, le=6, description="故事结构")
    narr_paragraphs: float = Field(ge=0, le=5, description="段落组织")
    narr_pacing: float = Field(ge=0, le=3, description="节奏控制")

    # 4. Language and prose (subtotal max 18; the former 20 and the six item
    #    caps were tightened together)
    lang_fluency: float = Field(ge=0, le=3, description="语言流畅度")
    lang_conciseness: float = Field(ge=0, le=3, description="表达精炼度")
    lang_literary: float = Field(ge=0, le=4, description="文笔质量")
    lang_controlled_expansion: float = Field(ge=0, le=4, description="控制性扩写能力")
    lang_detail: float = Field(ge=0, le=2, description="细节还原与强化")
    lang_style: float = Field(ge=0, le=2, description="风格一致性")

    # 5. Emotional expression (subtotal max 9, tightened from 10)
    emo_authenticity: float = Field(ge=0, le=5, description="情感真实度")
    emo_depth: float = Field(ge=0, le=4, description="情感深度")

    # 6. Character modelling (subtotal max 9, tightened from 10)
    char_understanding: float = Field(ge=0, le=4, description="人物理解")
    char_consistency: float = Field(ge=0, le=3, description="人物一致性")
    char_integration: float = Field(ge=0, le=2, description="人物融入度")

    # 7. Coherence (subtotal max 4, tightened from 5)
    coh_timeline: float = Field(ge=0, le=2, description="时间线一致性")
    coh_cross_chapter: float = Field(ge=0, le=2, description="跨章节关联")

    # 8. Expressive richness (subtotal max 5)
    rich_analogy: float = Field(ge=0, le=3, description="类比与引用")
    rich_diversity: float = Field(ge=0, le=2, description="表达多样性")

    # 9. Publication readiness (subtotal max 4, tightened from 5)
    pub_editorial_cost: float = Field(ge=0, le=2, description="编辑成本")
    pub_completeness: float = Field(ge=0, le=2, description="完整度")

    total_score: float = Field(ge=0, le=100)
    rationale: str = ""
    major_strengths: list[str] = Field(default_factory=list)
    major_issues: list[str] = Field(default_factory=list)
    insufficient_evidence: list[str] = Field(default_factory=list)
    evidence_refs: list[JudgeEvidenceRef] = Field(default_factory=list)
    confidence: float = Field(default=0.75, ge=0.0, le=1.0)

    # Group aggregates; always overwritten from the sub-items by the
    # validator below.
    authenticity_score: float = Field(default=0, ge=0, le=23)
    information_score: float = Field(default=0, ge=0, le=14)
    narrative_score: float = Field(default=0, ge=0, le=14)
    language_score: float = Field(default=0, ge=0, le=18)
    emotion_score: float = Field(default=0, ge=0, le=9)
    character_score: float = Field(default=0, ge=0, le=9)
    coherence_score: float = Field(default=0, ge=0, le=4)
    richness_score: float = Field(default=0, ge=0, le=5)
    publish_ready_score: float = Field(default=0, ge=0, le=4)

    @model_validator(mode="before")
    @classmethod
    def _coerce_null_lists(cls, data: Any) -> Any:
        """Normalise list-typed fields before field validation runs.

        The three string-list fields go through ``_coerce_judge_str_list``
        and a missing or null ``evidence_refs`` becomes an empty list.

        Args:
            data: Raw validator input; anything that is not a dict is
                returned untouched.

        Returns:
            A patched shallow copy of ``data`` when it is a dict, otherwise
            ``data`` itself.
        """
        if not isinstance(data, dict):
            return data
        # Patch a shallow copy so the caller's payload is not rewritten as a
        # side effect of validation.
        patched = dict(data)
        for key in ("major_strengths", "major_issues", "insufficient_evidence"):
            patched[key] = _coerce_judge_str_list(patched.get(key))
        if patched.get("evidence_refs") is None:
            patched["evidence_refs"] = []
        return patched

    @model_validator(mode="after")
    def _cap_meta_fields_and_sync_totals(self) -> Self:
        """Trim the free-text lists and recompute every derived score.

        Returns:
            The same instance, with the string lists capped in length, at
            most 12 evidence references, and ``total_score`` plus the nine
            group aggregates overwritten from the sub-items.
        """

        def _cap_str_list(xs: list[str], *, max_items: int, max_chars: int) -> list[str]:
            # Only the first ``max_items`` entries are considered; blank ones
            # among them are dropped and the rest cut to ``max_chars``.
            out: list[str] = []
            for x in xs[:max_items]:
                s = str(x).strip()
                if s:
                    out.append(s[:max_chars])
            return out

        # object.__setattr__ writes the attribute directly, bypassing
        # BaseModel.__setattr__.
        for field_name, max_items in (
            ("major_strengths", 8),
            ("major_issues", 10),
            ("insufficient_evidence", 12),
        ):
            object.__setattr__(
                self,
                field_name,
                _cap_str_list(getattr(self, field_name), max_items=max_items, max_chars=200),
            )
        refs = list(self.evidence_refs)[:12]
        object.__setattr__(self, "evidence_refs", refs)

        authenticity = (
            self.mem_fidelity
            + self.mem_factual_accuracy
            + self.mem_factual_coverage
            + self.mem_traceability
        )
        information = (
            self.info_slot_coverage + self.info_sufficiency + self.info_density
        )
        narrative = self.narr_structure + self.narr_paragraphs + self.narr_pacing
        language = (
            self.lang_fluency
            + self.lang_conciseness
            + self.lang_literary
            + self.lang_controlled_expansion
            + self.lang_detail
            + self.lang_style
        )
        emotion = self.emo_authenticity + self.emo_depth
        character = (
            self.char_understanding + self.char_consistency + self.char_integration
        )
        coherence = self.coh_timeline + self.coh_cross_chapter
        richness = self.rich_analogy + self.rich_diversity
        publish = self.pub_editorial_cost + self.pub_completeness
        expected = (
            authenticity
            + information
            + narrative
            + language
            + emotion
            + character
            + coherence
            + richness
            + publish
        )

        # The sub-items are the single source of truth, so the reported total
        # is replaced by their clamped, rounded sum.
        synced = max(0.0, min(100.0, round(float(expected), 2)))
        object.__setattr__(self, "total_score", synced)
        object.__setattr__(self, "authenticity_score", authenticity)
        object.__setattr__(self, "information_score", information)
        object.__setattr__(self, "narrative_score", narrative)
        object.__setattr__(self, "language_score", language)
        object.__setattr__(self, "emotion_score", emotion)
        object.__setattr__(self, "character_score", character)
        object.__setattr__(self, "coherence_score", coherence)
        object.__setattr__(self, "richness_score", richness)
        object.__setattr__(self, "publish_ready_score", publish)
        return self