Files
life-echo/api/app/features/evaluation/judge_schemas.py
yangshilin 17b9fa3466 fix:
1. 修复登录界面文字被遮挡问题
2. 大字模式关闭后显示异常问题
3. 重新调整大字模式是否开启时的字体显示效果
2026-04-10 20:35:57 +08:00

445 lines
17 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
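"""Structured output schemas (json_object) for the judge LLM.

Sub-item caps for the final draft (memoir) are self-consistent with a
**total of 100** (the original 110-point table was tightened proportionally
to integer tiers; see the attached rubric).
"""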
from __future__ import annotations
from typing import Any, Self
from pydantic import BaseModel, ConfigDict, Field, model_validator
class JudgeEvidenceRef(BaseModel):
    """A citation attached to a judgement so a human can re-check it.

    Points either at a conversation turn or at a snippet of the final draft.
    Unknown keys in the input are ignored.
    """

    model_config = ConfigDict(extra="ignore")

    # Name of the rubric dimension this evidence supports (free text).
    dimension: str = Field(default="")
    # Index of the referenced turn; -1 is the default when none is given.
    turn_index: int = Field(-1, ge=-1)
    # Quoted excerpt, at most 400 characters.
    snippet: str = Field("", max_length=400)
def _is_judge_list_placeholder_empty(s: str) -> bool:
"""LLM 有时输出单句占位(如 'None identified.')而非 JSON 数组,按空列表处理。"""
t = s.strip()
if not t:
return True
tl = t.lower().rstrip(".")
if tl in (
"none",
"none identified",
"n/a",
"na",
"-",
"nil",
"null",
"no issues",
"no issue",
"not applicable",
):
return True
tc = t.rstrip("")
if tc in ("", "暂无", "未发现", "没有"):
return True
return False
def _safe_int_bounds(value: Any, *, default: int, ge: int, le: int) -> int:
try:
v = int(value)
except (TypeError, ValueError):
return default
return max(ge, min(le, v))
def _coerce_judge_str_list(value: Any) -> list[Any]:
"""将评审 JSON 中的 list[str] 字段从 str / null 规范为列表(兼容 GLM-5 等输出的非数组形态)。"""
if value is None:
return []
if isinstance(value, list):
return value
if isinstance(value, str):
s = value.strip()
if _is_judge_list_placeholder_empty(s):
return []
return [s]
return []
class TurnJudgeOutput(BaseModel):
    """Quality scores for a single turn or a whole conversation.

    Emotion-weighted 100-point rubric with 16 leaf items in five groups
    (30 + 25 + 15 + 15 + 15). Leaf items are validated strictly against
    their caps. ``total_score`` and the five group scores are always
    recomputed from the leaf items after validation, so whatever the model
    wrote for them is discarded. Unknown keys in the input are ignored.
    """

    model_config = ConfigDict(extra="ignore")

    # 1. Emotional value and companionship (30 points)
    emotion_carry: float = Field(ge=0, le=10, description="情绪承接能力")
    empathy_depth: float = Field(ge=0, le=8, description="共情深度")
    emotion_safety: float = Field(ge=0, le=6, description="情绪安全感")
    emotion_guidance: float = Field(ge=0, le=6, description="情绪引导能力")

    # 2. Information gathering (25 points)
    fact_mining: float = Field(ge=0, le=8, description="关键事实挖掘")
    info_completeness_guide: float = Field(ge=0, le=8, description="信息完整性引导")
    info_depth_mining: float = Field(ge=0, le=9, description="信息深度挖掘")

    # 3. Persona modelling (15 points)
    persona_understanding: float = Field(ge=0, le=7, description="人物理解")
    persona_consistency_verify: float = Field(ge=0, le=4, description="人物一致性验证")
    persona_expression_guide: float = Field(ge=0, le=4, description="人物表达引导")

    # 4. Structured guidance (15 points)
    interview_structure: float = Field(ge=0, le=6, description="访谈结构")
    context_memory: float = Field(ge=0, le=5, description="上下文记忆")
    rhythm_control: float = Field(ge=0, le=4, description="节奏控制")

    # 5. Question quality (15 points)
    question_quality: float = Field(ge=0, le=7, description="问题质量")
    follow_up_depth: float = Field(ge=0, le=5, description="追问能力")
    non_leading: float = Field(ge=0, le=3, description="非引导性")

    total_score: float = Field(ge=0, le=100)
    rationale: str = ""
    major_strengths: list[str] = Field(default_factory=list)
    major_issues: list[str] = Field(default_factory=list)
    insufficient_evidence: list[str] = Field(default_factory=list)
    evidence_refs: list[JudgeEvidenceRef] = Field(default_factory=list)
    confidence: float = Field(default=0.75, ge=0.0, le=1.0)

    # Group-level aggregates kept for compatibility with historical JSON.
    # They are derived from the leaf items by the after-validator, so the
    # model may omit them.
    emotion_score: float = Field(default=0, ge=0, le=30)
    information_score: float = Field(default=0, ge=0, le=25)
    persona_score: float = Field(default=0, ge=0, le=15)
    structure_score: float = Field(default=0, ge=0, le=15)
    question_score: float = Field(default=0, ge=0, le=15)

    @model_validator(mode="before")
    @classmethod
    def _coerce_null_lists(cls, data: Any) -> Any:
        """Normalise list-typed fields that arrive as ``null`` or a bare string.

        Non-dict input is passed through untouched for pydantic to handle.
        """
        if not isinstance(data, dict):
            return data
        # Work on a shallow copy so the caller's payload is left unmodified.
        data = dict(data)
        for key in ("major_strengths", "major_issues", "insufficient_evidence"):
            data[key] = _coerce_judge_str_list(data.get(key))
        if data.get("evidence_refs") is None:
            data["evidence_refs"] = []
        return data

    @model_validator(mode="after")
    def _cap_meta_fields_and_sync_totals(self) -> Self:
        """Trim free-text lists and recompute every aggregate from leaf items."""

        def _cap_str_list(
            xs: list[str], *, max_items: int, max_chars: int
        ) -> list[str]:
            # Only the first max_items entries are considered; entries that
            # are blank after stripping are dropped, the rest truncated.
            out: list[str] = []
            for x in xs[:max_items]:
                s = str(x).strip()
                if s:
                    out.append(s[:max_chars])
            return out

        # object.__setattr__ stores the value directly on the instance,
        # bypassing BaseModel.__setattr__.
        # NOTE(review): presumably chosen to avoid assignment hooks -- confirm.
        for list_field, max_items in (
            ("major_strengths", 8),
            ("major_issues", 10),
            ("insufficient_evidence", 10),
        ):
            object.__setattr__(
                self,
                list_field,
                _cap_str_list(
                    getattr(self, list_field), max_items=max_items, max_chars=200
                ),
            )
        object.__setattr__(self, "evidence_refs", list(self.evidence_refs)[:12])

        emotion = (
            self.emotion_carry
            + self.empathy_depth
            + self.emotion_safety
            + self.emotion_guidance
        )
        information = (
            self.fact_mining + self.info_completeness_guide + self.info_depth_mining
        )
        persona = (
            self.persona_understanding
            + self.persona_consistency_verify
            + self.persona_expression_guide
        )
        structure = self.interview_structure + self.context_memory + self.rhythm_control
        question = self.question_quality + self.follow_up_depth + self.non_leading
        expected = emotion + information + persona + structure + question

        # Leaf items are the single source of truth: the LLM often reports a
        # total_score (e.g. 100) that disagrees with its own leaf scores.
        synced = max(0.0, min(100.0, round(float(expected), 2)))
        object.__setattr__(self, "total_score", synced)
        object.__setattr__(self, "emotion_score", emotion)
        object.__setattr__(self, "information_score", information)
        object.__setattr__(self, "persona_score", persona)
        object.__setattr__(self, "structure_score", structure)
        object.__setattr__(self, "question_score", question)
        return self
# A whole transcript is scored with the same leaf items as a single turn.
ConversationJudgeOutput = TurnJudgeOutput
# The judge LLM often emits leaf scores as floats slightly above the cap;
# clamp first and validate afterwards so the whole JSON is not discarded.
# Maps each memoir leaf field to its inclusive (min, max) rubric bounds;
# the maxima add up to 100.
_MEMOIR_LEAF_SCORE_BOUNDS: dict[str, tuple[float, float]] = {
    "mem_fidelity": (0, 9),
    "mem_factual_accuracy": (0, 5),
    "mem_factual_coverage": (0, 5),
    "mem_traceability": (0, 4),
    "info_slot_coverage": (0, 6),
    "info_sufficiency": (0, 4),
    "info_density": (0, 4),
    "narr_structure": (0, 6),
    "narr_paragraphs": (0, 5),
    "narr_pacing": (0, 3),
    "lang_fluency": (0, 3),
    "lang_conciseness": (0, 3),
    "lang_literary": (0, 4),
    "lang_controlled_expansion": (0, 4),
    "lang_detail": (0, 2),
    "lang_style": (0, 2),
    "emo_authenticity": (0, 5),
    "emo_depth": (0, 4),
    "char_understanding": (0, 4),
    "char_consistency": (0, 3),
    "char_integration": (0, 2),
    "coh_timeline": (0, 2),
    "coh_cross_chapter": (0, 2),
    "rich_analogy": (0, 3),
    "rich_diversity": (0, 2),
    "pub_editorial_cost": (0, 2),
    "pub_completeness": (0, 2),
}
class MemoirJudgeOutput(BaseModel):
    """Scores for a finished memoir draft (total 100; caps per the rubric).

    The product prioritises keeping the **text** (comparison notes,
    improvement suggestions), so leaf scores are allowed to be sloppy on
    input: validation is loosened to 0-100, then the after-validator clamps
    each leaf to its rubric cap and recomputes every aggregate. A bad score
    therefore never causes the whole JSON to be rejected. Unknown keys in
    the input are ignored.
    """

    model_config = ConfigDict(extra="ignore")

    # Leaf items: validated loosely (0-100); the real caps come from
    # _MEMOIR_LEAF_SCORE_BOUNDS and are enforced by the after-validator.
    mem_fidelity: float = Field(default=0, ge=0, le=100, description="记忆忠实度")
    mem_factual_accuracy: float = Field(
        default=0, ge=0, le=100, description="事实准确性"
    )
    mem_factual_coverage: float = Field(
        default=0, ge=0, le=100, description="事实覆盖率"
    )
    mem_traceability: float = Field(default=0, ge=0, le=100, description="记忆可追溯性")
    info_slot_coverage: float = Field(default=0, ge=0, le=100, description="槽位覆盖度")
    info_sufficiency: float = Field(default=0, ge=0, le=100, description="信息充分性")
    info_density: float = Field(default=0, ge=0, le=100, description="信息密度")
    narr_structure: float = Field(default=0, ge=0, le=100, description="故事结构")
    narr_paragraphs: float = Field(default=0, ge=0, le=100, description="段落组织")
    narr_pacing: float = Field(default=0, ge=0, le=100, description="节奏控制")
    lang_fluency: float = Field(default=0, ge=0, le=100, description="语言流畅度")
    lang_conciseness: float = Field(default=0, ge=0, le=100, description="表达精炼度")
    lang_literary: float = Field(default=0, ge=0, le=100, description="文笔质量")
    lang_controlled_expansion: float = Field(
        default=0, ge=0, le=100, description="控制性扩写能力"
    )
    lang_detail: float = Field(default=0, ge=0, le=100, description="细节还原与强化")
    lang_style: float = Field(default=0, ge=0, le=100, description="风格一致性")
    emo_authenticity: float = Field(default=0, ge=0, le=100, description="情感真实度")
    emo_depth: float = Field(default=0, ge=0, le=100, description="情感深度")
    char_understanding: float = Field(default=0, ge=0, le=100, description="人物理解")
    char_consistency: float = Field(default=0, ge=0, le=100, description="人物一致性")
    char_integration: float = Field(default=0, ge=0, le=100, description="人物融入度")
    coh_timeline: float = Field(default=0, ge=0, le=100, description="时间线一致性")
    coh_cross_chapter: float = Field(default=0, ge=0, le=100, description="跨章节关联")
    rich_analogy: float = Field(default=0, ge=0, le=100, description="类比与引用")
    rich_diversity: float = Field(default=0, ge=0, le=100, description="表达多样性")
    pub_editorial_cost: float = Field(default=0, ge=0, le=100, description="编辑成本")
    pub_completeness: float = Field(default=0, ge=0, le=100, description="完整度")

    total_score: float = Field(default=0, ge=0, le=100)
    rationale: str = ""
    major_strengths: list[str] = Field(default_factory=list)
    major_issues: list[str] = Field(default_factory=list)
    insufficient_evidence: list[str] = Field(default_factory=list)
    evidence_refs: list[JudgeEvidenceRef] = Field(default_factory=list)
    confidence: float = Field(default=0.75, ge=0.0, le=1.0)

    # Group-level aggregates; always overwritten by the after-validator.
    authenticity_score: float = Field(default=0, ge=0, le=100)
    information_score: float = Field(default=0, ge=0, le=100)
    narrative_score: float = Field(default=0, ge=0, le=100)
    language_score: float = Field(default=0, ge=0, le=100)
    emotion_score: float = Field(default=0, ge=0, le=100)
    character_score: float = Field(default=0, ge=0, le=100)
    coherence_score: float = Field(default=0, ge=0, le=100)
    richness_score: float = Field(default=0, ge=0, le=100)
    publish_ready_score: float = Field(default=0, ge=0, le=100)

    @model_validator(mode="before")
    @classmethod
    def _coerce_memoir_judge_input(cls, data: Any) -> Any:
        """Sanitise raw judge JSON so that field validation cannot fail on it.

        Non-dict input is passed through untouched for pydantic to handle.
        """
        if not isinstance(data, dict):
            return data
        # Work on a shallow copy: keys are rewritten and "confidence" may be
        # deleted below, none of which should leak into the caller's payload.
        data = dict(data)

        def _text(v: Any) -> str:
            # null -> "" (a plain str(None) would yield the text "None").
            return "" if v is None else str(v)

        def _loose_score(v: Any) -> float:
            # Anything unparseable or non-finite counts as 0; the rest is
            # clamped into the loose 0-100 validation window.
            if v is None:
                return 0.0
            try:
                x = float(v)
            except (TypeError, ValueError, OverflowError):
                # OverflowError: an integer too large to fit in a float.
                return 0.0
            if x != x or x in (float("inf"), float("-inf")):
                return 0.0
            return max(0.0, min(100.0, x))

        data["rationale"] = _text(data.get("rationale"))
        for key in ("major_strengths", "major_issues", "insufficient_evidence"):
            data[key] = _coerce_judge_str_list(data.get(key))

        raw_refs = data.get("evidence_refs")
        if not isinstance(raw_refs, list):
            data["evidence_refs"] = []
        else:
            # Keep only dict items, reduced to the three known keys and
            # pre-truncated so JudgeEvidenceRef validation cannot reject them.
            data["evidence_refs"] = [
                {
                    "dimension": _text(item.get("dimension"))[:200],
                    "turn_index": _safe_int_bounds(
                        item.get("turn_index"), default=-1, ge=-1, le=500_000
                    ),
                    "snippet": _text(item.get("snippet"))[:400],
                }
                for item in raw_refs
                if isinstance(item, dict)
            ]

        # Leaf scores are always materialised (missing -> 0.0).
        for fname in _MEMOIR_LEAF_SCORE_BOUNDS:
            data[fname] = _loose_score(data.get(fname))

        # Aggregates are sanitised only when supplied; absent ones fall back
        # to their field defaults.
        _agg_keys = (
            "authenticity_score",
            "information_score",
            "narrative_score",
            "language_score",
            "emotion_score",
            "character_score",
            "coherence_score",
            "richness_score",
            "publish_ready_score",
            "total_score",
        )
        for fname in _agg_keys:
            if data.get(fname) is None:
                continue
            data[fname] = _loose_score(data[fname])

        if data.get("confidence") is not None:
            try:
                c = float(data["confidence"])
            except (TypeError, ValueError, OverflowError):
                c = float("nan")
            if c != c:
                # Unparseable or NaN: drop the key so the field default applies.
                del data["confidence"]
            else:
                data["confidence"] = max(0.0, min(1.0, c))
        return data

    @model_validator(mode="after")
    def _cap_meta_fields_and_sync_totals(self) -> Self:
        """Trim free-text lists, clamp leaf scores, and recompute aggregates."""

        def _cap_str_list(
            xs: list[str], *, max_items: int, max_chars: int
        ) -> list[str]:
            # Only the first max_items entries are considered; entries that
            # are blank after stripping are dropped, the rest truncated.
            out: list[str] = []
            for x in xs[:max_items]:
                s = str(x).strip()
                if s:
                    out.append(s[:max_chars])
            return out

        # object.__setattr__ stores the value directly on the instance,
        # bypassing BaseModel.__setattr__.
        # NOTE(review): presumably chosen to avoid assignment hooks -- confirm.
        for list_field, max_items in (
            ("major_strengths", 8),
            ("major_issues", 10),
            ("insufficient_evidence", 12),
        ):
            object.__setattr__(
                self,
                list_field,
                _cap_str_list(
                    getattr(self, list_field), max_items=max_items, max_chars=200
                ),
            )
        object.__setattr__(self, "evidence_refs", list(self.evidence_refs)[:12])

        for fname, (lo, hi) in _MEMOIR_LEAF_SCORE_BOUNDS.items():
            try:
                raw = float(getattr(self, fname))
            except (TypeError, ValueError):
                raw = 0.0
            # The bounds are ints, so max/min may hand back an int; wrap in
            # float() to keep the stored value matching the declared type.
            object.__setattr__(self, fname, float(max(lo, min(hi, raw))))

        authenticity = (
            self.mem_fidelity
            + self.mem_factual_accuracy
            + self.mem_factual_coverage
            + self.mem_traceability
        )
        information = (
            self.info_slot_coverage + self.info_sufficiency + self.info_density
        )
        narrative = self.narr_structure + self.narr_paragraphs + self.narr_pacing
        language = (
            self.lang_fluency
            + self.lang_conciseness
            + self.lang_literary
            + self.lang_controlled_expansion
            + self.lang_detail
            + self.lang_style
        )
        emotion = self.emo_authenticity + self.emo_depth
        character = (
            self.char_understanding + self.char_consistency + self.char_integration
        )
        coherence = self.coh_timeline + self.coh_cross_chapter
        richness = self.rich_analogy + self.rich_diversity
        publish = self.pub_editorial_cost + self.pub_completeness
        expected = (
            authenticity
            + information
            + narrative
            + language
            + emotion
            + character
            + coherence
            + richness
            + publish
        )

        # The clamped leaf items are the single source of truth for totals.
        synced = max(0.0, min(100.0, round(float(expected), 2)))
        object.__setattr__(self, "total_score", synced)
        object.__setattr__(self, "authenticity_score", authenticity)
        object.__setattr__(self, "information_score", information)
        object.__setattr__(self, "narrative_score", narrative)
        object.__setattr__(self, "language_score", language)
        object.__setattr__(self, "emotion_score", emotion)
        object.__setattr__(self, "character_score", character)
        object.__setattr__(self, "coherence_score", coherence)
        object.__setattr__(self, "richness_score", richness)
        object.__setattr__(self, "publish_ready_score", publish)
        return self