"""评审 LLM 结构化输出(json_object)。 成稿(回忆录)子项上限已自洽为 **总分 100**(由原 110 分表等比例收紧整数档,见附件 rubric)。 """ from __future__ import annotations from typing import Any, Self from pydantic import BaseModel, ConfigDict, Field, model_validator class JudgeEvidenceRef(BaseModel): """评审引用:便于人工复核(对话 Turn 或成稿片段定位)。""" model_config = ConfigDict(extra="ignore") dimension: str = "" turn_index: int = Field(default=-1, ge=-1) snippet: str = Field(default="", max_length=400) def _is_judge_list_placeholder_empty(s: str) -> bool: """LLM 有时输出单句占位(如 'None identified.')而非 JSON 数组,按空列表处理。""" t = s.strip() if not t: return True tl = t.lower().rstrip(".") if tl in ( "none", "none identified", "n/a", "na", "-", "nil", "null", "no issues", "no issue", "not applicable", ): return True tc = t.rstrip("。") if tc in ("无", "暂无", "未发现", "没有"): return True return False def _safe_int_bounds(value: Any, *, default: int, ge: int, le: int) -> int: try: v = int(value) except (TypeError, ValueError): return default return max(ge, min(le, v)) def _coerce_judge_str_list(value: Any) -> list[Any]: """将评审 JSON 中的 list[str] 字段从 str / null 规范为列表(兼容 GLM-5 等输出的非数组形态)。""" if value is None: return [] if isinstance(value, list): return value if isinstance(value, str): s = value.strip() if _is_judge_list_placeholder_empty(s): return [] return [s] return [] class TurnJudgeOutput(BaseModel): """单轮 / 整段对话质量(情绪强化版 100 分,15 个细项)。""" model_config = ConfigDict(extra="ignore") # 一、情绪价值与陪伴感(30) emotion_carry: float = Field(ge=0, le=10, description="情绪承接能力") empathy_depth: float = Field(ge=0, le=8, description="共情深度") emotion_safety: float = Field(ge=0, le=6, description="情绪安全感") emotion_guidance: float = Field(ge=0, le=6, description="情绪引导能力") # 二、信息获取能力(25) fact_mining: float = Field(ge=0, le=8, description="关键事实挖掘") info_completeness_guide: float = Field(ge=0, le=8, description="信息完整性引导") info_depth_mining: float = Field(ge=0, le=9, description="信息深度挖掘") # 三、人物建模能力(15) persona_understanding: float = Field(ge=0, le=7, description="人物理解") persona_consistency_verify: float = Field(ge=0, le=4, description="人物一致性验证") persona_expression_guide: float = Field(ge=0, le=4, description="人物表达引导") # 四、结构化引导(15) interview_structure: float = Field(ge=0, le=6, description="访谈结构") context_memory: float = Field(ge=0, le=5, description="上下文记忆") rhythm_control: float = Field(ge=0, le=4, description="节奏控制") # 五、提问质量(15) question_quality: float = Field(ge=0, le=7, description="问题质量") follow_up_depth: float = Field(ge=0, le=5, description="追问能力") non_leading: float = Field(ge=0, le=3, description="非引导性") total_score: float = Field(ge=0, le=100) rationale: str = "" major_strengths: list[str] = Field(default_factory=list) major_issues: list[str] = Field(default_factory=list) insufficient_evidence: list[str] = Field(default_factory=list) evidence_refs: list[JudgeEvidenceRef] = Field(default_factory=list) confidence: float = Field(default=0.75, ge=0.0, le=1.0) # 与历史 JSON 对齐的一级聚合分(由细项派生,可缺省由模型填写) emotion_score: float = Field(default=0, ge=0, le=30) information_score: float = Field(default=0, ge=0, le=25) persona_score: float = Field(default=0, ge=0, le=15) structure_score: float = Field(default=0, ge=0, le=15) question_score: float = Field(default=0, ge=0, le=15) @model_validator(mode="before") @classmethod def _coerce_null_lists(cls, data: Any) -> Any: if isinstance(data, dict): for key in ("major_strengths", "major_issues", "insufficient_evidence"): data[key] = _coerce_judge_str_list(data.get(key)) if data.get("evidence_refs") is None: data["evidence_refs"] = [] return data @model_validator(mode="after") def _cap_meta_fields_and_sync_totals(self) -> Self: def _cap_str_list( xs: list[str], *, max_items: int, max_chars: int ) -> list[str]: out: list[str] = [] for x in xs[:max_items]: s = str(x).strip() if s: out.append(s[:max_chars]) return out object.__setattr__( self, "major_strengths", _cap_str_list(self.major_strengths, max_items=8, max_chars=200), ) object.__setattr__( self, "major_issues", _cap_str_list(self.major_issues, max_items=10, max_chars=200), ) object.__setattr__( self, "insufficient_evidence", _cap_str_list(self.insufficient_evidence, max_items=10, max_chars=200), ) refs = list(self.evidence_refs)[:12] object.__setattr__(self, "evidence_refs", refs) emotion = ( self.emotion_carry + self.empathy_depth + self.emotion_safety + self.emotion_guidance ) information = ( self.fact_mining + self.info_completeness_guide + self.info_depth_mining ) persona = ( self.persona_understanding + self.persona_consistency_verify + self.persona_expression_guide ) structure = self.interview_structure + self.context_memory + self.rhythm_control question = self.question_quality + self.follow_up_depth + self.non_leading expected = emotion + information + persona + structure + question # 细项为唯一事实来源:LLM 常把 total_score 写成 100 与前四项打满但情绪块少 1 分等情况不一致 synced = max(0.0, min(100.0, round(float(expected), 2))) object.__setattr__(self, "total_score", synced) object.__setattr__(self, "emotion_score", emotion) object.__setattr__(self, "information_score", information) object.__setattr__(self, "persona_score", persona) object.__setattr__(self, "structure_score", structure) object.__setattr__(self, "question_score", question) return self # 整条 transcript 与单轮使用同一套细项 ConversationJudgeOutput = TurnJudgeOutput # 评审 LLM 常把细项打成「略超满分」的浮点;先钳制再校验,避免整 JSON 丢弃。 _MEMOIR_LEAF_SCORE_BOUNDS: dict[str, tuple[float, float]] = { "mem_fidelity": (0, 9), "mem_factual_accuracy": (0, 5), "mem_factual_coverage": (0, 5), "mem_traceability": (0, 4), "info_slot_coverage": (0, 6), "info_sufficiency": (0, 4), "info_density": (0, 4), "narr_structure": (0, 6), "narr_paragraphs": (0, 5), "narr_pacing": (0, 3), "lang_fluency": (0, 3), "lang_conciseness": (0, 3), "lang_literary": (0, 4), "lang_controlled_expansion": (0, 4), "lang_detail": (0, 2), "lang_style": (0, 2), "emo_authenticity": (0, 5), "emo_depth": (0, 4), "char_understanding": (0, 4), "char_consistency": (0, 3), "char_integration": (0, 2), "coh_timeline": (0, 2), "coh_cross_chapter": (0, 2), "rich_analogy": (0, 3), "rich_diversity": (0, 2), "pub_editorial_cost": (0, 2), "pub_completeness": (0, 2), } class MemoirJudgeOutput(BaseModel): """成稿回忆录评分(总分 100,子项上限见 rubric)。 产品优先保留 **文字**(对照说明、改进建议):细项分值允许模型乱写,入模时先放宽到 ``0–100``,再在 ``mode=\"after\"`` 中按 rubric 上限钳制并重算 total,避免因分数校验丢整段 JSON。 """ model_config = ConfigDict(extra="ignore") # 细项:校验放宽到 0–100;真实满分仍以 rubric 为准,由 after 钳制 mem_fidelity: float = Field(default=0, ge=0, le=100, description="记忆忠实度") mem_factual_accuracy: float = Field( default=0, ge=0, le=100, description="事实准确性" ) mem_factual_coverage: float = Field( default=0, ge=0, le=100, description="事实覆盖率" ) mem_traceability: float = Field(default=0, ge=0, le=100, description="记忆可追溯性") info_slot_coverage: float = Field(default=0, ge=0, le=100, description="槽位覆盖度") info_sufficiency: float = Field(default=0, ge=0, le=100, description="信息充分性") info_density: float = Field(default=0, ge=0, le=100, description="信息密度") narr_structure: float = Field(default=0, ge=0, le=100, description="故事结构") narr_paragraphs: float = Field(default=0, ge=0, le=100, description="段落组织") narr_pacing: float = Field(default=0, ge=0, le=100, description="节奏控制") lang_fluency: float = Field(default=0, ge=0, le=100, description="语言流畅度") lang_conciseness: float = Field(default=0, ge=0, le=100, description="表达精炼度") lang_literary: float = Field(default=0, ge=0, le=100, description="文笔质量") lang_controlled_expansion: float = Field( default=0, ge=0, le=100, description="控制性扩写能力" ) lang_detail: float = Field(default=0, ge=0, le=100, description="细节还原与强化") lang_style: float = Field(default=0, ge=0, le=100, description="风格一致性") emo_authenticity: float = Field(default=0, ge=0, le=100, description="情感真实度") emo_depth: float = Field(default=0, ge=0, le=100, description="情感深度") char_understanding: float = Field(default=0, ge=0, le=100, description="人物理解") char_consistency: float = Field(default=0, ge=0, le=100, description="人物一致性") char_integration: float = Field(default=0, ge=0, le=100, description="人物融入度") coh_timeline: float = Field(default=0, ge=0, le=100, description="时间线一致性") coh_cross_chapter: float = Field(default=0, ge=0, le=100, description="跨章节关联") rich_analogy: float = Field(default=0, ge=0, le=100, description="类比与引用") rich_diversity: float = Field(default=0, ge=0, le=100, description="表达多样性") pub_editorial_cost: float = Field(default=0, ge=0, le=100, description="编辑成本") pub_completeness: float = Field(default=0, ge=0, le=100, description="完整度") total_score: float = Field(default=0, ge=0, le=100) rationale: str = "" major_strengths: list[str] = Field(default_factory=list) major_issues: list[str] = Field(default_factory=list) insufficient_evidence: list[str] = Field(default_factory=list) evidence_refs: list[JudgeEvidenceRef] = Field(default_factory=list) confidence: float = Field(default=0.75, ge=0.0, le=1.0) authenticity_score: float = Field(default=0, ge=0, le=100) information_score: float = Field(default=0, ge=0, le=100) narrative_score: float = Field(default=0, ge=0, le=100) language_score: float = Field(default=0, ge=0, le=100) emotion_score: float = Field(default=0, ge=0, le=100) character_score: float = Field(default=0, ge=0, le=100) coherence_score: float = Field(default=0, ge=0, le=100) richness_score: float = Field(default=0, ge=0, le=100) publish_ready_score: float = Field(default=0, ge=0, le=100) @model_validator(mode="before") @classmethod def _coerce_memoir_judge_input(cls, data: Any) -> Any: if not isinstance(data, dict): return data data["rationale"] = ( "" if data.get("rationale") is None else str(data["rationale"]) ) for key in ("major_strengths", "major_issues", "insufficient_evidence"): data[key] = _coerce_judge_str_list(data.get(key)) raw_refs = data.get("evidence_refs") if not isinstance(raw_refs, list): data["evidence_refs"] = [] else: clean: list[dict[str, Any]] = [] for item in raw_refs: if not isinstance(item, dict): continue clean.append( { "dimension": str(item.get("dimension", ""))[:200], "turn_index": _safe_int_bounds( item.get("turn_index"), default=-1, ge=-1, le=500_000 ), "snippet": str(item.get("snippet", ""))[:400], } ) data["evidence_refs"] = clean def _loose_score(v: Any) -> float: if v is None: return 0.0 try: x = float(v) except (TypeError, ValueError): return 0.0 if x != x or x in (float("inf"), float("-inf")): return 0.0 return max(0.0, min(100.0, x)) for fname in _MEMOIR_LEAF_SCORE_BOUNDS: data[fname] = _loose_score(data.get(fname)) _agg_keys = ( "authenticity_score", "information_score", "narrative_score", "language_score", "emotion_score", "character_score", "coherence_score", "richness_score", "publish_ready_score", "total_score", ) for fname in _agg_keys: if fname not in data or data[fname] is None: continue data[fname] = _loose_score(data[fname]) if "confidence" in data and data["confidence"] is not None: try: c = float(data["confidence"]) if c != c: raise ValueError data["confidence"] = max(0.0, min(1.0, c)) except (TypeError, ValueError): del data["confidence"] return data @model_validator(mode="after") def _cap_meta_fields_and_sync_totals(self) -> Self: def _cap_str_list( xs: list[str], *, max_items: int, max_chars: int ) -> list[str]: out: list[str] = [] for x in xs[:max_items]: s = str(x).strip() if s: out.append(s[:max_chars]) return out object.__setattr__( self, "major_strengths", _cap_str_list(self.major_strengths, max_items=8, max_chars=200), ) object.__setattr__( self, "major_issues", _cap_str_list(self.major_issues, max_items=10, max_chars=200), ) object.__setattr__( self, "insufficient_evidence", _cap_str_list(self.insufficient_evidence, max_items=12, max_chars=200), ) refs = list(self.evidence_refs)[:12] object.__setattr__(self, "evidence_refs", refs) for fname, (lo, hi) in _MEMOIR_LEAF_SCORE_BOUNDS.items(): try: raw = float(getattr(self, fname)) except (TypeError, ValueError): raw = 0.0 object.__setattr__(self, fname, max(lo, min(hi, raw))) authenticity = ( self.mem_fidelity + self.mem_factual_accuracy + self.mem_factual_coverage + self.mem_traceability ) information = ( self.info_slot_coverage + self.info_sufficiency + self.info_density ) narrative = self.narr_structure + self.narr_paragraphs + self.narr_pacing language = ( self.lang_fluency + self.lang_conciseness + self.lang_literary + self.lang_controlled_expansion + self.lang_detail + self.lang_style ) emotion = self.emo_authenticity + self.emo_depth character = ( self.char_understanding + self.char_consistency + self.char_integration ) coherence = self.coh_timeline + self.coh_cross_chapter richness = self.rich_analogy + self.rich_diversity publish = self.pub_editorial_cost + self.pub_completeness expected = ( authenticity + information + narrative + language + emotion + character + coherence + richness + publish ) synced = max(0.0, min(100.0, round(float(expected), 2))) object.__setattr__(self, "total_score", synced) object.__setattr__(self, "authenticity_score", authenticity) object.__setattr__(self, "information_score", information) object.__setattr__(self, "narrative_score", narrative) object.__setattr__(self, "language_score", language) object.__setattr__(self, "emotion_score", emotion) object.__setattr__(self, "character_score", character) object.__setattr__(self, "coherence_score", coherence) object.__setattr__(self, "richness_score", richness) object.__setattr__(self, "publish_ready_score", publish) return self