2026-04-07 10:34:59 +08:00
|
|
|
|
"""评审 LLM 结构化输出(json_object)。
|
|
|
|
|
|
|
|
|
|
|
|
成稿(回忆录)子项上限已自洽为 **总分 100**(由原 110 分表等比例收紧整数档,见附件 rubric)。
|
|
|
|
|
|
"""
|
2026-04-03 14:44:46 +08:00
|
|
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
2026-04-07 17:15:01 +08:00
|
|
|
|
from typing import Any, Self
|
2026-04-07 10:34:59 +08:00
|
|
|
|
|
2026-04-07 17:15:01 +08:00
|
|
|
|
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class JudgeEvidenceRef(BaseModel):
    """A citation attached to a judgement so a human can re-check it.

    It locates either a dialogue turn or a fragment of the finished draft.
    Keys that are not declared below are ignored on input.
    """

    model_config = ConfigDict(extra="ignore")

    # Presumably the rubric dimension this citation supports (free text).
    dimension: str = Field(default="")
    # Turn position; -1 is both the default and the lowest accepted value.
    turn_index: int = Field(-1, ge=-1)
    # Quoted excerpt; inputs longer than 400 characters fail validation.
    snippet: str = Field("", max_length=400)
|
2026-04-03 14:44:46 +08:00
|
|
|
|
|
|
|
|
|
|
|
2026-04-08 09:38:07 +08:00
|
|
|
|
def _is_judge_list_placeholder_empty(s: str) -> bool:
|
|
|
|
|
|
"""LLM 有时输出单句占位(如 'None identified.')而非 JSON 数组,按空列表处理。"""
|
|
|
|
|
|
t = s.strip()
|
|
|
|
|
|
if not t:
|
|
|
|
|
|
return True
|
|
|
|
|
|
tl = t.lower().rstrip(".")
|
|
|
|
|
|
if tl in (
|
|
|
|
|
|
"none",
|
|
|
|
|
|
"none identified",
|
|
|
|
|
|
"n/a",
|
|
|
|
|
|
"na",
|
|
|
|
|
|
"-",
|
|
|
|
|
|
"nil",
|
|
|
|
|
|
"null",
|
|
|
|
|
|
"no issues",
|
|
|
|
|
|
"no issue",
|
|
|
|
|
|
"not applicable",
|
|
|
|
|
|
):
|
|
|
|
|
|
return True
|
|
|
|
|
|
tc = t.rstrip("。")
|
|
|
|
|
|
if tc in ("无", "暂无", "未发现", "没有"):
|
|
|
|
|
|
return True
|
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
feat(eval): memoir A/B chapter judging and eval-web parity with dialogue
- Judge baseline excerpt and library chapter separately; build_memoir_compare_summary for gate, nine-dim and leaf deltas.
- Memoir SSE chapter payload: baseline_judge, compare_summary, baseline_judge_error.
- MemoirJudgeOutput: loose score coercion and post-validate clamp; memoir judge prompt caps from settings.
- app-eval-web: two-column MemoirScoreCard layout, MemoirCompareSummary, chapter blocks and CSS.
- Add memoir_compare_summary, log_events, celery_log_context, memoir_pipeline_progress; tests and migration 0014.
- Misc: memory/evidence and enrichment paths, task/orchestrator updates, internal-eval docs, env examples.
2026-04-10 10:23:43 +08:00
|
|
|
|
def _safe_int_bounds(value: Any, *, default: int, ge: int, le: int) -> int:
|
|
|
|
|
|
try:
|
|
|
|
|
|
v = int(value)
|
|
|
|
|
|
except (TypeError, ValueError):
|
|
|
|
|
|
return default
|
|
|
|
|
|
return max(ge, min(le, v))
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-08 09:38:07 +08:00
|
|
|
|
def _coerce_judge_str_list(value: Any) -> list[Any]:
|
|
|
|
|
|
"""将评审 JSON 中的 list[str] 字段从 str / null 规范为列表(兼容 GLM-5 等输出的非数组形态)。"""
|
|
|
|
|
|
if value is None:
|
|
|
|
|
|
return []
|
|
|
|
|
|
if isinstance(value, list):
|
|
|
|
|
|
return value
|
|
|
|
|
|
if isinstance(value, str):
|
|
|
|
|
|
s = value.strip()
|
|
|
|
|
|
if _is_judge_list_placeholder_empty(s):
|
|
|
|
|
|
return []
|
|
|
|
|
|
return [s]
|
|
|
|
|
|
return []
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-03 14:44:46 +08:00
|
|
|
|
class TurnJudgeOutput(BaseModel):
    """Quality review of one turn or a whole dialogue (100 points, 15 leaf items).

    The leaf scores are the single source of truth: once validation has
    succeeded, the five section subtotals and ``total_score`` are recomputed
    from the leaves and overwrite whatever the judge model supplied.
    Keys that are not declared below are ignored on input.
    """

    model_config = ConfigDict(extra="ignore")

    # 1. Emotional value and companionship (30)
    emotion_carry: float = Field(ge=0, le=10, description="情绪承接能力")
    empathy_depth: float = Field(ge=0, le=8, description="共情深度")
    emotion_safety: float = Field(ge=0, le=6, description="情绪安全感")
    emotion_guidance: float = Field(ge=0, le=6, description="情绪引导能力")

    # 2. Information gathering (25)
    fact_mining: float = Field(ge=0, le=8, description="关键事实挖掘")
    info_completeness_guide: float = Field(ge=0, le=8, description="信息完整性引导")
    info_depth_mining: float = Field(ge=0, le=9, description="信息深度挖掘")

    # 3. Persona modelling (15)
    persona_understanding: float = Field(ge=0, le=7, description="人物理解")
    persona_consistency_verify: float = Field(ge=0, le=4, description="人物一致性验证")
    persona_expression_guide: float = Field(ge=0, le=4, description="人物表达引导")

    # 4. Structured guidance (15)
    interview_structure: float = Field(ge=0, le=6, description="访谈结构")
    context_memory: float = Field(ge=0, le=5, description="上下文记忆")
    rhythm_control: float = Field(ge=0, le=4, description="节奏控制")

    # 5. Question quality (15)
    question_quality: float = Field(ge=0, le=7, description="问题质量")
    follow_up_depth: float = Field(ge=0, le=5, description="追问能力")
    non_leading: float = Field(ge=0, le=3, description="非引导性")

    total_score: float = Field(ge=0, le=100)
    rationale: str = ""

    major_strengths: list[str] = Field(default_factory=list)
    major_issues: list[str] = Field(default_factory=list)
    insufficient_evidence: list[str] = Field(default_factory=list)
    evidence_refs: list[JudgeEvidenceRef] = Field(default_factory=list)
    confidence: float = Field(default=0.75, ge=0.0, le=1.0)

    # Section subtotals kept for compatibility with historical JSON. They are
    # derived from the leaves, so the judge model may omit them.
    emotion_score: float = Field(default=0, ge=0, le=30)
    information_score: float = Field(default=0, ge=0, le=25)
    persona_score: float = Field(default=0, ge=0, le=15)
    structure_score: float = Field(default=0, ge=0, le=15)
    question_score: float = Field(default=0, ge=0, le=15)

    @model_validator(mode="before")
    @classmethod
    def _coerce_null_lists(cls, data: Any) -> Any:
        """Normalize list fields that arrived as ``null`` or as a bare string.

        Non-dict input is returned untouched. Dict input is shallow-copied
        first, so the caller's payload is never modified.
        """
        if not isinstance(data, dict):
            return data
        data = dict(data)
        for key in ("major_strengths", "major_issues", "insufficient_evidence"):
            data[key] = _coerce_judge_str_list(data.get(key))
        refs = data.get("evidence_refs")
        # A bare string (e.g. a "None identified." placeholder) can never be
        # parsed as a list of references; treat it like null.
        if refs is None or isinstance(refs, str):
            data["evidence_refs"] = []
        return data

    @model_validator(mode="after")
    def _cap_meta_fields_and_sync_totals(self) -> Self:
        """Trim the free-text lists and recompute all totals from the leaves."""

        def _cap_str_list(
            xs: list[str], *, max_items: int, max_chars: int
        ) -> list[str]:
            # Blank entries are dropped *before* the item cap is applied, so
            # they do not use up slots that later non-blank entries could fill.
            stripped = (str(x).strip() for x in xs)
            return [s[:max_chars] for s in stripped if s][:max_items]

        # object.__setattr__ stores values directly on the instance instead of
        # going through BaseModel.__setattr__.
        list_caps = (
            ("major_strengths", 8),
            ("major_issues", 10),
            ("insufficient_evidence", 10),
        )
        for name, max_items in list_caps:
            capped = _cap_str_list(
                getattr(self, name), max_items=max_items, max_chars=200
            )
            object.__setattr__(self, name, capped)
        object.__setattr__(self, "evidence_refs", list(self.evidence_refs)[:12])

        sections = {
            "emotion_score": (
                "emotion_carry",
                "empathy_depth",
                "emotion_safety",
                "emotion_guidance",
            ),
            "information_score": (
                "fact_mining",
                "info_completeness_guide",
                "info_depth_mining",
            ),
            "persona_score": (
                "persona_understanding",
                "persona_consistency_verify",
                "persona_expression_guide",
            ),
            "structure_score": (
                "interview_structure",
                "context_memory",
                "rhythm_control",
            ),
            "question_score": (
                "question_quality",
                "follow_up_depth",
                "non_leading",
            ),
        }

        # The leaves are the single source of truth: LLMs often report a
        # total_score that disagrees with them (for instance 100 while the
        # emotion block is one point short), so the total is always rebuilt.
        expected = 0.0
        for aggregate, leaves in sections.items():
            subtotal = sum(getattr(self, leaf) for leaf in leaves)
            object.__setattr__(self, aggregate, subtotal)
            expected += subtotal

        synced = max(0.0, min(100.0, round(float(expected), 2)))
        object.__setattr__(self, "total_score", synced)
        return self
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# A whole-transcript review uses the same set of leaf items as a single-turn
# review, so the conversation-level name is an alias of the turn-level model.
ConversationJudgeOutput = TurnJudgeOutput
|
|
|
|
|
|
|
2026-04-03 14:44:46 +08:00
|
|
|
|
|
feat(eval): memoir A/B chapter judging and eval-web parity with dialogue
- Judge baseline excerpt and library chapter separately; build_memoir_compare_summary for gate, nine-dim and leaf deltas.
- Memoir SSE chapter payload: baseline_judge, compare_summary, baseline_judge_error.
- MemoirJudgeOutput: loose score coercion and post-validate clamp; memoir judge prompt caps from settings.
- app-eval-web: two-column MemoirScoreCard layout, MemoirCompareSummary, chapter blocks and CSS.
- Add memoir_compare_summary, log_events, celery_log_context, memoir_pipeline_progress; tests and migration 0014.
- Misc: memory/evidence and enrichment paths, task/orchestrator updates, internal-eval docs, env examples.
2026-04-10 10:23:43 +08:00
|
|
|
|
# 评审 LLM 常把细项打成「略超满分」的浮点;先钳制再校验,避免整 JSON 丢弃。
|
|
|
|
|
|
_MEMOIR_LEAF_SCORE_BOUNDS: dict[str, tuple[float, float]] = {
|
|
|
|
|
|
"mem_fidelity": (0, 9),
|
|
|
|
|
|
"mem_factual_accuracy": (0, 5),
|
|
|
|
|
|
"mem_factual_coverage": (0, 5),
|
|
|
|
|
|
"mem_traceability": (0, 4),
|
|
|
|
|
|
"info_slot_coverage": (0, 6),
|
|
|
|
|
|
"info_sufficiency": (0, 4),
|
|
|
|
|
|
"info_density": (0, 4),
|
|
|
|
|
|
"narr_structure": (0, 6),
|
|
|
|
|
|
"narr_paragraphs": (0, 5),
|
|
|
|
|
|
"narr_pacing": (0, 3),
|
|
|
|
|
|
"lang_fluency": (0, 3),
|
|
|
|
|
|
"lang_conciseness": (0, 3),
|
|
|
|
|
|
"lang_literary": (0, 4),
|
|
|
|
|
|
"lang_controlled_expansion": (0, 4),
|
|
|
|
|
|
"lang_detail": (0, 2),
|
|
|
|
|
|
"lang_style": (0, 2),
|
|
|
|
|
|
"emo_authenticity": (0, 5),
|
|
|
|
|
|
"emo_depth": (0, 4),
|
|
|
|
|
|
"char_understanding": (0, 4),
|
|
|
|
|
|
"char_consistency": (0, 3),
|
|
|
|
|
|
"char_integration": (0, 2),
|
|
|
|
|
|
"coh_timeline": (0, 2),
|
|
|
|
|
|
"coh_cross_chapter": (0, 2),
|
|
|
|
|
|
"rich_analogy": (0, 3),
|
|
|
|
|
|
"rich_diversity": (0, 2),
|
|
|
|
|
|
"pub_editorial_cost": (0, 2),
|
|
|
|
|
|
"pub_completeness": (0, 2),
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-03 14:44:46 +08:00
|
|
|
|
class MemoirJudgeOutput(BaseModel):
    """Score card for a finished memoir draft (100 points in total).

    The product cares most about keeping the judge's *text* (comparison notes
    and improvement suggestions), so numeric fields are parsed leniently:

    * before validation every score is coerced to a float in ``0-100``
      (anything unparsable becomes ``0``);
    * after validation each leaf is clamped to its rubric cap from
      ``_MEMOIR_LEAF_SCORE_BOUNDS`` and the nine section subtotals and
      ``total_score`` are recomputed from the leaves.

    This keeps a sloppy score from discarding the whole JSON payload.
    Keys that are not declared below are ignored on input.
    """

    model_config = ConfigDict(extra="ignore")

    # Leaf items: validation is relaxed to 0-100; the real caps come from the
    # rubric and are enforced by the "after" validator.
    mem_fidelity: float = Field(default=0, ge=0, le=100, description="记忆忠实度")
    mem_factual_accuracy: float = Field(
        default=0, ge=0, le=100, description="事实准确性"
    )
    mem_factual_coverage: float = Field(
        default=0, ge=0, le=100, description="事实覆盖率"
    )
    mem_traceability: float = Field(default=0, ge=0, le=100, description="记忆可追溯性")

    info_slot_coverage: float = Field(default=0, ge=0, le=100, description="槽位覆盖度")
    info_sufficiency: float = Field(default=0, ge=0, le=100, description="信息充分性")
    info_density: float = Field(default=0, ge=0, le=100, description="信息密度")

    narr_structure: float = Field(default=0, ge=0, le=100, description="故事结构")
    narr_paragraphs: float = Field(default=0, ge=0, le=100, description="段落组织")
    narr_pacing: float = Field(default=0, ge=0, le=100, description="节奏控制")

    lang_fluency: float = Field(default=0, ge=0, le=100, description="语言流畅度")
    lang_conciseness: float = Field(default=0, ge=0, le=100, description="表达精炼度")
    lang_literary: float = Field(default=0, ge=0, le=100, description="文笔质量")
    lang_controlled_expansion: float = Field(
        default=0, ge=0, le=100, description="控制性扩写能力"
    )
    lang_detail: float = Field(default=0, ge=0, le=100, description="细节还原与强化")
    lang_style: float = Field(default=0, ge=0, le=100, description="风格一致性")

    emo_authenticity: float = Field(default=0, ge=0, le=100, description="情感真实度")
    emo_depth: float = Field(default=0, ge=0, le=100, description="情感深度")

    char_understanding: float = Field(default=0, ge=0, le=100, description="人物理解")
    char_consistency: float = Field(default=0, ge=0, le=100, description="人物一致性")
    char_integration: float = Field(default=0, ge=0, le=100, description="人物融入度")

    coh_timeline: float = Field(default=0, ge=0, le=100, description="时间线一致性")
    coh_cross_chapter: float = Field(default=0, ge=0, le=100, description="跨章节关联")

    rich_analogy: float = Field(default=0, ge=0, le=100, description="类比与引用")
    rich_diversity: float = Field(default=0, ge=0, le=100, description="表达多样性")

    pub_editorial_cost: float = Field(default=0, ge=0, le=100, description="编辑成本")
    pub_completeness: float = Field(default=0, ge=0, le=100, description="完整度")

    total_score: float = Field(default=0, ge=0, le=100)
    rationale: str = ""

    major_strengths: list[str] = Field(default_factory=list)
    major_issues: list[str] = Field(default_factory=list)
    insufficient_evidence: list[str] = Field(default_factory=list)
    evidence_refs: list[JudgeEvidenceRef] = Field(default_factory=list)
    confidence: float = Field(default=0.75, ge=0.0, le=1.0)

    # Section subtotals; always overwritten from the leaves after validation.
    authenticity_score: float = Field(default=0, ge=0, le=100)
    information_score: float = Field(default=0, ge=0, le=100)
    narrative_score: float = Field(default=0, ge=0, le=100)
    language_score: float = Field(default=0, ge=0, le=100)
    emotion_score: float = Field(default=0, ge=0, le=100)
    character_score: float = Field(default=0, ge=0, le=100)
    coherence_score: float = Field(default=0, ge=0, le=100)
    richness_score: float = Field(default=0, ge=0, le=100)
    publish_ready_score: float = Field(default=0, ge=0, le=100)

    @model_validator(mode="before")
    @classmethod
    def _coerce_memoir_judge_input(cls, data: Any) -> Any:
        """Leniently normalize the raw judge payload before field validation.

        Non-dict input is returned untouched. Dict input is shallow-copied
        first, so the caller's payload is never modified.
        """
        if not isinstance(data, dict):
            return data
        data = dict(data)

        def _text(v: Any) -> str:
            # null must become "", not the literal string "None".
            return "" if v is None else str(v)

        def _loose_score(v: Any) -> float:
            # Any value that is missing, unparsable, NaN or infinite scores 0;
            # everything else is clamped to the relaxed 0-100 range.
            if v is None:
                return 0.0
            try:
                x = float(v)
            except (TypeError, ValueError, OverflowError):
                # OverflowError: float() of an int too large for a double.
                return 0.0
            if x != x or x in (float("inf"), float("-inf")):
                return 0.0
            return max(0.0, min(100.0, x))

        data["rationale"] = _text(data.get("rationale"))

        for key in ("major_strengths", "major_issues", "insufficient_evidence"):
            data[key] = _coerce_judge_str_list(data.get(key))

        raw_refs = data.get("evidence_refs")
        if isinstance(raw_refs, list):
            # Keep only dict items and pre-trim each field so that an
            # over-long or malformed reference cannot fail validation.
            data["evidence_refs"] = [
                {
                    "dimension": _text(item.get("dimension"))[:200],
                    "turn_index": _safe_int_bounds(
                        item.get("turn_index"), default=-1, ge=-1, le=500_000
                    ),
                    "snippet": _text(item.get("snippet"))[:400],
                }
                for item in raw_refs
                if isinstance(item, dict)
            ]
        else:
            data["evidence_refs"] = []

        for fname in _MEMOIR_LEAF_SCORE_BOUNDS:
            data[fname] = _loose_score(data.get(fname))

        aggregate_keys = (
            "authenticity_score",
            "information_score",
            "narrative_score",
            "language_score",
            "emotion_score",
            "character_score",
            "coherence_score",
            "richness_score",
            "publish_ready_score",
            "total_score",
        )
        for fname in aggregate_keys:
            if fname not in data:
                continue
            if data[fname] is None:
                # An explicit null would fail float validation; drop the key
                # so the field default applies instead.
                del data[fname]
            else:
                data[fname] = _loose_score(data[fname])

        if "confidence" in data:
            try:
                c = float(data["confidence"])
            except (TypeError, ValueError, OverflowError):
                # Also reached for an explicit null (float(None) -> TypeError).
                c = float("nan")
            if c != c:
                # Unusable value: remove it so the field default applies.
                del data["confidence"]
            else:
                data["confidence"] = max(0.0, min(1.0, c))
        return data

    @model_validator(mode="after")
    def _cap_meta_fields_and_sync_totals(self) -> Self:
        """Trim the text lists, clamp leaves to the rubric, rebuild totals."""

        def _cap_str_list(
            xs: list[str], *, max_items: int, max_chars: int
        ) -> list[str]:
            # Blank entries are dropped *before* the item cap is applied, so
            # they do not use up slots that later non-blank entries could fill.
            stripped = (str(x).strip() for x in xs)
            return [s[:max_chars] for s in stripped if s][:max_items]

        # object.__setattr__ stores values directly on the instance instead of
        # going through BaseModel.__setattr__.
        list_caps = (
            ("major_strengths", 8),
            ("major_issues", 10),
            ("insufficient_evidence", 12),
        )
        for name, max_items in list_caps:
            capped = _cap_str_list(
                getattr(self, name), max_items=max_items, max_chars=200
            )
            object.__setattr__(self, name, capped)
        object.__setattr__(self, "evidence_refs", list(self.evidence_refs)[:12])

        for fname, (lo, hi) in _MEMOIR_LEAF_SCORE_BOUNDS.items():
            raw = float(getattr(self, fname))
            # The bounds are converted to float so the stored value stays a
            # float: max()/min() return their first argument on ties, which
            # would otherwise put an int bound into a float field.
            object.__setattr__(self, fname, max(float(lo), min(float(hi), raw)))

        sections = {
            "authenticity_score": (
                "mem_fidelity",
                "mem_factual_accuracy",
                "mem_factual_coverage",
                "mem_traceability",
            ),
            "information_score": (
                "info_slot_coverage",
                "info_sufficiency",
                "info_density",
            ),
            "narrative_score": (
                "narr_structure",
                "narr_paragraphs",
                "narr_pacing",
            ),
            "language_score": (
                "lang_fluency",
                "lang_conciseness",
                "lang_literary",
                "lang_controlled_expansion",
                "lang_detail",
                "lang_style",
            ),
            "emotion_score": ("emo_authenticity", "emo_depth"),
            "character_score": (
                "char_understanding",
                "char_consistency",
                "char_integration",
            ),
            "coherence_score": ("coh_timeline", "coh_cross_chapter"),
            "richness_score": ("rich_analogy", "rich_diversity"),
            "publish_ready_score": ("pub_editorial_cost", "pub_completeness"),
        }

        # Subtotals and the grand total always come from the clamped leaves,
        # replacing whatever the judge model reported.
        expected = 0.0
        for aggregate, leaves in sections.items():
            subtotal = sum(getattr(self, leaf) for leaf in leaves)
            object.__setattr__(self, aggregate, subtotal)
            expected += subtotal

        synced = max(0.0, min(100.0, round(float(expected), 2)))
        object.__setattr__(self, "total_score", synced)
        return self
|