# api/app/features/evaluation/judge_schemas.py

"""评审 LLM 结构化输出（json_object）。

成稿（回忆录）子项上限已自洽为 **总分 100**（由原 110 分表等比例收紧整数档，见附件 rubric）。
"""

from __future__ import annotations

from typing import Any, Self

from pydantic import BaseModel, ConfigDict, Field, model_validator


class JudgeEvidenceRef(BaseModel):
    """Evidence reference emitted by the judge so a human can re-check a score.

    Locates either a dialogue turn or a fragment of the finished memoir.
    """

    # Unknown keys in the judge's JSON are ignored rather than rejected.
    model_config = ConfigDict(extra="ignore")

    # Scoring dimension this evidence supports (free text, empty by default).
    dimension: str = ""
    # Turn position; defaults to -1 and may not go below it.
    # NOTE(review): -1 presumably means "no specific turn" - confirm with callers.
    turn_index: int = Field(default=-1, ge=-1)
    # Quoted excerpt; values longer than 400 characters fail validation.
    snippet: str = Field(default="", max_length=400)


def _is_judge_list_placeholder_empty(s: str) -> bool:
    """LLM 有时输出单句占位（如 'None identified.'）而非 JSON 数组，按空列表处理。"""
    t = s.strip()
    if not t:
        return True
    tl = t.lower().rstrip(".")
    if tl in (
        "none",
        "none identified",
        "n/a",
        "na",
        "-",
        "nil",
        "null",
        "no issues",
        "no issue",
        "not applicable",
    ):
        return True
    tc = t.rstrip("。")
    if tc in ("无", "暂无", "未发现", "没有"):
        return True
    return False


def _safe_int_bounds(value: Any, *, default: int, ge: int, le: int) -> int:
    try:
        v = int(value)
    except (TypeError, ValueError):
        return default
    return max(ge, min(le, v))


def _coerce_judge_str_list(value: Any) -> list[Any]:
    """将评审 JSON 中的 list[str] 字段从 str / null 规范为列表（兼容 GLM-5 等输出的非数组形态）。"""
    if value is None:
        return []
    if isinstance(value, list):
        return value
    if isinstance(value, str):
        s = value.strip()
        if _is_judge_list_placeholder_empty(s):
            return []
        return [s]
    return []


class TurnJudgeOutput(BaseModel):
    """Dialogue-quality score sheet for a single turn or a whole conversation.

    Emotion-weighted 100-point rubric: 16 leaf scores in five groups
    (30 + 25 + 15 + 15 + 15). The leaf scores are authoritative; the group
    scores and ``total_score`` are always recomputed from them after validation.
    Leaf scores are validated strictly against their rubric caps, while the
    free-text / metadata fields are coerced leniently so that a malformed
    comment never discards an otherwise valid score sheet.
    """

    # Unknown keys in the judge's JSON are ignored rather than rejected.
    model_config = ConfigDict(extra="ignore")

    # 1. Emotional value and companionship (30)
    emotion_carry: float = Field(ge=0, le=10, description="情绪承接能力")
    empathy_depth: float = Field(ge=0, le=8, description="共情深度")
    emotion_safety: float = Field(ge=0, le=6, description="情绪安全感")
    emotion_guidance: float = Field(ge=0, le=6, description="情绪引导能力")

    # 2. Information gathering (25)
    fact_mining: float = Field(ge=0, le=8, description="关键事实挖掘")
    info_completeness_guide: float = Field(ge=0, le=8, description="信息完整性引导")
    info_depth_mining: float = Field(ge=0, le=9, description="信息深度挖掘")

    # 3. Persona modelling (15)
    persona_understanding: float = Field(ge=0, le=7, description="人物理解")
    persona_consistency_verify: float = Field(ge=0, le=4, description="人物一致性验证")
    persona_expression_guide: float = Field(ge=0, le=4, description="人物表达引导")

    # 4. Structured guidance (15)
    interview_structure: float = Field(ge=0, le=6, description="访谈结构")
    context_memory: float = Field(ge=0, le=5, description="上下文记忆")
    rhythm_control: float = Field(ge=0, le=4, description="节奏控制")

    # 5. Question quality (15)
    question_quality: float = Field(ge=0, le=7, description="问题质量")
    follow_up_depth: float = Field(ge=0, le=5, description="追问能力")
    non_leading: float = Field(ge=0, le=3, description="非引导性")

    # Required on input, but overwritten with the sum of the leaf scores.
    total_score: float = Field(ge=0, le=100)
    rationale: str = ""

    major_strengths: list[str] = Field(default_factory=list)
    major_issues: list[str] = Field(default_factory=list)
    insufficient_evidence: list[str] = Field(default_factory=list)
    evidence_refs: list[JudgeEvidenceRef] = Field(default_factory=list)
    confidence: float = Field(default=0.75, ge=0.0, le=1.0)

    # Group-level scores kept for compatibility with historical JSON; optional
    # on input and always re-derived from the leaf scores.
    emotion_score: float = Field(default=0, ge=0, le=30)
    information_score: float = Field(default=0, ge=0, le=25)
    persona_score: float = Field(default=0, ge=0, le=15)
    structure_score: float = Field(default=0, ge=0, le=15)
    question_score: float = Field(default=0, ge=0, le=15)

    @model_validator(mode="before")
    @classmethod
    def _coerce_null_lists(cls, data: Any) -> Any:
        """Repair the non-score fields of a raw judge payload before validation.

        Non-dict input is returned untouched. For dict input a shallow copy is
        repaired and returned, so the caller's payload is never modified:

        * the three string-list fields accept ``null`` / a single string;
        * ``evidence_refs`` accepts ``null`` or a non-list (-> empty), drops
          members that are neither dicts nor ``JudgeEvidenceRef`` instances,
          and trims each dict member to what ``JudgeEvidenceRef`` accepts;
        * ``null`` for ``rationale`` / ``confidence`` falls back to the default.
        """
        if not isinstance(data, dict):
            return data
        data = dict(data)

        for key in ("major_strengths", "major_issues", "insufficient_evidence"):
            data[key] = _coerce_judge_str_list(data.get(key))

        def _text(v: Any) -> str:
            # JSON null must become "", not the literal text "None".
            return "" if v is None else str(v)

        refs: list[Any] = []
        raw_refs = data.get("evidence_refs")
        if isinstance(raw_refs, (list, tuple)):
            for item in raw_refs:
                if isinstance(item, JudgeEvidenceRef):
                    refs.append(item)
                elif isinstance(item, dict):
                    refs.append(
                        {
                            "dimension": _text(item.get("dimension")),
                            "turn_index": _safe_int_bounds(
                                item.get("turn_index"), default=-1, ge=-1, le=500_000
                            ),
                            # Same limit as JudgeEvidenceRef.snippet max_length.
                            "snippet": _text(item.get("snippet"))[:400],
                        }
                    )
        data["evidence_refs"] = refs

        if data.get("rationale") is None:
            data["rationale"] = ""
        if "confidence" in data and data["confidence"] is None:
            del data["confidence"]  # let the field default apply
        return data

    @model_validator(mode="after")
    def _cap_meta_fields_and_sync_totals(self) -> Self:
        """Cap the free-text lists and recompute group scores and the total."""

        def _cap_str_list(
            xs: list[str], *, max_items: int, max_chars: int
        ) -> list[str]:
            # Look at the first max_items entries only; drop blanks, trim the rest.
            out: list[str] = []
            for x in xs[:max_items]:
                s = str(x).strip()
                if s:
                    out.append(s[:max_chars])
            return out

        # object.__setattr__ is used throughout, as in the original, to write
        # the derived values directly onto the instance.
        for field_name, max_items in (
            ("major_strengths", 8),
            ("major_issues", 10),
            ("insufficient_evidence", 10),
        ):
            object.__setattr__(
                self,
                field_name,
                _cap_str_list(
                    getattr(self, field_name), max_items=max_items, max_chars=200
                ),
            )
        object.__setattr__(self, "evidence_refs", list(self.evidence_refs)[:12])

        emotion = (
            self.emotion_carry
            + self.empathy_depth
            + self.emotion_safety
            + self.emotion_guidance
        )
        information = (
            self.fact_mining + self.info_completeness_guide + self.info_depth_mining
        )
        persona = (
            self.persona_understanding
            + self.persona_consistency_verify
            + self.persona_expression_guide
        )
        structure = self.interview_structure + self.context_memory + self.rhythm_control
        question = self.question_quality + self.follow_up_depth + self.non_leading
        expected = emotion + information + persona + structure + question

        # Leaf scores are the single source of truth: LLMs often report a
        # total_score that disagrees with them (e.g. 100 while the emotion
        # block is one point short), so the reported total is discarded.
        synced = max(0.0, min(100.0, round(float(expected), 2)))
        object.__setattr__(self, "total_score", synced)
        object.__setattr__(self, "emotion_score", emotion)
        object.__setattr__(self, "information_score", information)
        object.__setattr__(self, "persona_score", persona)
        object.__setattr__(self, "structure_score", structure)
        object.__setattr__(self, "question_score", question)
        return self


# A whole-transcript judgement uses the same leaf items as a single-turn one.
ConversationJudgeOutput = TurnJudgeOutput


# (lower, upper) bound of every memoir leaf score; the upper bounds add up to 100.
# The judge LLM often scores a leaf slightly above its maximum as a float, so
# MemoirJudgeOutput coerces each listed field before validation and clamps it to
# these bounds afterwards instead of discarding the whole JSON.
_MEMOIR_LEAF_SCORE_BOUNDS: dict[str, tuple[float, float]] = {
    # authenticity (23)
    "mem_fidelity": (0, 9),
    "mem_factual_accuracy": (0, 5),
    "mem_factual_coverage": (0, 5),
    "mem_traceability": (0, 4),
    # information (14)
    "info_slot_coverage": (0, 6),
    "info_sufficiency": (0, 4),
    "info_density": (0, 4),
    # narrative (14)
    "narr_structure": (0, 6),
    "narr_paragraphs": (0, 5),
    "narr_pacing": (0, 3),
    # language (18)
    "lang_fluency": (0, 3),
    "lang_conciseness": (0, 3),
    "lang_literary": (0, 4),
    "lang_controlled_expansion": (0, 4),
    "lang_detail": (0, 2),
    "lang_style": (0, 2),
    # emotion (9)
    "emo_authenticity": (0, 5),
    "emo_depth": (0, 4),
    # character (9)
    "char_understanding": (0, 4),
    "char_consistency": (0, 3),
    "char_integration": (0, 2),
    # coherence (4)
    "coh_timeline": (0, 2),
    "coh_cross_chapter": (0, 2),
    # richness (5)
    "rich_analogy": (0, 3),
    "rich_diversity": (0, 2),
    # publish readiness (4)
    "pub_editorial_cost": (0, 2),
    "pub_completeness": (0, 2),
}


class MemoirJudgeOutput(BaseModel):
    """Score sheet for a finished memoir (total 100; per-item caps follow the rubric).

    The product cares most about keeping the judge's *text* (comparison notes,
    improvement suggestions), so the numeric fields are deliberately forgiving:
    every score is first coerced into ``0-100`` before validation, then the
    leaf scores are clamped to their rubric caps and all group scores and the
    total are recomputed after validation. A badly scored sheet therefore never
    causes the whole JSON to be discarded.
    """

    # Unknown keys in the judge's JSON are ignored rather than rejected.
    model_config = ConfigDict(extra="ignore")

    # Leaf scores: validation is relaxed to 0-100; the real caps live in
    # _MEMOIR_LEAF_SCORE_BOUNDS and are enforced by the "after" validator.
    mem_fidelity: float = Field(default=0, ge=0, le=100, description="记忆忠实度")
    mem_factual_accuracy: float = Field(
        default=0, ge=0, le=100, description="事实准确性"
    )
    mem_factual_coverage: float = Field(
        default=0, ge=0, le=100, description="事实覆盖率"
    )
    mem_traceability: float = Field(default=0, ge=0, le=100, description="记忆可追溯性")

    info_slot_coverage: float = Field(default=0, ge=0, le=100, description="槽位覆盖度")
    info_sufficiency: float = Field(default=0, ge=0, le=100, description="信息充分性")
    info_density: float = Field(default=0, ge=0, le=100, description="信息密度")

    narr_structure: float = Field(default=0, ge=0, le=100, description="故事结构")
    narr_paragraphs: float = Field(default=0, ge=0, le=100, description="段落组织")
    narr_pacing: float = Field(default=0, ge=0, le=100, description="节奏控制")

    lang_fluency: float = Field(default=0, ge=0, le=100, description="语言流畅度")
    lang_conciseness: float = Field(default=0, ge=0, le=100, description="表达精炼度")
    lang_literary: float = Field(default=0, ge=0, le=100, description="文笔质量")
    lang_controlled_expansion: float = Field(
        default=0, ge=0, le=100, description="控制性扩写能力"
    )
    lang_detail: float = Field(default=0, ge=0, le=100, description="细节还原与强化")
    lang_style: float = Field(default=0, ge=0, le=100, description="风格一致性")

    emo_authenticity: float = Field(default=0, ge=0, le=100, description="情感真实度")
    emo_depth: float = Field(default=0, ge=0, le=100, description="情感深度")

    char_understanding: float = Field(default=0, ge=0, le=100, description="人物理解")
    char_consistency: float = Field(default=0, ge=0, le=100, description="人物一致性")
    char_integration: float = Field(default=0, ge=0, le=100, description="人物融入度")

    coh_timeline: float = Field(default=0, ge=0, le=100, description="时间线一致性")
    coh_cross_chapter: float = Field(default=0, ge=0, le=100, description="跨章节关联")

    rich_analogy: float = Field(default=0, ge=0, le=100, description="类比与引用")
    rich_diversity: float = Field(default=0, ge=0, le=100, description="表达多样性")

    pub_editorial_cost: float = Field(default=0, ge=0, le=100, description="编辑成本")
    pub_completeness: float = Field(default=0, ge=0, le=100, description="完整度")

    # Overwritten with the sum of the clamped leaf scores.
    total_score: float = Field(default=0, ge=0, le=100)
    rationale: str = ""

    major_strengths: list[str] = Field(default_factory=list)
    major_issues: list[str] = Field(default_factory=list)
    insufficient_evidence: list[str] = Field(default_factory=list)
    evidence_refs: list[JudgeEvidenceRef] = Field(default_factory=list)
    confidence: float = Field(default=0.75, ge=0.0, le=1.0)

    # Group-level scores; always re-derived from the leaf scores.
    authenticity_score: float = Field(default=0, ge=0, le=100)
    information_score: float = Field(default=0, ge=0, le=100)
    narrative_score: float = Field(default=0, ge=0, le=100)
    language_score: float = Field(default=0, ge=0, le=100)
    emotion_score: float = Field(default=0, ge=0, le=100)
    character_score: float = Field(default=0, ge=0, le=100)
    coherence_score: float = Field(default=0, ge=0, le=100)
    richness_score: float = Field(default=0, ge=0, le=100)
    publish_ready_score: float = Field(default=0, ge=0, le=100)

    @model_validator(mode="before")
    @classmethod
    def _coerce_memoir_judge_input(cls, data: Any) -> Any:
        """Repair a raw judge payload so that validation cannot fail on it.

        Non-dict input is returned untouched. For dict input a shallow copy is
        repaired and returned, so the caller's payload is never modified:

        * ``rationale``: ``null`` -> ``""``, anything else -> ``str()``;
        * string-list fields: ``null`` / single string -> list;
        * ``evidence_refs``: non-list -> empty; dict members are trimmed to
          what ``JudgeEvidenceRef`` accepts, ``JudgeEvidenceRef`` instances are
          kept, everything else is dropped;
        * leaf scores: always set, coerced into ``0-100`` (junk -> ``0``);
        * group scores / total: coerced the same way when present;
        * ``confidence``: clamped to ``0-1``; ``null``, NaN or junk removes the
          key so the field default applies.
        """
        if not isinstance(data, dict):
            return data
        data = dict(data)

        rationale = data.get("rationale")
        data["rationale"] = "" if rationale is None else str(rationale)

        for key in ("major_strengths", "major_issues", "insufficient_evidence"):
            data[key] = _coerce_judge_str_list(data.get(key))

        def _text(v: Any, limit: int) -> str:
            # JSON null must become "", not the literal text "None".
            return "" if v is None else str(v)[:limit]

        clean: list[Any] = []
        raw_refs = data.get("evidence_refs")
        if isinstance(raw_refs, (list, tuple)):
            for item in raw_refs:
                if isinstance(item, JudgeEvidenceRef):
                    clean.append(item)
                elif isinstance(item, dict):
                    clean.append(
                        {
                            "dimension": _text(item.get("dimension"), 200),
                            "turn_index": _safe_int_bounds(
                                item.get("turn_index"), default=-1, ge=-1, le=500_000
                            ),
                            "snippet": _text(item.get("snippet"), 400),
                        }
                    )
        data["evidence_refs"] = clean

        def _loose_score(v: Any) -> float:
            # Any value -> finite float in 0-100; unusable values count as 0.
            if v is None:
                return 0.0
            try:
                x = float(v)
            except (TypeError, ValueError):
                return 0.0
            if x != x or x in (float("inf"), float("-inf")):
                return 0.0
            return max(0.0, min(100.0, x))

        for fname in _MEMOIR_LEAF_SCORE_BOUNDS:
            data[fname] = _loose_score(data.get(fname))

        _agg_keys = (
            "authenticity_score",
            "information_score",
            "narrative_score",
            "language_score",
            "emotion_score",
            "character_score",
            "coherence_score",
            "richness_score",
            "publish_ready_score",
            "total_score",
        )
        for fname in _agg_keys:
            # These are recomputed after validation; coercing here (null -> 0)
            # only has to keep float validation from failing.
            if fname in data:
                data[fname] = _loose_score(data[fname])

        if "confidence" in data:
            try:
                c = float(data["confidence"])
            except (TypeError, ValueError):
                c = float("nan")
            if c != c:
                del data["confidence"]  # let the field default apply
            else:
                data["confidence"] = max(0.0, min(1.0, c))
        return data

    @model_validator(mode="after")
    def _cap_meta_fields_and_sync_totals(self) -> Self:
        """Cap the free-text lists, clamp leaf scores and recompute all sums."""

        def _cap_str_list(
            xs: list[str], *, max_items: int, max_chars: int
        ) -> list[str]:
            # Look at the first max_items entries only; drop blanks, trim the rest.
            out: list[str] = []
            for x in xs[:max_items]:
                s = str(x).strip()
                if s:
                    out.append(s[:max_chars])
            return out

        # object.__setattr__ is used throughout, as in the original, to write
        # the derived values directly onto the instance.
        for field_name, max_items in (
            ("major_strengths", 8),
            ("major_issues", 10),
            ("insufficient_evidence", 12),
        ):
            object.__setattr__(
                self,
                field_name,
                _cap_str_list(
                    getattr(self, field_name), max_items=max_items, max_chars=200
                ),
            )
        object.__setattr__(self, "evidence_refs", list(self.evidence_refs)[:12])

        for fname, (lo, hi) in _MEMOIR_LEAF_SCORE_BOUNDS.items():
            raw = float(getattr(self, fname))
            # The bounds are int literals, so max()/min() may hand back an int;
            # float() keeps the float-typed fields (and their sums) float.
            object.__setattr__(self, fname, float(max(lo, min(hi, raw))))

        authenticity = (
            self.mem_fidelity
            + self.mem_factual_accuracy
            + self.mem_factual_coverage
            + self.mem_traceability
        )
        information = (
            self.info_slot_coverage + self.info_sufficiency + self.info_density
        )
        narrative = self.narr_structure + self.narr_paragraphs + self.narr_pacing
        language = (
            self.lang_fluency
            + self.lang_conciseness
            + self.lang_literary
            + self.lang_controlled_expansion
            + self.lang_detail
            + self.lang_style
        )
        emotion = self.emo_authenticity + self.emo_depth
        character = (
            self.char_understanding + self.char_consistency + self.char_integration
        )
        coherence = self.coh_timeline + self.coh_cross_chapter
        richness = self.rich_analogy + self.rich_diversity
        publish = self.pub_editorial_cost + self.pub_completeness
        expected = (
            authenticity
            + information
            + narrative
            + language
            + emotion
            + character
            + coherence
            + richness
            + publish
        )

        # The reported total is discarded in favour of the leaf-score sum.
        synced = max(0.0, min(100.0, round(float(expected), 2)))
        object.__setattr__(self, "total_score", synced)
        object.__setattr__(self, "authenticity_score", authenticity)
        object.__setattr__(self, "information_score", information)
        object.__setattr__(self, "narrative_score", narrative)
        object.__setattr__(self, "language_score", language)
        object.__setattr__(self, "emotion_score", emotion)
        object.__setattr__(self, "character_score", character)
        object.__setattr__(self, "coherence_score", coherence)
        object.__setattr__(self, "richness_score", richness)
        object.__setattr__(self, "publish_ready_score", publish)
        return self
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
+								"""评审 LLM 结构化输出（json_object）。
 								成稿（回忆录）子项上限已自洽为 **总分 100**（由原 110 分表等比例收紧整数档，见附件 rubric）。
 								"""
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
 								from __future__ import annotations
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								from typing import Any, Self
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								from pydantic import BaseModel, ConfigDict, Field, model_validator
 								class JudgeEvidenceRef(BaseModel):
 								    """评审引用：便于人工复核（对话 Turn 或成稿片段定位）。"""
 								    model_config = ConfigDict(extra="ignore")
 								    dimension: str = ""
 								    turn_index: int = Field(default=-1, ge=-1)
 								    snippet: str = Field(default="", max_length=400)
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
-												feat(evaluation): memoir readiness, judge/replay updates, eval web playground

Add memoir_readiness_service and router tests; extend judge schemas/services, replay_service, and conversation rubric; align story route agent, payload, prompts, and story_pipeline_sync; update agent logging, config, and DI. Document internal-eval; add replayDraft util and PlaygroundPage changes in app-eval-web.

											
										
										
											2026-04-08 09:38:07 +08:00
+								def _is_judge_list_placeholder_empty(s: str) -> bool:
 								    """LLM 有时输出单句占位（如 'None identified.'）而非 JSON 数组，按空列表处理。"""
 								    t = s.strip()
 								    if not t:
 								        return True
 								    tl = t.lower().rstrip(".")
 								    if tl in (
 								        "none",
 								        "none identified",
 								        "n/a",
 								        "na",
 								        "-",
 								        "nil",
 								        "null",
 								        "no issues",
 								        "no issue",
 								        "not applicable",
 								    ):
 								        return True
 								    tc = t.rstrip("。")
 								    if tc in ("无", "暂无", "未发现", "没有"):
 								        return True
 								    return False
-												feat(eval): memoir A/B chapter judging and eval-web parity with dialogue

- Judge baseline excerpt and library chapter separately; build_memoir_compare_summary for gate, nine-dim and leaf deltas.

- Memoir SSE chapter payload: baseline_judge, compare_summary, baseline_judge_error.

- MemoirJudgeOutput: loose score coercion and post-validate clamp; memoir judge prompt caps from settings.

- app-eval-web: two-column MemoirScoreCard layout, MemoirCompareSummary, chapter blocks and CSS.

- Add memoir_compare_summary, log_events, celery_log_context, memoir_pipeline_progress; tests and migration 0014.

- Misc: memory/evidence and enrichment paths, task/orchestrator updates, internal-eval docs, env examples.

											
										
										
											2026-04-10 10:23:43 +08:00
+								def _safe_int_bounds(value: Any, *, default: int, ge: int, le: int) -> int:
 								    try:
 								        v = int(value)
 								    except (TypeError, ValueError):
 								        return default
 								    return max(ge, min(le, v))
-												feat(evaluation): memoir readiness, judge/replay updates, eval web playground

Add memoir_readiness_service and router tests; extend judge schemas/services, replay_service, and conversation rubric; align story route agent, payload, prompts, and story_pipeline_sync; update agent logging, config, and DI. Document internal-eval; add replayDraft util and PlaygroundPage changes in app-eval-web.

											
										
										
											2026-04-08 09:38:07 +08:00
+								def _coerce_judge_str_list(value: Any) -> list[Any]:
 								    """将评审 JSON 中的 list[str] 字段从 str / null 规范为列表（兼容 GLM-5 等输出的非数组形态）。"""
 								    if value is None:
 								        return []
 								    if isinstance(value, list):
 								        return value
 								    if isinstance(value, str):
 								        s = value.strip()
 								        if _is_judge_list_placeholder_empty(s):
 								            return []
 								        return [s]
 								    return []
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								class TurnJudgeOutput(BaseModel):
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
+								    """单轮 / 整段对话质量（情绪强化版 100 分，15 个细项）。"""
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								    model_config = ConfigDict(extra="ignore")
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
+								    # 一、情绪价值与陪伴感（30）
 								    emotion_carry: float = Field(ge=0, le=10, description="情绪承接能力")
 								    empathy_depth: float = Field(ge=0, le=8, description="共情深度")
 								    emotion_safety: float = Field(ge=0, le=6, description="情绪安全感")
 								    emotion_guidance: float = Field(ge=0, le=6, description="情绪引导能力")
 								    # 二、信息获取能力（25）
 								    fact_mining: float = Field(ge=0, le=8, description="关键事实挖掘")
 								    info_completeness_guide: float = Field(ge=0, le=8, description="信息完整性引导")
 								    info_depth_mining: float = Field(ge=0, le=9, description="信息深度挖掘")
 								    # 三、人物建模能力（15）
 								    persona_understanding: float = Field(ge=0, le=7, description="人物理解")
 								    persona_consistency_verify: float = Field(ge=0, le=4, description="人物一致性验证")
 								    persona_expression_guide: float = Field(ge=0, le=4, description="人物表达引导")
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
+								    # 四、结构化引导（15）
 								    interview_structure: float = Field(ge=0, le=6, description="访谈结构")
 								    context_memory: float = Field(ge=0, le=5, description="上下文记忆")
 								    rhythm_control: float = Field(ge=0, le=4, description="节奏控制")
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
+								    # 五、提问质量（15）
 								    question_quality: float = Field(ge=0, le=7, description="问题质量")
 								    follow_up_depth: float = Field(ge=0, le=5, description="追问能力")
 								    non_leading: float = Field(ge=0, le=3, description="非引导性")
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
 								    total_score: float = Field(ge=0, le=100)
 								    rationale: str = ""
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								    major_strengths: list[str] = Field(default_factory=list)
 								    major_issues: list[str] = Field(default_factory=list)
 								    insufficient_evidence: list[str] = Field(default_factory=list)
 								    evidence_refs: list[JudgeEvidenceRef] = Field(default_factory=list)
 								    confidence: float = Field(default=0.75, ge=0.0, le=1.0)
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
+								    # 与历史 JSON 对齐的一级聚合分（由细项派生，可缺省由模型填写）
 								    emotion_score: float = Field(default=0, ge=0, le=30)
 								    information_score: float = Field(default=0, ge=0, le=25)
 								    persona_score: float = Field(default=0, ge=0, le=15)
 								    structure_score: float = Field(default=0, ge=0, le=15)
 								    question_score: float = Field(default=0, ge=0, le=15)
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								    @model_validator(mode="before")
 								    @classmethod
 								    def _coerce_null_lists(cls, data: Any) -> Any:
 								        if isinstance(data, dict):
 								            for key in ("major_strengths", "major_issues", "insufficient_evidence"):
-												feat(evaluation): memoir readiness, judge/replay updates, eval web playground

Add memoir_readiness_service and router tests; extend judge schemas/services, replay_service, and conversation rubric; align story route agent, payload, prompts, and story_pipeline_sync; update agent logging, config, and DI. Document internal-eval; add replayDraft util and PlaygroundPage changes in app-eval-web.

											
										
										
											2026-04-08 09:38:07 +08:00
+								                data[key] = _coerce_judge_str_list(data.get(key))
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								            if data.get("evidence_refs") is None:
 								                data["evidence_refs"] = []
 								        return data
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
+								    @model_validator(mode="after")
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								    def _cap_meta_fields_and_sync_totals(self) -> Self:
-												feat:
1. 建立问题库大纲，对应每个人生阶段槽位
2. 鼓励使用更生活化的交流语言共情与总结
3. 降低评审模型可能发生截断的概率
4. 成稿质量维度强化情感表达和上下文连贯性

											
										
										
											2026-04-09 15:32:35 +08:00
+								        def _cap_str_list(
 								            xs: list[str], *, max_items: int, max_chars: int
 								        ) -> list[str]:
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								            out: list[str] = []
 								            for x in xs[:max_items]:
 								                s = str(x).strip()
 								                if s:
 								                    out.append(s[:max_chars])
 								            return out
 								        object.__setattr__(
 								            self,
 								            "major_strengths",
 								            _cap_str_list(self.major_strengths, max_items=8, max_chars=200),
 								        )
 								        object.__setattr__(
 								            self,
 								            "major_issues",
 								            _cap_str_list(self.major_issues, max_items=10, max_chars=200),
 								        )
 								        object.__setattr__(
 								            self,
 								            "insufficient_evidence",
 								            _cap_str_list(self.insufficient_evidence, max_items=10, max_chars=200),
 								        )
 								        refs = list(self.evidence_refs)[:12]
 								        object.__setattr__(self, "evidence_refs", refs)
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
+								        emotion = (
 								            self.emotion_carry
 								            + self.empathy_depth
 								            + self.emotion_safety
 								            + self.emotion_guidance
 								        )
 								        information = (
 								            self.fact_mining + self.info_completeness_guide + self.info_depth_mining
 								        )
 								        persona = (
 								            self.persona_understanding
 								            + self.persona_consistency_verify
 								            + self.persona_expression_guide
 								        )
 								        structure = self.interview_structure + self.context_memory + self.rhythm_control
 								        question = self.question_quality + self.follow_up_depth + self.non_leading
 								        expected = emotion + information + persona + structure + question
-												feat(evaluation): memoir readiness, judge/replay updates, eval web playground

Add memoir_readiness_service and router tests; extend judge schemas/services, replay_service, and conversation rubric; align story route agent, payload, prompts, and story_pipeline_sync; update agent logging, config, and DI. Document internal-eval; add replayDraft util and PlaygroundPage changes in app-eval-web.

											
										
										
											2026-04-08 09:38:07 +08:00
+								        # 细项为唯一事实来源：LLM 常把 total_score 写成 100 与前四项打满但情绪块少 1 分等情况不一致
 								        synced = max(0.0, min(100.0, round(float(expected), 2)))
 								        object.__setattr__(self, "total_score", synced)
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
+								        object.__setattr__(self, "emotion_score", emotion)
 								        object.__setattr__(self, "information_score", information)
 								        object.__setattr__(self, "persona_score", persona)
 								        object.__setattr__(self, "structure_score", structure)
 								        object.__setattr__(self, "question_score", question)
 								        return self
# A whole-transcript judgement uses the same leaf items as a single-turn one.
ConversationJudgeOutput = TurnJudgeOutput
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
-												feat(eval): memoir A/B chapter judging and eval-web parity with dialogue

- Judge baseline excerpt and library chapter separately; build_memoir_compare_summary for gate, nine-dim and leaf deltas.

- Memoir SSE chapter payload: baseline_judge, compare_summary, baseline_judge_error.

- MemoirJudgeOutput: loose score coercion and post-validate clamp; memoir judge prompt caps from settings.

- app-eval-web: two-column MemoirScoreCard layout, MemoirCompareSummary, chapter blocks and CSS.

- Add memoir_compare_summary, log_events, celery_log_context, memoir_pipeline_progress; tests and migration 0014.

- Misc: memory/evidence and enrichment paths, task/orchestrator updates, internal-eval docs, env examples.

											
										
										
											2026-04-10 10:23:43 +08:00
+								# 评审 LLM 常把细项打成「略超满分」的浮点；先钳制再校验，避免整 JSON 丢弃。
 								_MEMOIR_LEAF_SCORE_BOUNDS: dict[str, tuple[float, float]] = {
 								    "mem_fidelity": (0, 9),
 								    "mem_factual_accuracy": (0, 5),
 								    "mem_factual_coverage": (0, 5),
 								    "mem_traceability": (0, 4),
 								    "info_slot_coverage": (0, 6),
 								    "info_sufficiency": (0, 4),
 								    "info_density": (0, 4),
 								    "narr_structure": (0, 6),
 								    "narr_paragraphs": (0, 5),
 								    "narr_pacing": (0, 3),
 								    "lang_fluency": (0, 3),
 								    "lang_conciseness": (0, 3),
 								    "lang_literary": (0, 4),
 								    "lang_controlled_expansion": (0, 4),
 								    "lang_detail": (0, 2),
 								    "lang_style": (0, 2),
 								    "emo_authenticity": (0, 5),
 								    "emo_depth": (0, 4),
 								    "char_understanding": (0, 4),
 								    "char_consistency": (0, 3),
 								    "char_integration": (0, 2),
 								    "coh_timeline": (0, 2),
 								    "coh_cross_chapter": (0, 2),
 								    "rich_analogy": (0, 3),
 								    "rich_diversity": (0, 2),
 								    "pub_editorial_cost": (0, 2),
 								    "pub_completeness": (0, 2),
 								}
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								class MemoirJudgeOutput(BaseModel):
-												feat(eval): memoir A/B chapter judging and eval-web parity with dialogue

- Judge baseline excerpt and library chapter separately; build_memoir_compare_summary for gate, nine-dim and leaf deltas.

- Memoir SSE chapter payload: baseline_judge, compare_summary, baseline_judge_error.

- MemoirJudgeOutput: loose score coercion and post-validate clamp; memoir judge prompt caps from settings.

- app-eval-web: two-column MemoirScoreCard layout, MemoirCompareSummary, chapter blocks and CSS.

- Add memoir_compare_summary, log_events, celery_log_context, memoir_pipeline_progress; tests and migration 0014.

- Misc: memory/evidence and enrichment paths, task/orchestrator updates, internal-eval docs, env examples.

											
										
										
											2026-04-10 10:23:43 +08:00
+								    """成稿回忆录评分（总分 100，子项上限见 rubric）。
 								    产品优先保留 **文字**（对照说明、改进建议）：细项分值允许模型乱写，入模时先放宽到
 								    ``0–100``，再在 ``mode=\"after\"`` 中按 rubric 上限钳制并重算 total，避免因分数校验丢整段 JSON。
 								    """
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								    model_config = ConfigDict(extra="ignore")
-												feat(eval): memoir A/B chapter judging and eval-web parity with dialogue

- Judge baseline excerpt and library chapter separately; build_memoir_compare_summary for gate, nine-dim and leaf deltas.

- Memoir SSE chapter payload: baseline_judge, compare_summary, baseline_judge_error.

- MemoirJudgeOutput: loose score coercion and post-validate clamp; memoir judge prompt caps from settings.

- app-eval-web: two-column MemoirScoreCard layout, MemoirCompareSummary, chapter blocks and CSS.

- Add memoir_compare_summary, log_events, celery_log_context, memoir_pipeline_progress; tests and migration 0014.

- Misc: memory/evidence and enrichment paths, task/orchestrator updates, internal-eval docs, env examples.

											
										
										
											2026-04-10 10:23:43 +08:00
+								    # 细项：校验放宽到 0–100；真实满分仍以 rubric 为准，由 after 钳制
 								    mem_fidelity: float = Field(default=0, ge=0, le=100, description="记忆忠实度")
-												fix:
1. 修复登录界面文字被遮挡问题
2. 大字模式关闭后显示异常问题
3. 重新调整大字模式是否开启时的字体显示效果

											
										
										
											2026-04-10 20:35:57 +08:00
+								    mem_factual_accuracy: float = Field(
 								        default=0, ge=0, le=100, description="事实准确性"
 								    )
 								    mem_factual_coverage: float = Field(
 								        default=0, ge=0, le=100, description="事实覆盖率"
 								    )
-												feat(eval): memoir A/B chapter judging and eval-web parity with dialogue

- Judge baseline excerpt and library chapter separately; build_memoir_compare_summary for gate, nine-dim and leaf deltas.

- Memoir SSE chapter payload: baseline_judge, compare_summary, baseline_judge_error.

- MemoirJudgeOutput: loose score coercion and post-validate clamp; memoir judge prompt caps from settings.

- app-eval-web: two-column MemoirScoreCard layout, MemoirCompareSummary, chapter blocks and CSS.

- Add memoir_compare_summary, log_events, celery_log_context, memoir_pipeline_progress; tests and migration 0014.

- Misc: memory/evidence and enrichment paths, task/orchestrator updates, internal-eval docs, env examples.

											
										
										
											2026-04-10 10:23:43 +08:00
+								    mem_traceability: float = Field(default=0, ge=0, le=100, description="记忆可追溯性")
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
-												feat(eval): memoir A/B chapter judging and eval-web parity with dialogue

- Judge baseline excerpt and library chapter separately; build_memoir_compare_summary for gate, nine-dim and leaf deltas.

- Memoir SSE chapter payload: baseline_judge, compare_summary, baseline_judge_error.

- MemoirJudgeOutput: loose score coercion and post-validate clamp; memoir judge prompt caps from settings.

- app-eval-web: two-column MemoirScoreCard layout, MemoirCompareSummary, chapter blocks and CSS.

- Add memoir_compare_summary, log_events, celery_log_context, memoir_pipeline_progress; tests and migration 0014.

- Misc: memory/evidence and enrichment paths, task/orchestrator updates, internal-eval docs, env examples.

											
										
										
											2026-04-10 10:23:43 +08:00
+								    info_slot_coverage: float = Field(default=0, ge=0, le=100, description="槽位覆盖度")
 								    info_sufficiency: float = Field(default=0, ge=0, le=100, description="信息充分性")
 								    info_density: float = Field(default=0, ge=0, le=100, description="信息密度")
 								    narr_structure: float = Field(default=0, ge=0, le=100, description="故事结构")
 								    narr_paragraphs: float = Field(default=0, ge=0, le=100, description="段落组织")
 								    narr_pacing: float = Field(default=0, ge=0, le=100, description="节奏控制")
 								    lang_fluency: float = Field(default=0, ge=0, le=100, description="语言流畅度")
 								    lang_conciseness: float = Field(default=0, ge=0, le=100, description="表达精炼度")
 								    lang_literary: float = Field(default=0, ge=0, le=100, description="文笔质量")
 								    lang_controlled_expansion: float = Field(
 								        default=0, ge=0, le=100, description="控制性扩写能力"
 								    )
 								    lang_detail: float = Field(default=0, ge=0, le=100, description="细节还原与强化")
 								    lang_style: float = Field(default=0, ge=0, le=100, description="风格一致性")
 								    emo_authenticity: float = Field(default=0, ge=0, le=100, description="情感真实度")
 								    emo_depth: float = Field(default=0, ge=0, le=100, description="情感深度")
 								    char_understanding: float = Field(default=0, ge=0, le=100, description="人物理解")
 								    char_consistency: float = Field(default=0, ge=0, le=100, description="人物一致性")
 								    char_integration: float = Field(default=0, ge=0, le=100, description="人物融入度")
 								    coh_timeline: float = Field(default=0, ge=0, le=100, description="时间线一致性")
 								    coh_cross_chapter: float = Field(default=0, ge=0, le=100, description="跨章节关联")
 								    rich_analogy: float = Field(default=0, ge=0, le=100, description="类比与引用")
 								    rich_diversity: float = Field(default=0, ge=0, le=100, description="表达多样性")
 								    pub_editorial_cost: float = Field(default=0, ge=0, le=100, description="编辑成本")
 								    pub_completeness: float = Field(default=0, ge=0, le=100, description="完整度")
 								    total_score: float = Field(default=0, ge=0, le=100)
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								    rationale: str = ""
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								    major_strengths: list[str] = Field(default_factory=list)
 								    major_issues: list[str] = Field(default_factory=list)
 								    insufficient_evidence: list[str] = Field(default_factory=list)
 								    evidence_refs: list[JudgeEvidenceRef] = Field(default_factory=list)
 								    confidence: float = Field(default=0.75, ge=0.0, le=1.0)
-												feat(eval): memoir A/B chapter judging and eval-web parity with dialogue

- Judge baseline excerpt and library chapter separately; build_memoir_compare_summary for gate, nine-dim and leaf deltas.

- Memoir SSE chapter payload: baseline_judge, compare_summary, baseline_judge_error.

- MemoirJudgeOutput: loose score coercion and post-validate clamp; memoir judge prompt caps from settings.

- app-eval-web: two-column MemoirScoreCard layout, MemoirCompareSummary, chapter blocks and CSS.

- Add memoir_compare_summary, log_events, celery_log_context, memoir_pipeline_progress; tests and migration 0014.

- Misc: memory/evidence and enrichment paths, task/orchestrator updates, internal-eval docs, env examples.

											
										
										
											2026-04-10 10:23:43 +08:00
+								    authenticity_score: float = Field(default=0, ge=0, le=100)
 								    information_score: float = Field(default=0, ge=0, le=100)
 								    narrative_score: float = Field(default=0, ge=0, le=100)
 								    language_score: float = Field(default=0, ge=0, le=100)
 								    emotion_score: float = Field(default=0, ge=0, le=100)
 								    character_score: float = Field(default=0, ge=0, le=100)
 								    coherence_score: float = Field(default=0, ge=0, le=100)
 								    richness_score: float = Field(default=0, ge=0, le=100)
 								    publish_ready_score: float = Field(default=0, ge=0, le=100)
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								    @model_validator(mode="before")
 								    @classmethod
-												feat(eval): memoir A/B chapter judging and eval-web parity with dialogue

- Judge baseline excerpt and library chapter separately; build_memoir_compare_summary for gate, nine-dim and leaf deltas.

- Memoir SSE chapter payload: baseline_judge, compare_summary, baseline_judge_error.

- MemoirJudgeOutput: loose score coercion and post-validate clamp; memoir judge prompt caps from settings.

- app-eval-web: two-column MemoirScoreCard layout, MemoirCompareSummary, chapter blocks and CSS.

- Add memoir_compare_summary, log_events, celery_log_context, memoir_pipeline_progress; tests and migration 0014.

- Misc: memory/evidence and enrichment paths, task/orchestrator updates, internal-eval docs, env examples.

											
										
										
											2026-04-10 10:23:43 +08:00
+								    def _coerce_memoir_judge_input(cls, data: Any) -> Any:
 								        if not isinstance(data, dict):
 								            return data
-												fix:
1. 修复登录界面文字被遮挡问题
2. 大字模式关闭后显示异常问题
3. 重新调整大字模式是否开启时的字体显示效果

											
										
										
											2026-04-10 20:35:57 +08:00
+								        data["rationale"] = (
 								            "" if data.get("rationale") is None else str(data["rationale"])
 								        )
-												feat(eval): memoir A/B chapter judging and eval-web parity with dialogue

- Judge baseline excerpt and library chapter separately; build_memoir_compare_summary for gate, nine-dim and leaf deltas.

- Memoir SSE chapter payload: baseline_judge, compare_summary, baseline_judge_error.

- MemoirJudgeOutput: loose score coercion and post-validate clamp; memoir judge prompt caps from settings.

- app-eval-web: two-column MemoirScoreCard layout, MemoirCompareSummary, chapter blocks and CSS.

- Add memoir_compare_summary, log_events, celery_log_context, memoir_pipeline_progress; tests and migration 0014.

- Misc: memory/evidence and enrichment paths, task/orchestrator updates, internal-eval docs, env examples.

											
										
										
											2026-04-10 10:23:43 +08:00
+								        for key in ("major_strengths", "major_issues", "insufficient_evidence"):
 								            data[key] = _coerce_judge_str_list(data.get(key))
 								        raw_refs = data.get("evidence_refs")
 								        if not isinstance(raw_refs, list):
 								            data["evidence_refs"] = []
 								        else:
 								            clean: list[dict[str, Any]] = []
 								            for item in raw_refs:
 								                if not isinstance(item, dict):
 								                    continue
 								                clean.append(
 								                    {
 								                        "dimension": str(item.get("dimension", ""))[:200],
 								                        "turn_index": _safe_int_bounds(
 								                            item.get("turn_index"), default=-1, ge=-1, le=500_000
 								                        ),
 								                        "snippet": str(item.get("snippet", ""))[:400],
 								                    }
 								                )
 								            data["evidence_refs"] = clean
 								        def _loose_score(v: Any) -> float:
 								            if v is None:
 								                return 0.0
 								            try:
 								                x = float(v)
 								            except (TypeError, ValueError):
 								                return 0.0
 								            if x != x or x in (float("inf"), float("-inf")):
 								                return 0.0
 								            return max(0.0, min(100.0, x))
 								        for fname in _MEMOIR_LEAF_SCORE_BOUNDS:
 								            data[fname] = _loose_score(data.get(fname))
 								        _agg_keys = (
 								            "authenticity_score",
 								            "information_score",
 								            "narrative_score",
 								            "language_score",
 								            "emotion_score",
 								            "character_score",
 								            "coherence_score",
 								            "richness_score",
 								            "publish_ready_score",
 								            "total_score",
 								        )
 								        for fname in _agg_keys:
 								            if fname not in data or data[fname] is None:
 								                continue
 								            data[fname] = _loose_score(data[fname])
 								        if "confidence" in data and data["confidence"] is not None:
 								            try:
 								                c = float(data["confidence"])
 								                if c != c:
 								                    raise ValueError
 								                data["confidence"] = max(0.0, min(1.0, c))
 								            except (TypeError, ValueError):
 								                del data["confidence"]
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								        return data
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
+								    @model_validator(mode="after")
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								    def _cap_meta_fields_and_sync_totals(self) -> Self:
-												feat:
1. 建立问题库大纲，对应每个人生阶段槽位
2. 鼓励使用更生活化的交流语言共情与总结
3. 降低评审模型可能发生截断的概率
4. 成稿质量维度强化情感表达和上下文连贯性

											
										
										
											2026-04-09 15:32:35 +08:00
+								        def _cap_str_list(
 								            xs: list[str], *, max_items: int, max_chars: int
 								        ) -> list[str]:
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								            out: list[str] = []
 								            for x in xs[:max_items]:
 								                s = str(x).strip()
 								                if s:
 								                    out.append(s[:max_chars])
 								            return out
 								        object.__setattr__(
 								            self,
 								            "major_strengths",
 								            _cap_str_list(self.major_strengths, max_items=8, max_chars=200),
 								        )
 								        object.__setattr__(
 								            self,
 								            "major_issues",
 								            _cap_str_list(self.major_issues, max_items=10, max_chars=200),
 								        )
 								        object.__setattr__(
 								            self,
 								            "insufficient_evidence",
 								            _cap_str_list(self.insufficient_evidence, max_items=12, max_chars=200),
 								        )
 								        refs = list(self.evidence_refs)[:12]
 								        object.__setattr__(self, "evidence_refs", refs)
-												feat(eval): memoir A/B chapter judging and eval-web parity with dialogue

- Judge baseline excerpt and library chapter separately; build_memoir_compare_summary for gate, nine-dim and leaf deltas.

- Memoir SSE chapter payload: baseline_judge, compare_summary, baseline_judge_error.

- MemoirJudgeOutput: loose score coercion and post-validate clamp; memoir judge prompt caps from settings.

- app-eval-web: two-column MemoirScoreCard layout, MemoirCompareSummary, chapter blocks and CSS.

- Add memoir_compare_summary, log_events, celery_log_context, memoir_pipeline_progress; tests and migration 0014.

- Misc: memory/evidence and enrichment paths, task/orchestrator updates, internal-eval docs, env examples.

											
										
										
											2026-04-10 10:23:43 +08:00
+								        for fname, (lo, hi) in _MEMOIR_LEAF_SCORE_BOUNDS.items():
 								            try:
 								                raw = float(getattr(self, fname))
 								            except (TypeError, ValueError):
 								                raw = 0.0
 								            object.__setattr__(self, fname, max(lo, min(hi, raw)))
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
+								        authenticity = (
 								            self.mem_fidelity
 								            + self.mem_factual_accuracy
 								            + self.mem_factual_coverage
 								            + self.mem_traceability
 								        )
 								        information = (
 								            self.info_slot_coverage + self.info_sufficiency + self.info_density
 								        )
 								        narrative = self.narr_structure + self.narr_paragraphs + self.narr_pacing
 								        language = (
 								            self.lang_fluency
 								            + self.lang_conciseness
 								            + self.lang_literary
 								            + self.lang_controlled_expansion
 								            + self.lang_detail
 								            + self.lang_style
 								        )
 								        emotion = self.emo_authenticity + self.emo_depth
 								        character = (
 								            self.char_understanding + self.char_consistency + self.char_integration
 								        )
 								        coherence = self.coh_timeline + self.coh_cross_chapter
 								        richness = self.rich_analogy + self.rich_diversity
 								        publish = self.pub_editorial_cost + self.pub_completeness
 								        expected = (
 								            authenticity
 								            + information
 								            + narrative
 								            + language
 								            + emotion
 								            + character
 								            + coherence
 								            + richness
 								            + publish
 								        )
-												feat(evaluation): memoir readiness, judge/replay updates, eval web playground

Add memoir_readiness_service and router tests; extend judge schemas/services, replay_service, and conversation rubric; align story route agent, payload, prompts, and story_pipeline_sync; update agent logging, config, and DI. Document internal-eval; add replayDraft util and PlaygroundPage changes in app-eval-web.

											
										
										
											2026-04-08 09:38:07 +08:00
+								        synced = max(0.0, min(100.0, round(float(expected), 2)))
 								        object.__setattr__(self, "total_score", synced)
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
+								        object.__setattr__(self, "authenticity_score", authenticity)
 								        object.__setattr__(self, "information_score", information)
 								        object.__setattr__(self, "narrative_score", narrative)
 								        object.__setattr__(self, "language_score", language)
 								        object.__setattr__(self, "emotion_score", emotion)
 								        object.__setattr__(self, "character_score", character)
 								        object.__setattr__(self, "coherence_score", coherence)
 								        object.__setattr__(self, "richness_score", richness)
 								        object.__setattr__(self, "publish_ready_score", publish)
 								        return self