feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试
2026-04-07 10:34:59 +08:00
parent ea97427767
commit 5972b0e721
9 changed files with 616 additions and 235 deletions
--- a/api/app/features/evaluation/execution_service.py
+++ b/api/app/features/evaluation/execution_service.py
@@ -26,6 +26,8 @@ logger = get_logger(__name__)
 _MAX_JUDGE_MARKDOWN_CHARS = 20_000
 _MAX_EVAL_CHAPTERS = 30
 _MAX_EVAL_STORIES = 40
+_MAX_EVIDENCE_CONVERSATIONS = 8
+_MAX_EVIDENCE_TRANSCRIPT_CHARS = 16_000


 def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) -> str:
@@ -56,6 +58,50 @@ def _assistant_text_for_eval_display(raw: str) -> str:
    return (raw or "").replace("[SPLIT]", "\n")


+def _trim_evidence_text(text: str, max_chars: int = _MAX_EVIDENCE_TRANSCRIPT_CHARS) -> str:
+    """Strip *text* and cap it at *max_chars* characters of judge evidence.
+
+    Falsy input is treated as an empty string. When the stripped text is
+    longer than *max_chars*, only its first *max_chars* characters are kept
+    and a truncation marker is appended, so the returned string can be longer
+    than *max_chars* by the length of that marker.
+    """
+    cleaned = (text or "").strip()
+    if len(cleaned) > max_chars:
+        return cleaned[:max_chars] + "\n\n…（访谈证据已截断）"
+    return cleaned
+
+
+def _dialogue_transcript_from_pairs(pairs: list[tuple[str, str]]) -> str:
+    """Render ``(role, content)`` pairs as a labelled plain-text transcript.
+
+    Pairs whose content is empty after stripping are dropped. The role
+    ``"human"`` is labelled ``用户`` and its text is kept as is; every other
+    role is labelled ``AI`` and its text goes through
+    ``_assistant_text_for_eval_display``. Entries are joined by a blank line.
+    """
+    entries: list[str] = []
+    for role, content in pairs:
+        text = (content or "").strip()
+        if not text:
+            continue
+        if role == "human":
+            entries.append(f"用户: {text}")
+        else:
+            entries.append(f"AI: {_assistant_text_for_eval_display(text)}")
+    return "\n\n".join(entries)
+
+
+async def _conversation_transcript_for_eval(
+    db: AsyncSession, conversation_id: str
+) -> str:
+    """Load one conversation's messages and render them as a transcript.
+
+    Each message contributes a ``(role, content)`` pair with the role
+    lower-cased and both values coerced to ``str``; the pairs are formatted
+    by ``_dialogue_transcript_from_pairs``.
+    """
+    # Function-scope import (presumably to avoid an import cycle — confirm).
+    from app.features.conversation import repo as conversation_repo
+
+    messages = await conversation_repo.get_conversation_messages(conversation_id, db)
+    pairs: list[tuple[str, str]] = []
+    for message in messages:
+        pairs.append((str(message.role or "").lower(), str(message.content or "")))
+    return _dialogue_transcript_from_pairs(pairs)
+
+
+async def _user_transcript_evidence(db: AsyncSession, user_id: str) -> str:
+    """Aggregate a user's conversations into one trimmed evidence string.
+
+    Takes the first ``_MAX_EVIDENCE_CONVERSATIONS`` conversations returned by
+    the repo, walks them in reverse order, and emits one ``## 会话 <id>``
+    section per non-empty transcript. The joined text is passed through
+    ``_trim_evidence_text``. Returns ``""`` when the user has no conversations.
+    """
+    # Function-scope import (presumably to avoid an import cycle — confirm).
+    from app.features.conversation import repo as conversation_repo
+
+    conversations = await conversation_repo.get_user_conversations(user_id, db)
+    if not conversations:
+        return ""
+    # NOTE(review): reversing the head slice looks like "newest-first from the
+    # repo, oldest-first in the prompt" — confirm the repo's ordering.
+    selected = conversations[:_MAX_EVIDENCE_CONVERSATIONS]
+    sections: list[str] = []
+    for conv in reversed(selected):
+        conv_id = str(conv.id)
+        transcript = await _conversation_transcript_for_eval(db, conv_id)
+        if not transcript:
+            continue
+        sections.append(f"## 会话 {conv_id}\n{transcript}")
+    return _trim_evidence_text("\n\n".join(sections))
+
+
 async def execute_eval_run(
    db: AsyncSession,
    *,
@@ -150,7 +196,7 @@ async def execute_eval_run(
        rationale = tj.rationale if tj else None
        await eval_repo.add_turn(
            db,
-            run_id=run.id,
+            run_id=str(run.id),
            turn_index=idx,
            user_utterance=u,
            assistant_reply=replies[idx],
@@ -166,11 +212,36 @@ async def execute_eval_run(
    conv_total = conv_out.total_score if conv_out else None

    memoir_md = simple_memoir_from_transcript(utterances, replies)
-    mem_out = await judge.judge_memoir(memoir_markdown=memoir_md)
+    source_transcript = _trim_evidence_text(full_transcript)
+    reference_memoir = (case.reference_memoir_markdown or "").strip()
+    mem_out = await judge.judge_memoir(
+        memoir_markdown=memoir_md,
+        source_transcript=source_transcript,
+        reference_memoir_markdown=reference_memoir,
+        evidence_notes="严格按文档核对真实性、覆盖率、可追溯性；以原始访谈为主，参考基线仅作辅助。",
+    )

    chapter_entries: list[dict[str, Any]] = []
    story_entries: list[dict[str, Any]] = []
    uid = (case.source_user_id or "").strip()
+    source_conversation_id = (case.source_conversation_id or "").strip()
+    evidence_transcript = source_transcript
+    if source_conversation_id:
+        try:
+            conversation_evidence = await _conversation_transcript_for_eval(
+                db, source_conversation_id
+            )
+            if conversation_evidence:
+                evidence_transcript = _trim_evidence_text(conversation_evidence)
+        except Exception as e:
+            logger.warning("eval source conversation evidence skipped: {}", e)
+    elif uid:
+        try:
+            user_evidence = await _user_transcript_evidence(db, uid)
+            if user_evidence:
+                evidence_transcript = user_evidence
+        except Exception as e:
+            logger.warning("eval user transcript evidence skipped: {}", e)
    if uid:
        from app.features.memoir.repo import get_chapters_for_memoir_list
        from app.features.story.repo import get_stories_for_user
@@ -184,7 +255,14 @@ async def execute_eval_run(
                if not body:
                    continue
                md = f"# 章节：{ch.title}\n\n{_clip_md_for_judge(body)}"
-                cj = await judge.judge_memoir(memoir_markdown=md)
+                cj = await judge.judge_memoir(
+                    memoir_markdown=md,
+                    source_transcript=evidence_transcript,
+                    reference_memoir_markdown=reference_memoir,
+                    evidence_notes=(
+                        "这是用户现有章节的严格评审；真实性、覆盖率、可追溯性必须对照原始访谈证据。"
+                    ),
+                )
                chapter_entries.append(
                    {
                        "id": ch.id,
@@ -203,7 +281,14 @@ async def execute_eval_run(
                if not body:
                    continue
                md = f"# 故事：{st.title}\n\n{_clip_md_for_judge(body)}"
-                sj = await judge.judge_memoir(memoir_markdown=md)
+                sj = await judge.judge_memoir(
+                    memoir_markdown=md,
+                    source_transcript=evidence_transcript,
+                    reference_memoir_markdown=reference_memoir,
+                    evidence_notes=(
+                        "这是用户现有故事的严格评审；真实性、覆盖率、可追溯性必须对照原始访谈证据。"
+                    ),
+                )
                story_entries.append(
                    {
                        "id": st.id,
@@ -228,8 +313,12 @@ async def execute_eval_run(
            mem_parts.append(float(j["total_score"]))
    mem_total = sum(mem_parts) / len(mem_parts) if mem_parts else None

-    exp = await eval_repo.get_experiment(db, run.experiment_id)
-    weights = exp.composite_weights_json if exp else None
+    exp = await eval_repo.get_experiment(db, str(run.experiment_id))
+    weights = (
+        exp.composite_weights_json
+        if exp and isinstance(exp.composite_weights_json, dict)
+        else None
+    )
    comp = _composite(conv_total, mem_total, weights)

    bundle: dict[str, Any] = {
@@ -257,13 +346,13 @@ async def _finalize_experiment_gate(db: AsyncSession, experiment_id: str) -> Non
    exp = await eval_repo.get_experiment(db, experiment_id)
    if not exp:
        return
-    cases = await eval_repo.list_cases(db, exp.regression_set_id)
+    cases = await eval_repo.list_cases(db, str(exp.regression_set_id))

-    incomplete = [r for r in runs if r.status not in ("completed", "failed")]
+    incomplete = [r for r in runs if str(r.status) not in ("completed", "failed")]
    if incomplete:
        return

-    failed = [r for r in runs if r.status == "failed"]
+    failed = [r for r in runs if str(r.status) == "failed"]
    if failed:
        await eval_repo.update_experiment(
            db,
@@ -301,10 +390,10 @@ async def execute_experiment_full(experiment_id: str) -> None:
        await eval_repo.update_experiment(db, exp, status="running")
        await db.commit()

-        cases = await eval_repo.list_cases(db, exp.regression_set_id)
-        base_v = await eval_repo.get_version(db, exp.baseline_version_id)
-        cand_v = await eval_repo.get_version(db, exp.candidate_version_id)
-        if not base_v or not cand_v:
+        cases = await eval_repo.list_cases(db, str(exp.regression_set_id))
+        base_v = await eval_repo.get_version(db, str(exp.baseline_version_id))
+        cand_v = await eval_repo.get_version(db, str(exp.candidate_version_id))
+        if base_v is None or cand_v is None:
            await eval_repo.update_experiment(
                db,
                exp,
@@ -317,12 +406,12 @@ async def execute_experiment_full(experiment_id: str) -> None:

        for case in cases:
            for side, ver in ("baseline", base_v), ("candidate", cand_v):
-                run = await eval_repo.get_run(db, experiment_id, case.id, side)
+                run = await eval_repo.get_run(db, experiment_id, str(case.id), side)
                if not run:
                    run = await eval_repo.create_run(
                        db,
                        experiment_id=experiment_id,
-                        case_id=case.id,
+                        case_id=str(case.id),
                        side=side,
                    )
                await db.commit()
--- a/api/app/features/evaluation/judge_manual_service.py
+++ b/api/app/features/evaluation/judge_manual_service.py
@@ -10,6 +10,7 @@ from sqlalchemy.ext.asyncio import AsyncSession

 from app.core.dependencies import get_eval_judge_langchain_llm
 from app.core.logging import get_logger
+from app.features.conversation import repo as conversation_repo
 from app.features.evaluation.errors import (
    EvaluationBadRequestError,
    EvaluationNotFoundError,
@@ -27,6 +28,8 @@ logger = get_logger(__name__)
 _MAX_JUDGE_MARKDOWN_CHARS = 20_000
 _MAX_EVAL_CHAPTERS = 30
 _MAX_EVAL_STORIES = 40
+_MAX_EVIDENCE_CONVERSATIONS = 8
+_MAX_EVIDENCE_TRANSCRIPT_CHARS = 16_000


 def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) -> str:
@@ -48,6 +51,41 @@ def _transcript_from_export_turns(turns: list[tuple[str, str]]) -> str:
    return "\n\n".join(parts)


+def _trim_evidence_text(text: str, max_chars: int = _MAX_EVIDENCE_TRANSCRIPT_CHARS) -> str:
+    """Return stripped *text*, cut to *max_chars* characters plus a marker.
+
+    Falsy input yields ``""``. Text that fits within *max_chars* after
+    stripping is returned unchanged; longer text keeps its first *max_chars*
+    characters followed by a truncation marker (so the result may exceed
+    *max_chars* by the marker's length).
+    """
+    stripped = (text or "").strip()
+    fits = len(stripped) <= max_chars
+    return stripped if fits else f"{stripped[:max_chars]}\n\n…（访谈证据已截断）"
+
+
+async def _conversation_transcript_for_manual(
+    db: AsyncSession, conversation_id: str
+) -> str:
+    """Load one conversation's messages and render them as a transcript.
+
+    Messages with empty (after stripping) content are skipped. A message
+    whose lower-cased role is ``"human"`` is labelled ``用户`` and kept as is;
+    any other role is labelled ``AI`` and its text goes through
+    ``_assistant_text_for_eval_display``. Entries are joined by a blank line.
+    """
+    messages = await conversation_repo.get_conversation_messages(conversation_id, db)
+    rendered: list[str] = []
+    for message in messages:
+        role = (message.role or "").lower()
+        text = (message.content or "").strip()
+        if not text:
+            continue
+        if role == "human":
+            rendered.append(f"用户: {text}")
+        else:
+            rendered.append(f"AI: {_assistant_text_for_eval_display(text)}")
+    return "\n\n".join(rendered)
+
+
+async def _user_transcript_evidence(db: AsyncSession, user_id: str) -> str:
+    """Aggregate a user's conversations into one trimmed evidence string.
+
+    Takes the first ``_MAX_EVIDENCE_CONVERSATIONS`` conversations returned by
+    the repo, walks them in reverse order, and emits one ``## 会话 <id>``
+    section per non-empty transcript. The joined text is passed through
+    ``_trim_evidence_text``. Returns ``""`` when the user has no conversations.
+    """
+    conversations = await conversation_repo.get_user_conversations(user_id, db)
+    if not conversations:
+        return ""
+    # NOTE(review): reversing the head slice looks like "newest-first from the
+    # repo, oldest-first in the prompt" — confirm the repo's ordering.
+    selected = conversations[:_MAX_EVIDENCE_CONVERSATIONS]
+    sections: list[str] = []
+    for conv in reversed(selected):
+        conv_id = str(conv.id)
+        transcript = await _conversation_transcript_for_manual(db, conv_id)
+        if not transcript:
+            continue
+        sections.append(f"## 会话 {conv_id}\n{transcript}")
+    return _trim_evidence_text("\n\n".join(sections))
+
+
 def _normalize_title_key(title: str) -> str:
    t = (title or "").strip().lower()
    t = re.sub(r"^#+\s*", "", t)
@@ -271,6 +309,7 @@ class EvalJudgeManualService:
        judge_llm = get_eval_judge_langchain_llm()
        judge = EvalJudgeService(judge_llm)
        baselines = list(baseline_sections or [])
+        evidence_transcript = await _user_transcript_evidence(self._db, uid)

        chapter_results: list[dict[str, Any]] = []
        try:
@@ -281,7 +320,7 @@ class EvalJudgeManualService:
                body = (ch.canonical_markdown or "").strip()
                if not body:
                    continue
-                bl = _baseline_for_chapter_title(baselines, ch.title or "", i)
+                bl = _baseline_for_chapter_title(baselines, str(ch.title or ""), i)
                baseline_excerpt = ""
                if bl and (bl.body or "").strip():
                    baseline_excerpt = _clip_md_for_judge(bl.body, max_chars=6000)
@@ -289,7 +328,14 @@ class EvalJudgeManualService:
                if baseline_excerpt:
                    md += f"## 导出基线（节选）\n\n{baseline_excerpt}\n\n"
                md += f"## 当前成稿\n\n{_clip_md_for_judge(body)}"
-                cj = await judge.judge_memoir(memoir_markdown=md)
+                cj = await judge.judge_memoir(
+                    memoir_markdown=md,
+                    source_transcript=evidence_transcript,
+                    reference_memoir_markdown=baseline_excerpt,
+                    evidence_notes=(
+                        "严格按文档打分；真实性、事实覆盖率、可追溯性必须优先对照该用户历史访谈证据。"
+                    ),
+                )
                chapter_results.append(
                    {
                        "id": ch.id,
@@ -310,7 +356,13 @@ class EvalJudgeManualService:
                if not body:
                    continue
                md = f"# 故事：{st.title}\n\n{_clip_md_for_judge(body)}"
-                sj = await judge.judge_memoir(memoir_markdown=md)
+                sj = await judge.judge_memoir(
+                    memoir_markdown=md,
+                    source_transcript=evidence_transcript,
+                    evidence_notes=(
+                        "严格按文档打分；真实性、事实覆盖率、可追溯性必须优先对照该用户历史访谈证据。"
+                    ),
+                )
                story_results.append(
                    {
                        "id": st.id,
--- a/api/app/features/evaluation/judge_schemas.py
+++ b/api/app/features/evaluation/judge_schemas.py
@@ -1,43 +1,199 @@
-"""评审 LLM 结构化输出（json_object）。"""
+"""评审 LLM 结构化输出（json_object）。
+
+成稿（回忆录）子项上限已自洽为 **总分 100**（由原 110 分表等比例收紧整数档，见附件 rubric）。
+"""

 from __future__ import annotations

-from pydantic import BaseModel, Field
+from typing import Self
+
+from pydantic import BaseModel, Field, model_validator


 class TurnJudgeOutput(BaseModel):
-    """单轮对话质量（情绪 + 流畅度/重复抑制 + 总分）。"""
+    """Quality scores for a single turn or a whole conversation.
+
+    Emotion-enhanced 100-point rubric made of 16 leaf items grouped into five
+    categories (4 + 3 + 3 + 3 + 3); the leaf upper bounds add up to 100.
+    """
+
+    # 1. Emotional value and companionship (30)
+    emotion_carry: float = Field(ge=0, le=10, description="情绪承接能力")
+    empathy_depth: float = Field(ge=0, le=8, description="共情深度")
+    emotion_safety: float = Field(ge=0, le=6, description="情绪安全感")
+    emotion_guidance: float = Field(ge=0, le=6, description="情绪引导能力")
+
+    # 2. Information gathering (25)
+    fact_mining: float = Field(ge=0, le=8, description="关键事实挖掘")
+    info_completeness_guide: float = Field(ge=0, le=8, description="信息完整性引导")
+    info_depth_mining: float = Field(ge=0, le=9, description="信息深度挖掘")
+
+    # 3. Persona modelling (15)
+    persona_understanding: float = Field(ge=0, le=7, description="人物理解")
+    persona_consistency_verify: float = Field(ge=0, le=4, description="人物一致性验证")
+    persona_expression_guide: float = Field(ge=0, le=4, description="人物表达引导")
+
+    # 4. Structured guidance (15)
+    interview_structure: float = Field(ge=0, le=6, description="访谈结构")
+    context_memory: float = Field(ge=0, le=5, description="上下文记忆")
+    rhythm_control: float = Field(ge=0, le=4, description="节奏控制")
+
+    # 5. Question quality (15)
+    question_quality: float = Field(ge=0, le=7, description="问题质量")
+    follow_up_depth: float = Field(ge=0, le=5, description="追问能力")
+    non_leading: float = Field(ge=0, le=3, description="非引导性")

     total_score: float = Field(ge=0, le=100)
+    rationale: str = ""
+
+    # Top-level aggregate scores kept in line with the historical JSON shape;
+    # they are derived from the leaf items, so the model may omit them.
     emotion_score: float = Field(default=0, ge=0, le=30)
-    information_score: float = Field(default=0, ge=0, le=20)
-    structure_score: float = Field(default=0, ge=0, le=10)
-    question_score: float = Field(default=0, ge=0, le=10)
-    persona_score: float = Field(default=0, ge=0, le=10)
-    repetition_score: float = Field(default=0, ge=0, le=10)
-    naturalness_score: float = Field(default=0, ge=0, le=10)
-    rationale: str = ""
+    information_score: float = Field(default=0, ge=0, le=25)
+    persona_score: float = Field(default=0, ge=0, le=15)
+    structure_score: float = Field(default=0, ge=0, le=15)
+    question_score: float = Field(default=0, ge=0, le=15)
+
+    @model_validator(mode="after")
+    def _sync_aggregates_and_total(self) -> Self:
+        """Check ``total_score`` against the leaf scores and recompute aggregates.
+
+        Sums the leaf fields into five category subtotals, rejects the model
+        when their grand total differs from ``total_score`` by more than 0.51,
+        then overwrites the five ``*_score`` aggregate fields with the
+        recomputed subtotals.
+
+        Raises:
+            ValueError: if ``total_score`` and the leaf total disagree beyond
+                the tolerance.
+        """
+        emotion = (
+            self.emotion_carry
+            + self.empathy_depth
+            + self.emotion_safety
+            + self.emotion_guidance
+        )
+        information = (
+            self.fact_mining + self.info_completeness_guide + self.info_depth_mining
+        )
+        persona = (
+            self.persona_understanding
+            + self.persona_consistency_verify
+            + self.persona_expression_guide
+        )
+        structure = self.interview_structure + self.context_memory + self.rhythm_control
+        question = self.question_quality + self.follow_up_depth + self.non_leading
+        expected = emotion + information + persona + structure + question
+        # The declared total must match the leaf sum within a 0.51 tolerance.
+        if abs(expected - self.total_score) > 0.51:
+            raise ValueError(
+                f"total_score ({self.total_score}) 与细项合计 ({expected:.2f}) 不一致"
+            )
+        # Any aggregates supplied in the input are replaced by the recomputed
+        # subtotals. object.__setattr__ bypasses the model's own __setattr__
+        # (presumably to skip assignment validation — confirm).
+        object.__setattr__(self, "emotion_score", emotion)
+        object.__setattr__(self, "information_score", information)
+        object.__setattr__(self, "persona_score", persona)
+        object.__setattr__(self, "structure_score", structure)
+        object.__setattr__(self, "question_score", question)
+        return self


-class ConversationJudgeOutput(BaseModel):
-    """整条对话 transcript 的综合分。"""
-
-    total_score: float = Field(ge=0, le=100)
-    dimension_scores: dict[str, float] = Field(default_factory=dict)
-    rationale: str = ""
+# 整条 transcript 与单轮使用同一套细项
+ConversationJudgeOutput = TurnJudgeOutput


 class MemoirJudgeOutput(BaseModel):
-    """成稿回忆录评分。"""
+    """Scores for a finished memoir (100 points total; item caps follow the rubric).
+
+    27 leaf items are grouped into nine categories whose subtotal caps add up
+    to 100.
+    """
+
+    # 1. Authenticity and coverage (subtotal max 23; tightened from the original 25)
+    mem_fidelity: float = Field(ge=0, le=9, description="记忆忠实度")
+    mem_factual_accuracy: float = Field(ge=0, le=5, description="事实准确性")
+    mem_factual_coverage: float = Field(ge=0, le=5, description="事实覆盖率")
+    mem_traceability: float = Field(ge=0, le=4, description="记忆可追溯性")
+
+    # 2. Information quality (subtotal max 14; tightened from the original 15)
+    info_slot_coverage: float = Field(ge=0, le=6, description="槽位覆盖度")
+    info_sufficiency: float = Field(ge=0, le=4, description="信息充分性")
+    info_density: float = Field(ge=0, le=4, description="信息密度")
+
+    # 3. Narrative structure (subtotal max 14; tightened from the original 15)
+    narr_structure: float = Field(ge=0, le=6, description="故事结构")
+    narr_paragraphs: float = Field(ge=0, le=5, description="段落组织")
+    narr_pacing: float = Field(ge=0, le=3, description="节奏控制")
+
+    # 4. Language and prose (subtotal max 18; the original 20 and the six item caps were tightened together)
+    lang_fluency: float = Field(ge=0, le=3, description="语言流畅度")
+    lang_conciseness: float = Field(ge=0, le=3, description="表达精炼度")
+    lang_literary: float = Field(ge=0, le=4, description="文笔质量")
+    lang_controlled_expansion: float = Field(ge=0, le=4, description="控制性扩写能力")
+    lang_detail: float = Field(ge=0, le=2, description="细节还原与强化")
+    lang_style: float = Field(ge=0, le=2, description="风格一致性")
+
+    # 5. Emotional expression (subtotal max 9; tightened from the original 10)
+    emo_authenticity: float = Field(ge=0, le=5, description="情感真实度")
+    emo_depth: float = Field(ge=0, le=4, description="情感深度")
+
+    # 6. Character modelling (subtotal max 9; tightened from the original 10)
+    char_understanding: float = Field(ge=0, le=4, description="人物理解")
+    char_consistency: float = Field(ge=0, le=3, description="人物一致性")
+    char_integration: float = Field(ge=0, le=2, description="人物融入度")
+
+    # 7. Coherence (subtotal max 4; tightened from the original 5)
+    coh_timeline: float = Field(ge=0, le=2, description="时间线一致性")
+    coh_cross_chapter: float = Field(ge=0, le=2, description="跨章节关联")
+
+    # 8. Expressive richness (subtotal max 5)
+    rich_analogy: float = Field(ge=0, le=3, description="类比与引用")
+    rich_diversity: float = Field(ge=0, le=2, description="表达多样性")
+
+    # 9. Publication readiness (subtotal max 4; tightened from the original 5)
+    pub_editorial_cost: float = Field(ge=0, le=2, description="编辑成本")
+    pub_completeness: float = Field(ge=0, le=2, description="完整度")

     total_score: float = Field(ge=0, le=100)
-    authenticity_score: float = Field(default=0, ge=0, le=25)
-    information_score: float = Field(default=0, ge=0, le=15)
-    narrative_score: float = Field(default=0, ge=0, le=15)
-    language_score: float = Field(default=0, ge=0, le=20)
-    emotion_score: float = Field(default=0, ge=0, le=10)
-    character_score: float = Field(default=0, ge=0, le=10)
-    coherence_score: float = Field(default=0, ge=0, le=5)
-    richness_score: float = Field(default=0, ge=0, le=5)
-    publish_ready_score: float = Field(default=0, ge=0, le=5)
     rationale: str = ""
+
+    # Category aggregates; the validator below overwrites them with the
+    # subtotals recomputed from the leaf items.
+    authenticity_score: float = Field(default=0, ge=0, le=23)
+    information_score: float = Field(default=0, ge=0, le=14)
+    narrative_score: float = Field(default=0, ge=0, le=14)
+    language_score: float = Field(default=0, ge=0, le=18)
+    emotion_score: float = Field(default=0, ge=0, le=9)
+    character_score: float = Field(default=0, ge=0, le=9)
+    coherence_score: float = Field(default=0, ge=0, le=4)
+    richness_score: float = Field(default=0, ge=0, le=5)
+    publish_ready_score: float = Field(default=0, ge=0, le=4)
+
+    @model_validator(mode="after")
+    def _sync_aggregates_and_total(self) -> Self:
+        """Check ``total_score`` against the leaf scores and recompute aggregates.
+
+        Sums the leaf fields into nine category subtotals, rejects the model
+        when their grand total differs from ``total_score`` by more than 0.51,
+        then overwrites the nine ``*_score`` aggregate fields with the
+        recomputed subtotals.
+
+        Raises:
+            ValueError: if ``total_score`` and the leaf total disagree beyond
+                the tolerance.
+        """
+        authenticity = (
+            self.mem_fidelity
+            + self.mem_factual_accuracy
+            + self.mem_factual_coverage
+            + self.mem_traceability
+        )
+        information = (
+            self.info_slot_coverage + self.info_sufficiency + self.info_density
+        )
+        narrative = self.narr_structure + self.narr_paragraphs + self.narr_pacing
+        language = (
+            self.lang_fluency
+            + self.lang_conciseness
+            + self.lang_literary
+            + self.lang_controlled_expansion
+            + self.lang_detail
+            + self.lang_style
+        )
+        emotion = self.emo_authenticity + self.emo_depth
+        character = (
+            self.char_understanding + self.char_consistency + self.char_integration
+        )
+        coherence = self.coh_timeline + self.coh_cross_chapter
+        richness = self.rich_analogy + self.rich_diversity
+        publish = self.pub_editorial_cost + self.pub_completeness
+        expected = (
+            authenticity
+            + information
+            + narrative
+            + language
+            + emotion
+            + character
+            + coherence
+            + richness
+            + publish
+        )
+        # The declared total must match the leaf sum within a 0.51 tolerance.
+        if abs(expected - self.total_score) > 0.51:
+            raise ValueError(
+                f"total_score ({self.total_score}) 与分项合计 ({expected:.2f}) 不一致"
+            )
+        # Any aggregates supplied in the input are replaced by the recomputed
+        # subtotals. object.__setattr__ bypasses the model's own __setattr__
+        # (presumably to skip assignment validation — confirm).
+        object.__setattr__(self, "authenticity_score", authenticity)
+        object.__setattr__(self, "information_score", information)
+        object.__setattr__(self, "narrative_score", narrative)
+        object.__setattr__(self, "language_score", language)
+        object.__setattr__(self, "emotion_score", emotion)
+        object.__setattr__(self, "character_score", character)
+        object.__setattr__(self, "coherence_score", coherence)
+        object.__setattr__(self, "richness_score", richness)
+        object.__setattr__(self, "publish_ready_score", publish)
+        return self
--- a/api/app/features/evaluation/judge_service.py
+++ b/api/app/features/evaluation/judge_service.py
@@ -26,6 +26,37 @@ _CONV_MAX = 8192
 _CONV_JUDGE_JSON_MAX = 2048
 _MEMOIR_MAX = 12000
 _COMPARE_STREAM_MAX = 6144
+_MEMOIR_EVIDENCE_MAX = 12000
+
+
+def _build_memoir_judge_prompt(
+    *,
+    memoir_markdown: str,
+    source_transcript: str = "",
+    reference_memoir_markdown: str = "",
+    evidence_notes: str = "",
+) -> str:
+    """Assemble the evidence-aware prompt for memoir judging.
+
+    Sections are emitted in this order, separated by newlines: the rubric
+    instructions, optional reviewer notes, the source evidence (or a fallback
+    telling the judge to score the authenticity items conservatively), an
+    optional reference memoir, and finally the memoir under review.
+
+    Args:
+        memoir_markdown: Memoir text to judge; clipped to ``_MEMOIR_MAX``
+            characters. A falsy value is treated as an empty string.
+        source_transcript: Interview evidence; stripped and clipped to
+            ``_MEMOIR_EVIDENCE_MAX`` characters.
+        reference_memoir_markdown: Optional baseline memoir; stripped and
+            clipped to ``_MEMOIR_EVIDENCE_MAX`` characters.
+        evidence_notes: Optional reviewer notes; stripped and clipped to
+            1200 characters.
+
+    Returns:
+        The assembled prompt string.
+    """
+    # Guard the memoir text the same way as the other text inputs so a falsy
+    # value does not fail on slicing.
+    memoir = memoir_markdown or ""
+    source = (source_transcript or "").strip()
+    reference = (reference_memoir_markdown or "").strip()
+    notes = (evidence_notes or "").strip()
+
+    sections = [MEMOIR_JUDGE_INSTRUCTIONS, ""]
+    if notes:
+        sections.extend(["【评审说明】", notes[:1200], ""])
+
+    # NOTE(review): the visible callers pre-trim evidence to 16_000 characters
+    # and append a truncation marker; clipping again at _MEMOIR_EVIDENCE_MAX
+    # (12000) drops that marker for long transcripts — confirm whether the two
+    # limits should be aligned.
+    if source:
+        evidence_body = source[:_MEMOIR_EVIDENCE_MAX]
+    else:
+        evidence_body = (
+            "无可用原始访谈证据。对于记忆忠实度、事实准确性、事实覆盖率、记忆可追溯性，必须保守打分，不得凭空高分。"
+        )
+    sections.extend(["【原始访谈/证据】", evidence_body, ""])
+
+    if reference:
+        sections.extend(["【参考基线/导出成稿】", reference[:_MEMOIR_EVIDENCE_MAX], ""])
+    sections.extend(["【当前回忆录正文】", memoir[:_MEMOIR_MAX]])
+    return "\n".join(sections)


 class EvalJudgeService:
@@ -124,7 +155,7 @@ class EvalJudgeService:
 {r_json}

 请依次撰写：
-1) 两段对话在整体体验上的主要差异（共情、追问、重复感、自然度等）；
+1) 两段对话在整体体验上的主要差异（情绪承接、信息挖掘、人物建模、访谈结构、提问质量、上下文与重复盘问等）；
 2) B 相对 A 的优点与不足；
 3) 若 B 在关键维度明显弱于 A，给出可操作的改进方向（系统提示、访谈策略、模型或温度等）。

@@ -154,14 +185,22 @@ class EvalJudgeService:
            logger.warning("conversation compare stream failed: {}", e)
            yield f"\n\n[流式输出中断：{e}]"

-    async def judge_memoir(self, *, memoir_markdown: str) -> MemoirJudgeOutput | None:
+    async def judge_memoir(
+        self,
+        *,
+        memoir_markdown: str,
+        source_transcript: str = "",
+        reference_memoir_markdown: str = "",
+        evidence_notes: str = "",
+    ) -> MemoirJudgeOutput | None:
        if not self._llm:
            return None
-        prompt = f"""{MEMOIR_JUDGE_INSTRUCTIONS}
-
-【回忆录正文】
-{memoir_markdown[:_MEMOIR_MAX]}
-"""
+        prompt = _build_memoir_judge_prompt(
+            memoir_markdown=memoir_markdown,
+            source_transcript=source_transcript,
+            reference_memoir_markdown=reference_memoir_markdown,
+            evidence_notes=evidence_notes,
+        )
        try:
            return await allm_json_call(
                self._llm,
--- a/api/app/features/evaluation/rubrics/conversation_v1.py
+++ b/api/app/features/evaluation/rubrics/conversation_v1.py
@@ -1,31 +1,60 @@
-"""对话评审 rubric 文本（v1）。"""
+"""对话评审 rubric 文本（v1 · 访谈「情绪强化版」100 分）。"""

-TURN_JUDGE_INSTRUCTIONS = """你是「岁月留书」访谈对话质量评审。根据下面维度给本轮 AI 回复打分（0-100 为 total_score，各子分上限已注明，子分之和应与 total_score 大体一致）。
+# Leaf-item rubric text shared by the per-turn and whole-conversation judge
+# prompts: 16 leaf items in five categories, with upper bounds adding up to 100.
+_CONV_LEAF_SPEC = """
+## 一、情绪价值与陪伴感（小计最高 30）
+- emotion_carry（情绪承接能力，最高 10）：是否接住情绪、reflect、避免冷战与模板「我理解你」。
+- empathy_depth（共情深度，最高 8）：情绪类型是否准、语境贴合、非空洞安慰。
+- emotion_safety（情绪安全感，最高 6）：非评判、尊重、敏感话题语气、可跳过。
+- emotion_guidance（情绪引导能力，最高 6）：引导感受、关键节点追问情绪、表达更具体。

-维度（参考）：
- 情绪承接与共情（emotion_score，最高 30）
- 信息获取与追问（information_score，最高 20）
- 结构化访谈推进（structure_score，最高 10）
- 提问质量（question_score，最高 10）
- 人物理解与一致性（persona_score，最高 10）
- 重复抑制（repetition_score，最高 10）：是否重复了上 1～2 轮已问过的问题或同一资料槽；高度重复则低分
- 自然流畅（naturalness_score，最高 10）：是否像朋友聊天；有无不必要采访腔、总结腔、流程感
+## 二、信息获取能力（小计最高 25）
+- fact_mining（关键事实挖掘，最高 8）
+- info_completeness_guide（信息完整性引导，最高 8）
+- info_depth_mining（信息深度挖掘，最高 9）：为何、动机与影响。

-输出 JSON：**json** 字段名如下：
-total_score, emotion_score, information_score, structure_score, question_score, persona_score, repetition_score, naturalness_score, rationale
+## 三、人物建模能力（小计最高 15）
+- persona_understanding（人物理解，最高 7）
+- persona_consistency_verify（人物一致性验证，最高 4）：矛盾澄清。
+- persona_expression_guide（人物表达引导，最高 4）：「你是谁」层面。

-只输出 JSON。"""
+## 四、结构化引导（小计最高 15）
+- interview_structure（访谈结构，最高 6）：阶段与逻辑。
+- context_memory（上下文记忆，最高 5）：关联前文；**重复盘问、同一槽位反复**在此项扣分。
+- rhythm_control（节奏控制，最高 4）：自然；**采访腔、总结腔、流程感**在此项与情绪项综合体现。
+
+## 五、提问质量（小计最高 15）
+- question_quality（问题质量，最高 7）：开放、具体。
+- follow_up_depth（追问能力，最高 5）
+- non_leading（非引导性，最高 3）
+
+输出 JSON 字段（仅这些键；务必含 rationale）：
+emotion_carry, empathy_depth, emotion_safety, emotion_guidance,
+fact_mining, info_completeness_guide, info_depth_mining,
+persona_understanding, persona_consistency_verify, persona_expression_guide,
+interview_structure, context_memory, rhythm_control,
+question_quality, follow_up_depth, non_leading,
+total_score, rationale
+
+`total_score` 必须等于上述 16 个细项之和（满分 100）。
+聚合分 emotion_score、information_score、persona_score、structure_score、question_score 可不填（服务端会重算）。
+只输出 JSON。
+"""


-CONV_JUDGE_INSTRUCTIONS = """你是访谈整段对话评审。给定完整 transcript（用户与 AI 多轮），打一个综合 total_score（0-100）。
+TURN_JUDGE_INSTRUCTIONS = (
+    "你是「岁月留书」访谈对话质量评审，按下列 **情绪强化版** rubric 为本轮 AI 回复打分。\n"
+    + _CONV_LEAF_SPEC
+)

-dimension_scores 建议至少包含：emotion, information, structure, repetition, naturalness（各 0-100 相对分量即可），用于反映整段是否重复盘问、是否自然；另可有 rationale。

-只输出 JSON：total_score, dimension_scores, rationale。"""
+# Whole-conversation judge prompt: a short preamble followed by the shared
+# leaf-item rubric, which lists 16 leaf items.
+CONV_JUDGE_INSTRUCTIONS = (
+    "你是访谈整段对话评审。给定完整 transcript（用户与 AI 多轮），按与单轮**相同**的 16 项细项与满分上限，"
+    "对**整段对话表现**打一次分；`total_score` 为细项之和（100）。\n" + _CONV_LEAF_SPEC
+)


 COMPARE_CONV_STREAM_HINT = """你是访谈对话评测专家。下面给出一份「回放/新测」完整对话 transcript 及其整体评分（JSON）。请用中文直接写正文（不要用 JSON）：
-1) 对这段对话的整体评价与风险点；
+1) 对这段对话的整体评价与风险点（对照情绪承接、信息挖掘、人物、结构、提问质量等）；
 2) 可操作的改进建议（提示词、流程、模型参数等）。

 笔调简洁、可执行。"""
--- a/api/app/features/evaluation/rubrics/memoir_v1.py
+++ b/api/app/features/evaluation/rubrics/memoir_v1.py
@@ -1,11 +1,73 @@
-"""回忆录成稿评审 rubric 文本（v1）。"""
+"""回忆录成稿评审 rubric 文本（v1 · 子项上限合计 100 分制）。

-MEMOIR_JUDGE_INSTRUCTIONS = """你是「岁月留书」回忆录成稿评审。根据真实性与覆盖、信息质量、叙事结构、语言文笔、情感、人物、连贯性、表达丰富度、出版就绪等，给出分项分（上限与 total_score 满分 100 一致）。
+说明：原产品表九个大类上限之和为 110；本 rubric 将各细项上限整档收紧，使九类小计之和为 100，
+便于与 `total_score` 直接一致，无需再折算。
+"""

-输出 JSON 字段：
-total_score,
-authenticity_score, information_score, narrative_score, language_score,
-emotion_score, character_score, coherence_score, richness_score, publish_ready_score,
-rationale
+_MEMOIR_RUBRIC_BODY = """
+你必须按下列一级维度与子项及其**满分上限**打分；**全部细项得分之和须等于 `total_score`，且满分合计为 100**。

+## 一、真实性与覆盖（小计最高 23）
+1. mem_fidelity（记忆忠实度，最高 9）：hallucination、夸大/弱化/改写、因果关系、未证实推测、AI 补全编造。
+2. mem_factual_accuracy（事实准确性，最高 5）：时间、人物关系、顺序、内部矛盾、数值细节。
+3. mem_factual_coverage（事实覆盖率，最高 5）：关键/高情感事件、关键人物与细节是否遗漏。
+4. mem_traceability（记忆可追溯性，最高 4）：与原始对话映射、来源模糊、证据与语义保持。
+
+## 二、信息质量（小计最高 14）
+5. info_slot_coverage（槽位覆盖度，最高 6）
+6. info_sufficiency（信息充分性，最高 4）
+7. info_density（信息密度，最高 4）
+
+## 三、叙事结构（小计最高 14）
+8. narr_structure（故事结构，最高 6）
+9. narr_paragraphs（段落组织，最高 5）
+10. narr_pacing（节奏控制，最高 3）
+
+## 四、语言与文笔（小计最高 18）
+11. lang_fluency（语言流畅度，最高 3）
+12. lang_conciseness（表达精炼度，最高 3）
+13. lang_literary（文笔质量，最高 4）
+14. lang_controlled_expansion（控制性扩写，最高 4）
+15. lang_detail（细节还原与强化，最高 2）
+16. lang_style（风格一致性，最高 2）
+
+## 五、情感表达（小计最高 9）
+17. emo_authenticity（情感真实度，最高 5）
+18. emo_depth（情感深度，最高 4）
+
+## 六、人物建模（小计最高 9）
+19. char_understanding（人物理解，最高 4）
+20. char_consistency（人物一致性，最高 3）
+21. char_integration（人物融入度，最高 2）
+
+## 七、连贯性（小计最高 4）
+22. coh_timeline（时间线一致性，最高 2）
+23. coh_cross_chapter（跨章节关联，最高 2）
+
+## 八、表达丰富度（小计最高 5）
+24. rich_analogy（类比与引用，最高 3）
+25. rich_diversity（表达多样性，最高 2）
+
+## 九、出版就绪度（小计最高 4）
+26. pub_editorial_cost（编辑成本，最高 2）
+27. pub_completeness（完整度，最高 2）
+
+输出 JSON 字段（仅这些键；分值浮点；务必含 rationale 中文简述）：
+mem_fidelity, mem_factual_accuracy, mem_factual_coverage, mem_traceability,
+info_slot_coverage, info_sufficiency, info_density,
+narr_structure, narr_paragraphs, narr_pacing,
+lang_fluency, lang_conciseness, lang_literary, lang_controlled_expansion, lang_detail, lang_style,
+emo_authenticity, emo_depth,
+char_understanding, char_consistency, char_integration,
+coh_timeline, coh_cross_chapter,
+rich_analogy, rich_diversity,
+pub_editorial_cost, pub_completeness,
+total_score, rationale
+
+一级聚合分 authenticity_score、information_score、narrative_score、language_score、emotion_score、character_score、coherence_score、richness_score、publish_ready_score 可不填（服务端会按细项重算）。
 只输出 JSON。"""
+
+MEMOIR_JUDGE_INSTRUCTIONS = (
+    "你是「岁月留书」回忆录成稿评审，必须严格按照下列 rubric 打分。\n"
+    + _MEMOIR_RUBRIC_BODY
+)