# life-echo/api/app/features/evaluation/judge_manual_service.py

"""手动触发评测台评审（智谱 / DeepSeek；不写 eval_runs；Playground 对话评分写入 conversations 表）。"""

from __future__ import annotations

import asyncio
import copy
import re
from collections.abc import AsyncIterator
from datetime import datetime, timezone
from typing import Any

from sqlalchemy.ext.asyncio import AsyncSession

from app.core.config import settings
from app.core.dependencies import (
    EvalJudgeProvider,
    build_eval_judge_llm_spec,
)
from app.core.logging import get_logger
from app.features.conversation import repo as conversation_repo
from app.features.evaluation.conversation_compare_summary import (
    build_conversation_compare_summary,
)
from app.features.evaluation.errors import (
    EvaluationBadRequestError,
    EvaluationNotFoundError,
)
from app.features.evaluation.eval_trace_service import EvalTraceService
from app.features.evaluation.judge_schemas import ConversationJudgeOutput
from app.features.evaluation.judge_service import (
    EvalJudgeService,
    eval_judge_compare_bundle_caps,
    eval_judge_conversation_transcript_max_chars_for_context,
)
from app.features.evaluation.memoir_compare_summary import (
    build_memoir_compare_summary,
)
from app.features.evaluation.schemas import MemoirSectionBaselineOut
from app.features.evaluation.session_catalog_service import SessionCatalogService
from app.features.evaluation.transcript_for_judge import (
    assistant_text_for_eval_display,
    format_eval_turn_block,
    format_export_turns_with_labels,
    format_session_messages_with_turn_labels,
    pair_session_messages_to_turns,
)
from app.features.evaluation.user_export_fixtures import read_user_export_fixture
from app.features.memoir.repo import get_chapters_for_memoir_list
from app.features.story.repo import get_stories_for_user

# Module-level logger obtained from the project's logging helper.
logger = get_logger(__name__)

# Maximum number of chapters taken from the chapter list when judging a memoir.
_MAX_EVAL_CHAPTERS = 30
_MAX_EVAL_STORIES = 40  # memoir_snapshot and similar paths are still capped
# Character budget for the prior-turn transcript passed to per-turn judging;
# only the trailing part is kept when the accumulated text is longer.
_PRIOR_TRANSCRIPT_MAX_CHARS = 8000

# Message reported to the caller when no judge LLM could be built for the
# requested provider (the text lists the API-key settings each provider needs).
_JUDGE_CONFIG_HINT = (
    "评审未配置：智谱需 eval_judge_api_key 或 zhipu_api_key；"
    "DeepSeek 需 deepseek_api_key（或 llm_api_key）"
)


def _make_eval_judge(
    judge_provider: EvalJudgeProvider,
    judge_model: str | None,
) -> tuple[EvalJudgeService | None, str]:
    """Build the judge service for a provider/model pair.

    Returns ``(service, resolved_model)``.  When no spec can be built, or the
    spec carries no LLM, the result is ``(None, "")`` so the caller can report
    a configuration problem.
    """
    llm_spec = build_eval_judge_llm_spec(judge_provider, judge_model)
    if llm_spec and llm_spec.llm:
        service = EvalJudgeService(
            llm_spec.llm,
            context_window_tokens=llm_spec.context_window_tokens,
        )
        return service, llm_spec.resolved_model
    return None, ""


def _strip_baseline_judge_errors(errs: list[Any]) -> list[str]:
    """Return the errors as strings, minus blanks and baseline-judge failures.

    ``None`` entries and whitespace-only entries are dropped.  An entry counts
    as a baseline-judge failure when it contains the baseline failure message,
    starts with ``baseline_glm5:`` or contains ``baseline_glm5_failed:``.
    Surviving entries keep their original (unstripped) text and order.
    """

    def _is_baseline_failure(text: str) -> bool:
        return (
            "基准整体打分失败" in text
            or text.startswith("baseline_glm5:")
            or "baseline_glm5_failed:" in text
        )

    as_text = ("" if item is None else str(item) for item in errs)
    return [
        text for text in as_text if text.strip() and not _is_baseline_failure(text)
    ]


async def _iter_turn_judgments_for_turns(
    judge: EvalJudgeService,
    turns: list[tuple[str, str]],
    *,
    sse_event: str,
) -> AsyncIterator[dict[str, Any]]:
    """Judge each turn in order and yield one event dict per turn.

    Each turn is judged against the formatted blocks of the turns before it;
    that context is limited to its last ``_PRIOR_TRANSCRIPT_MAX_CHARS``
    characters.  Every yielded dict carries ``event`` (the given ``sse_event``),
    ``turn_index``, ``ok`` and ``judge`` (the dumped judgment or ``None``).
    """
    history: list[str] = []
    for turn_no, (raw_user, raw_assistant) in enumerate(turns):
        user_text = (raw_user or "").strip()
        assistant_text = assistant_text_for_eval_display(str(raw_assistant))
        # Slicing from the end keeps the whole text when it is already short enough.
        context = "\n\n".join(history)[-_PRIOR_TRANSCRIPT_MAX_CHARS:]
        verdict = await judge.judge_turn(
            prior_transcript=context,
            user_utterance=user_text,
            assistant_reply=assistant_text,
            turn_index_0=turn_no,
        )
        event: dict[str, Any] = {"event": sse_event, "turn_index": turn_no}
        event["ok"] = verdict is not None
        event["judge"] = verdict.model_dump() if verdict else None
        yield event
        # The current turn becomes context only for the turns after it.
        history.append(format_eval_turn_block(turn_no, user_text, assistant_text))


def _clip_md_for_judge(text: str, max_chars: int | None = None) -> str:
    """Strip ``text`` and cut it to a character limit for the judge prompt.

    The limit is ``max_chars`` when given; otherwise it comes from
    ``settings.eval_judge_memoir_body_max_chars`` with a floor of 1000.
    Text over the limit is cut and a truncation marker is appended.
    """
    if max_chars is None:
        limit = max(1000, int(settings.eval_judge_memoir_body_max_chars))
    else:
        limit = max_chars
    body = (text or "").strip()
    if len(body) > limit:
        return body[:limit] + "\n\n…（已截断供评审）"
    return body


async def _conversation_transcript_for_manual(
    db: AsyncSession, conversation_id: str
) -> str:
    """Load a conversation's messages and format them with turn labels."""
    return format_session_messages_with_turn_labels(
        await conversation_repo.get_conversation_messages(conversation_id, db)
    )


def _normalize_title_key(title: str) -> str:
    """Reduce a title to a comparison key.

    The title is trimmed and lower-cased, leading markdown ``#`` markers (and
    the whitespace after them) are removed, and whitespace runs become a single
    space.  A falsy title yields an empty string.
    """
    lowered = (title or "").strip().lower()
    without_heading_marks = re.sub(r"^#+\s*", "", lowered)
    return re.sub(r"\s+", " ", without_heading_marks)


def _baseline_for_chapter_title(
    baselines: list[MemoirSectionBaselineOut],
    chapter_title: str,
    index: int,
) -> MemoirSectionBaselineOut | None:
    """Pick the baseline section that corresponds to a chapter.

    The first baseline whose normalized title equals the chapter's normalized
    title wins.  Without a title match, the baseline at position ``index`` is
    used when that position exists.  Otherwise (or when there are no baselines)
    ``None`` is returned.
    """
    if not baselines:
        return None

    wanted = _normalize_title_key(chapter_title)
    by_title = next(
        (
            candidate
            for candidate in baselines
            if _normalize_title_key(candidate.title) == wanted
        ),
        None,
    )
    if by_title is not None:
        return by_title

    # Positional fallback for chapters whose title has no counterpart.
    in_range = 0 <= index < len(baselines)
    return baselines[index] if in_range else None


class EvalJudgeManualService:
    def __init__(self, db: AsyncSession) -> None:
        """Store the async DB session the service methods load from and persist to."""
        self._db = db

    async def _persist_playground_conversation_judge(
        self, conversation_id: str, bundle: dict[str, Any]
    ) -> None:
        try:
            row = await conversation_repo.set_playground_conversation_judge_json(
                conversation_id, self._db, bundle
            )
            if row is not None:
                await self._db.commit()
        except Exception:
            logger.exception(
                "persist playground_conversation_judge_json failed conversation_id={}",
                conversation_id,
            )

    async def judge_conversation(
        self,
        conversation_id: str,
        fixture_filename: str | None,
        *,
        judge_provider: EvalJudgeProvider = "zhipu",
        judge_model: str | None = None,
    ) -> dict[str, Any]:
        cid = (conversation_id or "").strip()
        if not cid:
            raise EvaluationBadRequestError("conversation_id is required")

        catalog = SessionCatalogService(self._db)
        dialogue = await catalog.get_session_dialogue(cid)
        if not dialogue:
            raise EvaluationNotFoundError("conversation not found")

        replay_transcript = format_session_messages_with_turn_labels(
            list(dialogue.messages)
        )
        if not replay_transcript.strip():
            raise EvaluationBadRequestError("no messages to judge")

        fn = (fixture_filename or "").strip() or None
        baseline_transcript = ""
        if fn:
            try:
                turns, _ = read_user_export_fixture(fn)
                baseline_transcript = format_export_turns_with_labels(turns)
            except ValueError as e:
                raise EvaluationBadRequestError(str(e)) from e
            except FileNotFoundError as e:
                raise EvaluationNotFoundError("fixture not found") from e

        errors: list[str] = []
        judge, resolved_model = _make_eval_judge(judge_provider, judge_model)
        if not judge:
            raise EvaluationBadRequestError(_JUDGE_CONFIG_HINT)
        baseline_judge_dict: dict[str, Any] | None = None
        if baseline_transcript.strip():
            baseline_result = await judge.judge_conversation_result(
                full_transcript=baseline_transcript
            )
            bj = baseline_result.output
            if bj:
                baseline_judge_dict = bj.model_dump()
            else:
                errors.append(
                    f"baseline_glm5_failed: {baseline_result.error or 'unknown error'}"
                )
        elif fn:
            errors.append("baseline_transcript_empty")

        replay_result = await judge.judge_conversation_result(
            full_transcript=replay_transcript
        )
        rj = replay_result.output
        replay_judge_dict = rj.model_dump() if rj else None
        if not rj:
            errors.append(
                f"replay_glm5_failed: {replay_result.error or 'unknown error'}"
            )

        _cmp_total, _cmp_per_side = eval_judge_compare_bundle_caps(judge._ctx_tokens)
        bundle: dict[str, Any] = {
            "version": 1,
            "judged_at": datetime.now(timezone.utc).isoformat(),
            "fixture_filename": fn,
            "baseline_judge": baseline_judge_dict,
            "replay_judge": replay_judge_dict,
            "baseline_turn_judges": {},
            "replay_turn_judges": {},
            "compare_summary": build_conversation_compare_summary(
                baseline_judge=bj,
                replay_judge=rj,
                baseline_transcript=baseline_transcript,
                replay_transcript=replay_transcript,
                conv_cap=eval_judge_conversation_transcript_max_chars_for_context(
                    judge._ctx_tokens
                ),
                compare_cap_total=_cmp_total,
                compare_per_side_cap=_cmp_per_side,
                fixture_filename=fn,
            ),
            "compare_markdown": "",
            "errors": list(errors),
            "warnings": [],
            "options": {
                "include_turn_judges": False,
                "include_baseline_turn_judges": False,
                "judge_provider": judge_provider,
                "judge_model": resolved_model,
            },
        }
        await self._persist_playground_conversation_judge(cid, bundle)

        return {
            "conversation_id": cid,
            "fixture_filename": fn,
            "baseline_transcript": baseline_transcript,
            "replay_transcript": replay_transcript,
            "baseline_judge": baseline_judge_dict,
            "replay_judge": replay_judge_dict,
            "compare_summary": bundle.get("compare_summary"),
            "errors": errors,
        }

    async def iter_conversation_judge_sse(
        self,
        conversation_id: str,
        fixture_filename: str | None,
        *,
        include_turn_judges: bool = False,
        include_baseline_turn_judges: bool = False,
        judge_provider: EvalJudgeProvider = "zhipu",
        judge_model: str | None = None,
    ) -> AsyncIterator[dict[str, Any]]:
        """Yield SSE event dicts for a streamed conversation judging run.

        Order of work: overall baseline judgment (when a fixture with text is
        given), optional per-turn baseline judgments, overall replay judgment
        plus comparison summary, optional per-turn replay judgments, then the
        streamed comparison text.  Problems are reported as ``error`` events
        rather than raised.

        Validation, loading, fixture and configuration failures end the stream
        before anything is persisted.  Once the judge has been built, whatever
        was accumulated is persisted from the ``finally`` block, including when
        a judging step failed or the stream ended early.

        Args:
            conversation_id: Conversation to judge; surrounding whitespace is ignored.
            fixture_filename: Optional export fixture used as baseline.
            include_turn_judges: Also judge every replay turn individually.
            include_baseline_turn_judges: Also judge every baseline turn
                individually (only after the overall baseline judgment succeeded).
            judge_provider: Provider used to build the judge LLM.
            judge_model: Optional model override for the provider.

        Yields:
            Dicts with an ``event`` key: ``error``, ``meta``, ``warning``,
            ``baseline_judge``, ``baseline_turn_judge``, ``replay_judge``,
            ``compare_summary``, ``replay_turn_judge``, ``compare_delta``, ``done``.
        """
        # Accumulator for everything that ends up in the persisted bundle.
        acc: dict[str, Any] = {
            "version": 1,
            "fixture_filename": None,
            "baseline_judge": None,
            "replay_judge": None,
            "baseline_turn_judges": {},
            "replay_turn_judges": {},
            "compare_summary": None,
            "compare_markdown": "",
            "errors": [],
            "warnings": [],
            "options": {
                "include_turn_judges": include_turn_judges,
                "include_baseline_turn_judges": include_baseline_turn_judges,
                "judge_provider": judge_provider,
                "judge_model": "",
            },
        }
        cid = (conversation_id or "").strip()
        if not cid:
            yield {
                "event": "error",
                "phase": "validate",
                "message": "conversation_id is required",
            }
            return

        catalog = SessionCatalogService(self._db)
        dialogue = await catalog.get_session_dialogue(cid)
        if not dialogue:
            yield {
                "event": "error",
                "phase": "load",
                "message": "conversation not found",
            }
            return

        replay_transcript = format_session_messages_with_turn_labels(
            list(dialogue.messages)
        )
        if not replay_transcript.strip():
            yield {"event": "error", "phase": "load", "message": "no messages to judge"}
            return

        # A blank fixture name means "no baseline"; the run then judges replay only.
        fn = (fixture_filename or "").strip() or None
        baseline_transcript = ""
        export_turns: list[tuple[str, str]] | None = None
        if fn:
            try:
                turns, _ = read_user_export_fixture(fn)
                export_turns = list(turns)
                baseline_transcript = format_export_turns_with_labels(turns)
            except ValueError as e:
                yield {"event": "error", "phase": "fixture", "message": str(e)}
                return
            except FileNotFoundError:
                yield {
                    "event": "error",
                    "phase": "fixture",
                    "message": "fixture not found",
                }
                return

        judge, resolved_model = _make_eval_judge(judge_provider, judge_model)
        if not judge:
            yield {
                "event": "error",
                "phase": "config",
                "message": _JUDGE_CONFIG_HINT,
            }
            return

        acc["options"]["judge_model"] = resolved_model
        acc["fixture_filename"] = fn
        _sse_cmp_total, _sse_cmp_per = eval_judge_compare_bundle_caps(judge._ctx_tokens)
        # Never reassigned below, so the finally block always persists.
        persist = True
        try:
            yield {
                "event": "meta",
                "conversation_id": cid,
                "fixture_filename": fn,
                "judge_provider": judge_provider,
                "judge_model": resolved_model,
            }

            if not baseline_transcript.strip():
                wmsg = "未提供基准 MD 或基准无文本：仅对回放对话打分并输出单侧改进建议"
                acc["warnings"].append(wmsg)
                yield {"event": "warning", "message": wmsg}

            baseline_judge = None
            if baseline_transcript.strip():
                baseline_result = await judge.judge_conversation_result(
                    full_transcript=baseline_transcript
                )
                baseline_judge = baseline_result.output
                acc["baseline_judge"] = (
                    baseline_judge.model_dump() if baseline_judge else None
                )
                yield {
                    "event": "baseline_judge",
                    "ok": baseline_judge is not None,
                    "judge": acc["baseline_judge"],
                }
                if not baseline_judge:
                    # A failed baseline is reported but does not stop the run;
                    # the replay side is still judged below.
                    err = (
                        f"基准整体打分失败：{baseline_result.error}"
                        if baseline_result.error
                        else "基准整体打分失败（密钥、限流或 JSON 解析失败，见服务端日志）"
                    )
                    acc["errors"].append(err)
                    yield {
                        "event": "error",
                        "phase": "baseline_glm5",
                        "message": err,
                    }
                elif (
                    include_baseline_turn_judges
                    and export_turns
                    and baseline_judge is not None
                ):
                    yield {"event": "meta", "phase": "baseline_turn_judges_start"}
                    async for row in _iter_turn_judgments_for_turns(
                        judge,
                        export_turns,
                        sse_event="baseline_turn_judge",
                    ):
                        if row.get("event") == "baseline_turn_judge":
                            idx = row.get("turn_index")
                            if isinstance(idx, (int, float)):
                                # Stored under a string key of the turn index.
                                acc["baseline_turn_judges"][str(int(idx))] = row.get(
                                    "judge"
                                )
                        yield row
            else:
                # No baseline text: emit a skipped baseline event for the client.
                acc["baseline_judge"] = None
                yield {
                    "event": "baseline_judge",
                    "ok": False,
                    "skipped": True,
                    "judge": None,
                }

            replay_result = await judge.judge_conversation_result(
                full_transcript=replay_transcript
            )
            replay_judge = replay_result.output
            acc["replay_judge"] = replay_judge.model_dump() if replay_judge else None
            # The summary is built before the replay result is checked, so it is
            # emitted and persisted even when replay judging failed.
            acc["compare_summary"] = build_conversation_compare_summary(
                baseline_judge=baseline_judge,
                replay_judge=replay_judge,
                baseline_transcript=baseline_transcript,
                replay_transcript=replay_transcript,
                conv_cap=eval_judge_conversation_transcript_max_chars_for_context(
                    judge._ctx_tokens
                ),
                compare_cap_total=_sse_cmp_total,
                compare_per_side_cap=_sse_cmp_per,
                fixture_filename=fn,
            )
            yield {
                "event": "replay_judge",
                "ok": replay_judge is not None,
                "judge": acc["replay_judge"],
            }
            yield {"event": "compare_summary", "summary": acc["compare_summary"]}
            if not replay_judge:
                # Without a replay judgment the run ends here: no per-turn
                # judging and no streamed comparison.
                err = (
                    f"回放对话整体打分失败：{replay_result.error}"
                    if replay_result.error
                    else "回放对话整体打分失败（限流或 JSON 解析失败，见服务端日志）"
                )
                acc["errors"].append(err)
                yield {
                    "event": "error",
                    "phase": "replay_glm5",
                    "message": err,
                }
                yield {"event": "done"}
                return

            if include_turn_judges:
                replay_pairs = pair_session_messages_to_turns(list(dialogue.messages))
                if replay_pairs:
                    yield {"event": "meta", "phase": "replay_turn_judges_start"}
                    async for row in _iter_turn_judgments_for_turns(
                        judge,
                        replay_pairs,
                        sse_event="replay_turn_judge",
                    ):
                        if row.get("event") == "replay_turn_judge":
                            idx = row.get("turn_index")
                            if isinstance(idx, (int, float)):
                                acc["replay_turn_judges"][str(int(idx))] = row.get(
                                    "judge"
                                )
                        yield row

            # Stream the comparison text piece by piece while accumulating the
            # full markdown for persistence.
            async for piece in judge.stream_conversation_compare(
                baseline_transcript=baseline_transcript,
                replay_transcript=replay_transcript,
                baseline_judge=baseline_judge,
                replay_judge=replay_judge,
            ):
                if piece:
                    acc["compare_markdown"] += piece
                    yield {"event": "compare_delta", "text": piece}

            yield {"event": "done"}
        finally:
            # Runs on normal completion, on the early return above and when the
            # generator is closed or an exception propagates.
            if persist:
                acc["judged_at"] = datetime.now(timezone.utc).isoformat()
                await self._persist_playground_conversation_judge(cid, acc)

    async def retry_baseline_conversation_judge(
        self,
        conversation_id: str,
        fixture_filename: str | None,
        *,
        include_baseline_turn_judges: bool = False,
        judge_provider: EvalJudgeProvider = "zhipu",
        judge_model: str | None = None,
    ) -> dict[str, Any]:
        """Retry only the baseline (export fixture) judgment for a conversation.

        Re-runs the overall baseline judgment, and optionally the per-turn
        baseline judgments, then rebuilds the comparison summary and comparison
        text against the replay judgment already stored for the conversation.
        The updated bundle is persisted only when the baseline judgment succeeds.

        Args:
            conversation_id: Conversation whose stored bundle is updated.
            fixture_filename: Export fixture to use as baseline (required).
            include_baseline_turn_judges: Also re-judge each baseline turn.
            judge_provider: Provider used to build the judge LLM.
            judge_model: Optional model override for the provider.

        Returns:
            A dict with ``ok``, ``error``, ``message``, ``baseline_judge``,
            ``replay_judge``, ``compare_summary``, ``compare_markdown``,
            ``baseline_turn_judges`` and ``errors``.  ``ok`` is ``False`` when
            the baseline judgment produced no output.

        Raises:
            EvaluationBadRequestError: Blank conversation id, no messages, no or
                invalid fixture, empty baseline transcript, no stored bundle,
                missing or invalid stored replay judgment, or no judge configured.
            EvaluationNotFoundError: Unknown conversation or missing fixture file.
        """
        cid = (conversation_id or "").strip()
        if not cid:
            raise EvaluationBadRequestError("conversation_id is required")

        catalog = SessionCatalogService(self._db)
        dialogue = await catalog.get_session_dialogue(cid)
        if not dialogue:
            raise EvaluationNotFoundError("conversation not found")

        replay_transcript = format_session_messages_with_turn_labels(
            list(dialogue.messages)
        )
        if not replay_transcript.strip():
            raise EvaluationBadRequestError("no messages to judge")

        fn = (fixture_filename or "").strip() or None
        if not fn:
            raise EvaluationBadRequestError(
                "请选择基线 MD（fixture_filename）后再重试基准分"
            )

        try:
            turns, _ = read_user_export_fixture(fn)
            export_turns = list(turns)
            baseline_transcript = format_export_turns_with_labels(turns)
        except ValueError as e:
            raise EvaluationBadRequestError(str(e)) from e
        except FileNotFoundError:
            raise EvaluationNotFoundError("fixture not found") from None

        if not baseline_transcript.strip():
            raise EvaluationBadRequestError("baseline transcript is empty")

        # The retry builds on a previously persisted bundle; without one (or
        # without its replay judgment) there is nothing to compare against.
        prev = await catalog.get_playground_conversation_judge_json(cid)
        if not prev or not isinstance(prev, dict):
            raise EvaluationBadRequestError(
                "服务端没有已保存的评分草稿：请先跑一次「自动评分（流式）」"
                "直到回放侧打分完成，再使用本重试。"
            )
        raw_replay = prev.get("replay_judge")
        if not raw_replay or not isinstance(raw_replay, dict):
            raise EvaluationBadRequestError(
                "已保存结果中缺少回放侧整体分：请先完成流式评分中的回放打分阶段再重试基准。"
            )

        # Rebuild the typed replay judgment from its stored dict form.
        try:
            replay_model = ConversationJudgeOutput.model_validate(raw_replay)
        except Exception as e:
            raise EvaluationBadRequestError(
                "已保存的回放评分格式无效，请重新跑一次完整流式评分。"
            ) from e

        judge, resolved_model = _make_eval_judge(judge_provider, judge_model)
        if not judge:
            raise EvaluationBadRequestError(_JUDGE_CONFIG_HINT)
        _rt_cmp_total, _rt_cmp_per = eval_judge_compare_bundle_caps(judge._ctx_tokens)
        baseline_result = await judge.judge_conversation_result(
            full_transcript=baseline_transcript
        )
        if not baseline_result.output:
            # Failure path: report without persisting, so the stored bundle
            # stays as it was.  Old baseline errors are replaced by the new one.
            err = baseline_result.error or "unknown error"
            msg = f"基准整体打分失败：{err}"
            errs = _strip_baseline_judge_errors(list(prev.get("errors") or []))
            errs.append(msg)
            return {
                "ok": False,
                "error": err,
                "message": msg,
                "baseline_judge": None,
                "replay_judge": raw_replay,
                "compare_summary": build_conversation_compare_summary(
                    baseline_judge=None,
                    replay_judge=replay_model,
                    baseline_transcript=baseline_transcript,
                    replay_transcript=replay_transcript,
                    conv_cap=eval_judge_conversation_transcript_max_chars_for_context(
                        judge._ctx_tokens
                    ),
                    compare_cap_total=_rt_cmp_total,
                    compare_per_side_cap=_rt_cmp_per,
                    fixture_filename=fn,
                ),
                "compare_markdown": "",
                "baseline_turn_judges": {},
                "errors": errs,
            }

        baseline_judge = baseline_result.output
        # Work on a deep copy so the loaded bundle is not mutated in place.
        acc: dict[str, Any] = copy.deepcopy(prev)
        acc.setdefault("version", 1)
        acc["baseline_judge"] = baseline_judge.model_dump()
        acc["fixture_filename"] = fn
        # The baseline succeeded now, so earlier baseline failures are dropped.
        acc["errors"] = _strip_baseline_judge_errors(list(acc.get("errors") or []))
        opts = acc.setdefault("options", {})
        if isinstance(opts, dict):
            opts["judge_provider"] = judge_provider
            opts["judge_model"] = resolved_model

        if include_baseline_turn_judges and export_turns:
            # Replace previous per-turn baseline results; only turns that
            # produced a judgment are recorded.
            acc["baseline_turn_judges"] = {}
            async for row in _iter_turn_judgments_for_turns(
                judge,
                export_turns,
                sse_event="baseline_turn_judge",
            ):
                idx = row.get("turn_index")
                if isinstance(idx, (int, float)) and row.get("judge") is not None:
                    acc["baseline_turn_judges"][str(int(idx))] = row["judge"]

        # Regenerate the comparison from scratch against the stored replay judgment.
        acc["compare_markdown"] = ""
        acc["compare_summary"] = build_conversation_compare_summary(
            baseline_judge=baseline_judge,
            replay_judge=replay_model,
            baseline_transcript=baseline_transcript,
            replay_transcript=replay_transcript,
            conv_cap=eval_judge_conversation_transcript_max_chars_for_context(
                judge._ctx_tokens
            ),
            compare_cap_total=_rt_cmp_total,
            compare_per_side_cap=_rt_cmp_per,
            fixture_filename=fn,
        )
        async for piece in judge.stream_conversation_compare(
            baseline_transcript=baseline_transcript,
            replay_transcript=replay_transcript,
            baseline_judge=baseline_judge,
            replay_judge=replay_model,
        ):
            if piece:
                acc["compare_markdown"] += piece

        acc["judged_at"] = datetime.now(timezone.utc).isoformat()
        await self._persist_playground_conversation_judge(cid, acc)

        return {
            "ok": True,
            "error": None,
            "message": None,
            "baseline_judge": acc["baseline_judge"],
            "replay_judge": acc.get("replay_judge"),
            "compare_summary": acc.get("compare_summary"),
            "compare_markdown": acc.get("compare_markdown") or "",
            "baseline_turn_judges": acc.get("baseline_turn_judges") or {},
            "errors": acc["errors"],
        }

    async def judge_memoir_for_user(
        self,
        user_id: str,
        baseline_sections: list[MemoirSectionBaselineOut] | None,
        *,
        judge_provider: EvalJudgeProvider = "zhipu",
        judge_model: str | None = None,
    ) -> dict[str, Any]:
        """Judge a user's memoir chapters and return per-chapter results.

        Chapters are loaded for the user; at most ``_MAX_EVAL_CHAPTERS`` are
        considered and chapters without ``canonical_markdown`` text are skipped.
        Evidence bundles are prepared one chapter at a time, then the LLM
        judgments run concurrently under a semaphore.  Per-chapter failures are
        collected into ``errors`` instead of aborting the whole call.

        Args:
            user_id: User whose chapters are judged; surrounding whitespace is ignored.
            baseline_sections: Optional baseline sections, matched to chapters
                by title and then by position.
            judge_provider: Provider used to build the judge LLM.
            judge_model: Optional model override for the provider.

        Returns:
            A dict with ``user_id``, ``judge_provider``, ``judge_model``,
            ``chapter_results``, ``story_results`` (always empty here),
            ``errors`` and ``warnings``.

        Raises:
            EvaluationBadRequestError: Blank user id or no judge configured.
        """
        uid = (user_id or "").strip()
        if not uid:
            raise EvaluationBadRequestError("user_id is required")

        judge, resolved_model = _make_eval_judge(judge_provider, judge_model)
        if not judge:
            raise EvaluationBadRequestError(_JUDGE_CONFIG_HINT)
        baselines = list(baseline_sections or [])
        trace_svc = EvalTraceService(self._db)

        def _chapter_evidence_notes(
            lineage_tier: str,
            evidence_summary: str,
            truncated: bool,
            dropped: list[str],
        ) -> str:
            """Build the evidence note string passed to the judge for one chapter."""
            # At most the first 12 dropped section names are listed.
            drops = ",".join(dropped[:12]) if dropped else ""
            return (
                "严格按文档打分；真实性、事实覆盖率、可追溯性以本章节绑定的证据闭包为准。"
                f" lineage_tier={lineage_tier}；evidence_summary={evidence_summary}；"
                f" prompt_truncated={truncated}；dropped_sections={drops or 'none'}"
            )

        chapter_results: list[dict[str, Any]] = []
        errors: list[str] = []
        try:
            chapters = await get_chapters_for_memoir_list(
                uid, self._db, active_only=True, is_new_only=None
            )
        except Exception as e:
            # A failed chapter load is reported through `errors`; the call then
            # continues with no chapters.
            logger.exception("manual memoir: chapter list failed user_id={}", uid)
            errors.append(f"加载章节列表失败：{e}")
            chapters = []

        def _nonempty_chapters(cols: list[Any]) -> int:
            """Count the items whose canonical_markdown has non-blank text."""
            return sum(
                1 for x in cols if (getattr(x, "canonical_markdown", None) or "").strip()
            )

        # Concurrency for the LLM calls, clamped to the range 1..32.
        conc = max(1, min(32, int(settings.eval_judge_memoir_chapter_concurrency)))
        logger.info(
            "event=eval_memoir_judge_start user_id={} judge_provider={} judge_model={} "
            "chapters_total={} chapters_nonempty={} chapter_concurrency={}",
            uid,
            judge_provider,
            resolved_model or "",
            len(chapters),
            _nonempty_chapters(chapters[:_MAX_EVAL_CHAPTERS]),
            conc,
        )

        # Phase 1 (sequential): build the judge inputs for every chapter.
        prepared: list[dict[str, Any]] = []
        # Running index over successfully prepared chapters; used as a sort tie-breaker.
        enum_idx = 0
        for i, ch in enumerate(chapters[:_MAX_EVAL_CHAPTERS]):
            body = (ch.canonical_markdown or "").strip()
            if not body:
                continue
            # `i` is the position in the capped chapter list, skipped chapters included.
            bl = _baseline_for_chapter_title(baselines, str(ch.title or ""), i)
            baseline_excerpt = ""
            if bl and (bl.body or "").strip():
                baseline_excerpt = _clip_md_for_judge(
                    bl.body,
                    max_chars=max(
                        1000, int(settings.eval_judge_memoir_evidence_max_chars)
                    ),
                )
            md = f"# 章节：{ch.title}\n\n{_clip_md_for_judge(body)}"
            try:
                cb = await trace_svc.build_chapter_bundle(uid, ch)
                formatted, cb2 = await trace_svc.format_chapter_bundle(cb)
                fm = formatted.format_meta
                prepared.append(
                    {
                        "enum_idx": enum_idx,
                        "ch": ch,
                        "bl": bl,
                        "md": md,
                        "baseline_excerpt": baseline_excerpt,
                        "formatted": formatted,
                        "cb2": cb2,
                        "fm": fm,
                    }
                )
                enum_idx += 1
            except Exception as e:
                # One chapter failing to prepare does not stop the others.
                logger.exception(
                    "manual memoir: chapter prepare failed user_id={} chapter_id={}",
                    uid,
                    ch.id,
                )
                label = str(ch.title or ch.id)
                errors.append(f"章节「{label}」证据打包失败：{e}")

        sem = asyncio.Semaphore(conc)

        async def _judge_one(payload: dict[str, Any]) -> dict[str, Any]:
            """Judge one prepared chapter; return its row (or None) plus errors."""
            async with sem:
                ch = payload["ch"]
                formatted = payload["formatted"]
                cb2 = payload["cb2"]
                fm = payload["fm"]
                baseline_excerpt = payload["baseline_excerpt"]
                md = payload["md"]
                bl = payload["bl"]
                ch_label = str(ch.title or ch.id)
                row_errs: list[str] = []
                try:
                    cj_res = await judge.judge_memoir_result(
                        memoir_markdown=md,
                        source_transcript=formatted.source_transcript,
                        structured_evidence=formatted.structured_evidence,
                        reference_memoir_markdown=baseline_excerpt,
                        evidence_notes=_chapter_evidence_notes(
                            cb2.lineage_tier,
                            formatted.evidence_summary,
                            fm.truncated,
                            fm.dropped_sections,
                        ),
                    )
                except Exception as e:
                    # The judge call raised: no row for this chapter, only an error.
                    logger.exception(
                        "manual memoir: chapter judge failed user_id={} chapter_id={}",
                        uid,
                        ch.id,
                    )
                    return {
                        "enum_idx": payload["enum_idx"],
                        "order_index": ch.order_index,
                        "row": None,
                        "errors": [
                            *row_errs,
                            f"章节「{ch_label}」评审失败：{e}",
                        ],
                    }
                cj = cj_res.output
                row: dict[str, Any] = {
                    "id": ch.id,
                    "title": ch.title,
                    "order_index": ch.order_index,
                    "baseline_title": bl.title if bl else None,
                    "lineage_tier": cb2.lineage_tier,
                    "evidence_summary": formatted.evidence_summary,
                    "evidence_trace": cb2.model_dump(),
                    "format_meta": fm.model_dump(),
                    "judge": cj.model_dump() if cj else None,
                }
                # A reported error or an empty output still yields a row, with
                # `judge_error` set and a message added to the error list.
                if cj_res.error:
                    row["judge_error"] = cj_res.error
                    row_errs.append(f"章节「{ch_label}」LLM 评审失败：{cj_res.error}")
                    logger.info(
                        "event=eval_memoir_chapter_judge_failed user_id={} chapter_id={} msg={}",
                        uid,
                        ch.id,
                        cj_res.error,
                    )
                elif not cj:
                    row["judge_error"] = "empty_output"
                    row_errs.append(f"章节「{ch_label}」评审返回空结果")
                    logger.info(
                        "event=eval_memoir_chapter_judge_empty user_id={} chapter_id={}",
                        uid,
                        ch.id,
                    )
                return {
                    "enum_idx": payload["enum_idx"],
                    "order_index": ch.order_index,
                    "row": row,
                    "errors": row_errs,
                }

        # Phase 2 (concurrent): judge all prepared chapters, bounded by `sem`.
        judged = await asyncio.gather(*[_judge_one(p) for p in prepared])
        # Order by chapter order_index (chapters without one go last), then by
        # preparation order.
        judged.sort(
            key=lambda r: (
                r["order_index"]
                if r["order_index"] is not None
                else 10**9,
                r["enum_idx"],
            )
        )
        for r in judged:
            errors.extend(r["errors"])
            if r["row"] is not None:
                chapter_results.append(r["row"])

        # No stories are judged in this method; the key is returned empty.
        story_results: list[dict[str, Any]] = []

        warnings: list[str] = []
        # Warn only when nothing was judged and nothing failed either.
        if not chapter_results and not errors:
            warnings.append(
                "未发现可评分的回忆录章节。请确认该用户存在 active 章节且 "
                "canonical_markdown 非空；需要与导出对照时请加载带章节的 user export 作为基线。"
            )

        logger.info(
            "event=eval_memoir_judge_done user_id={} chapter_rows={} story_rows={} "
            "errors={} warnings={}",
            uid,
            len(chapter_results),
            0,
            len(errors),
            len(warnings),
        )

        return {
            "user_id": uid,
            "judge_provider": judge_provider,
            "judge_model": resolved_model or "",
            "chapter_results": chapter_results,
            "story_results": story_results,
            "errors": errors,
            "warnings": warnings,
        }

    async def iter_memoir_chapter_judge_sse(
        self,
        user_id: str,
        baseline_sections: list[MemoirSectionBaselineOut] | None,
        *,
        judge_provider: EvalJudgeProvider = "zhipu",
        judge_model: str | None = None,
    ) -> AsyncIterator[dict[str, Any]]:
        """Stream memoir chapter judge results as SSE-style event dicts.

        Chapters are judged concurrently and their results are yielded in
        completion order (not chapter order).

        Emitted events, by ``event`` key:

        * ``error`` -- a precondition failed (``phase`` is ``validate``,
          ``config`` or ``load``); the stream ends right after it.
        * ``meta`` -- user id, judge provider/model and the number of chapters
          returned by the chapter list query.
        * ``chapter_error`` -- evidence packing or judging failed for a chapter.
        * ``warning`` -- no chapter with a non-empty body could be prepared.
        * ``chapters_prepared`` -- how many chapters will be judged.
        * ``chapter_judge`` -- one judged chapter row (``ok`` is False when the
          judge produced no output).
        * ``done`` -- always the last event of a stream that got past loading.

        Args:
            user_id: Owner of the chapters; surrounding whitespace is stripped.
            baseline_sections: Optional baseline sections matched to chapters
                via ``_baseline_for_chapter_title``.
            judge_provider: Judge backend passed to ``_make_eval_judge``.
            judge_model: Optional model override passed to ``_make_eval_judge``.

        Yields:
            Event dicts as described above.
        """
        uid = (user_id or "").strip()
        if not uid:
            yield {"event": "error", "phase": "validate", "message": "user_id is required"}
            return

        judge, resolved_model = _make_eval_judge(judge_provider, judge_model)
        if not judge:
            yield {"event": "error", "phase": "config", "message": _JUDGE_CONFIG_HINT}
            return

        baselines = list(baseline_sections or [])
        trace_svc = EvalTraceService(self._db)

        def _chapter_evidence_notes(
            lineage_tier: str, evidence_summary: str, truncated: bool, dropped: list[str]
        ) -> str:
            """Build the evidence note handed to the judge alongside the chapter."""
            # Only the first 12 dropped section names are listed.
            drops = ",".join(dropped[:12]) if dropped else ""
            return (
                "严格按文档打分；真实性、事实覆盖率、可追溯性以本章节绑定的证据闭包为准。"
                f" lineage_tier={lineage_tier}；evidence_summary={evidence_summary}；"
                f" prompt_truncated={truncated}；dropped_sections={drops or 'none'}"
            )

        try:
            chapters = await get_chapters_for_memoir_list(
                uid, self._db, active_only=True, is_new_only=None
            )
        except Exception as e:
            logger.exception("manual memoir stream: chapter list failed user_id={}", uid)
            yield {"event": "error", "phase": "load", "message": f"加载章节列表失败：{e}"}
            return

        yield {
            "event": "meta",
            "user_id": uid,
            "judge_provider": judge_provider,
            "judge_model": resolved_model or "",
            "total_chapters": len(chapters),
        }

        # Phase 1 (sequential): build the evidence bundle for each chapter.
        prepared: list[dict[str, Any]] = []
        for i, ch in enumerate(chapters[:_MAX_EVAL_CHAPTERS]):
            body = (ch.canonical_markdown or "").strip()
            if not body:
                # Chapters without content cannot be scored.
                continue
            bl = _baseline_for_chapter_title(baselines, str(ch.title or ""), i)
            baseline_excerpt = ""
            if bl and (bl.body or "").strip():
                baseline_excerpt = _clip_md_for_judge(
                    bl.body,
                    max_chars=max(
                        1000, int(settings.eval_judge_memoir_evidence_max_chars)
                    ),
                )
            md = f"# 章节：{ch.title}\n\n{_clip_md_for_judge(body)}"
            try:
                cb = await trace_svc.build_chapter_bundle(uid, ch)
                formatted, cb2 = await trace_svc.format_chapter_bundle(cb)
                fm = formatted.format_meta
                prepared.append({
                    "ch": ch, "bl": bl, "md": md,
                    "baseline_excerpt": baseline_excerpt,
                    "formatted": formatted, "cb2": cb2, "fm": fm,
                })
            except Exception as e:
                logger.exception(
                    "manual memoir stream: chapter prepare failed user_id={} chapter_id={}",
                    uid, ch.id,
                )
                yield {
                    "event": "chapter_error",
                    "chapter_id": ch.id,
                    "title": ch.title,
                    "message": f"证据打包失败：{e}",
                }

        if not prepared:
            yield {
                "event": "warning",
                "message": "未发现可评分的回忆录章节（成稿为空或无 active 章节）。",
            }

        yield {"event": "chapters_prepared", "count": len(prepared)}

        # Phase 2 (concurrent): judge the prepared chapters, bounded by a semaphore.
        conc = max(1, min(32, int(settings.eval_judge_memoir_chapter_concurrency)))
        sem = asyncio.Semaphore(conc)
        result_queue: asyncio.Queue[dict[str, Any]] = asyncio.Queue()

        async def _judge_one(idx: int, payload: dict[str, Any]) -> dict[str, Any]:
            """Judge one prepared chapter and return its ``chapter_judge`` event."""
            ch = payload["ch"]
            formatted = payload["formatted"]
            cb2 = payload["cb2"]
            fm = payload["fm"]
            baseline_excerpt = payload["baseline_excerpt"]
            md = payload["md"]
            bl = payload["bl"]
            ev_notes = _chapter_evidence_notes(
                cb2.lineage_tier,
                formatted.evidence_summary,
                fm.truncated,
                fm.dropped_sections,
            )

            # Optional baseline scoring; a failure here is recorded on the row
            # but does not prevent the chapter itself from being judged.
            baseline_judge_obj = None
            baseline_judge_dict = None
            baseline_error: str | None = None
            if baseline_excerpt:
                try:
                    bl_md = f"# 章节：{bl.title if bl else ch.title}\n\n{baseline_excerpt}"
                    bl_res = await judge.judge_memoir_result(
                        memoir_markdown=bl_md,
                        source_transcript=formatted.source_transcript,
                        structured_evidence=formatted.structured_evidence,
                        evidence_notes=ev_notes,
                    )
                    baseline_judge_obj = bl_res.output
                    baseline_judge_dict = (
                        baseline_judge_obj.model_dump()
                        if baseline_judge_obj
                        else None
                    )
                    if bl_res.error:
                        baseline_error = bl_res.error
                except Exception as exc:
                    logger.warning(
                        "memoir stream: baseline judge failed ch={} err={}",
                        ch.id, exc,
                    )
                    baseline_error = str(exc)

            # Exceptions from here on are turned into a ``chapter_error`` event
            # by ``_run_one``.
            cj_res = await judge.judge_memoir_result(
                memoir_markdown=md,
                source_transcript=formatted.source_transcript,
                structured_evidence=formatted.structured_evidence,
                reference_memoir_markdown=baseline_excerpt,
                evidence_notes=ev_notes,
            )
            cj = cj_res.output
            compare_summary = build_memoir_compare_summary(
                baseline_judge=baseline_judge_obj,
                chapter_judge=cj,
            )
            row: dict[str, Any] = {
                "id": ch.id,
                "title": ch.title,
                "order_index": ch.order_index,
                "baseline_title": bl.title if bl else None,
                "lineage_tier": cb2.lineage_tier,
                "evidence_summary": formatted.evidence_summary,
                "evidence_trace": cb2.model_dump(),
                "format_meta": fm.model_dump(),
                "baseline_judge": baseline_judge_dict,
                "judge": cj.model_dump() if cj else None,
                "compare_summary": compare_summary,
            }
            if baseline_error:
                row["baseline_judge_error"] = baseline_error
            if cj_res.error:
                row["judge_error"] = cj_res.error
            if not cj and not cj_res.error:
                row["judge_error"] = "empty_output"
            return {
                "event": "chapter_judge",
                "index": idx,
                "chapter": row,
                "ok": cj is not None,
            }

        async def _run_one(idx: int, payload: dict[str, Any]) -> None:
            """Run ``_judge_one`` under the semaphore and enqueue exactly one event.

            Any exception (judge call, summary building, serialization) becomes
            a ``chapter_error`` event, so the consumer below can rely on
            receiving one event per task and never waits on an empty queue.
            """
            ch = payload["ch"]
            async with sem:
                try:
                    event = await _judge_one(idx, payload)
                except Exception as e:
                    logger.exception(
                        "manual memoir stream: chapter judge failed user_id={} chapter_id={}",
                        uid, ch.id,
                    )
                    event = {
                        "event": "chapter_error",
                        "chapter_id": ch.id,
                        "title": ch.title,
                        "message": f"评审失败：{e}",
                    }
            await result_queue.put(event)

        tasks = [asyncio.create_task(_run_one(i, p)) for i, p in enumerate(prepared)]
        try:
            # One event per task, yielded in completion order.
            for _ in range(len(tasks)):
                yield await result_queue.get()
        finally:
            # If the consumer stops early (e.g. client disconnect closes the
            # generator), do not leave judge calls running in the background.
            for t in tasks:
                if not t.done():
                    t.cancel()
            await asyncio.gather(*tasks, return_exceptions=True)

        yield {"event": "done"}

    async def memoir_snapshot(self, user_id: str) -> dict[str, Any]:
        """Return a snapshot of a user's active memoir chapters and stories.

        Chapters and stories are loaded independently and best-effort: if one
        load fails, a warning is logged and that list keeps whatever rows were
        collected before the failure. At most ``_MAX_EVAL_CHAPTERS`` chapters
        and ``_MAX_EVAL_STORIES`` stories are included.

        Args:
            user_id: Owner of the memoir; surrounding whitespace is stripped.

        Returns:
            A dict with ``user_id``, ``chapters`` and ``stories`` keys.

        Raises:
            EvaluationBadRequestError: If ``user_id`` is empty or blank.
        """
        uid = (user_id or "").strip()
        if not uid:
            raise EvaluationBadRequestError("user_id is required")

        def _chapter_row(chapter: Any) -> dict[str, Any]:
            # Fields exposed for each chapter.
            return {
                "id": chapter.id,
                "title": chapter.title,
                "category": chapter.category,
                "order_index": chapter.order_index,
                "canonical_markdown": chapter.canonical_markdown,
            }

        def _story_row(story: Any) -> dict[str, Any]:
            # Fields exposed for each story.
            return {
                "id": story.id,
                "title": story.title,
                "stage": story.stage,
                "canonical_markdown": story.canonical_markdown,
            }

        snapshot: dict[str, Any] = {"user_id": uid, "chapters": [], "stories": []}
        chapter_rows: list[dict[str, Any]] = snapshot["chapters"]
        story_rows: list[dict[str, Any]] = snapshot["stories"]

        try:
            loaded_chapters = await get_chapters_for_memoir_list(
                uid, self._db, active_only=True, is_new_only=None
            )
            # Append one row at a time so rows built before a failure are kept.
            for chapter in loaded_chapters[:_MAX_EVAL_CHAPTERS]:
                chapter_rows.append(_chapter_row(chapter))
        except Exception as e:
            logger.warning("memoir snapshot chapters failed: {}", e)

        try:
            loaded_stories = await get_stories_for_user(self._db, uid, status="active")
            for story in loaded_stories[:_MAX_EVAL_STORIES]:
                story_rows.append(_story_row(story))
        except Exception as e:
            logger.warning("memoir snapshot stories failed: {}", e)

        return snapshot