api/app/features/evaluation/execution_service.py

"""执行单次评测 run 与整实验（供 Celery / 内联调试）。"""

from __future__ import annotations

from datetime import datetime, timezone
from typing import Any

from sqlalchemy.ext.asyncio import AsyncSession

from app.core.config import settings
from app.core.db import AsyncSessionLocal
from app.core.dependencies import get_eval_judge_langchain_llm, get_llm_provider
from app.core.logging import get_logger
from app.features.evaluation import repo as eval_repo
from app.features.evaluation.candidate_runner import (
    EvalCandidateRunner,
    simple_memoir_from_transcript,
)
from app.features.evaluation.eval_trace_service import EvalTraceService
from app.features.evaluation.gate_report_service import gate_result_to_details
from app.features.evaluation.gating_service import compute_gate
from app.features.evaluation.judge_service import EvalJudgeService
from app.features.evaluation.models import EvalCase, EvalRun, EvalVersion
from app.features.evaluation.transcript_for_judge import (
    assistant_text_for_eval_display,
    format_eval_turn_block,
)

logger = get_logger(__name__)

_MAX_JUDGE_MARKDOWN_CHARS = 20_000
_MAX_EVAL_CHAPTERS = 30
_MAX_EVAL_STORIES = 40
_MAX_EVIDENCE_TRANSCRIPT_CHARS = 16_000


def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) -> str:
    """Strip ``text`` and cap it at ``max_chars`` characters for the judge prompt.

    A falsy ``text`` is treated as the empty string. When the stripped text is
    longer than ``max_chars``, only its first ``max_chars`` characters are kept
    and a truncation notice is appended as a separate paragraph.
    """
    cleaned = (text or "").strip()
    if len(cleaned) > max_chars:
        return cleaned[:max_chars] + "\n\n…（已截断供评审）"
    return cleaned


def _composite(
    conv: float | None, mem: float | None, weights: dict[str, Any] | None
) -> float | None:
    """合成总分；缺失的一侧不计为 0，避免把评审失败误标为极差。

    仅一侧有分：返回该侧原始分（不乘权重），表示当前 run 仅完成了部分评审维度。
    """
    w = weights or {}
    wc = float(w.get("conversation", 0.5))
    wm = float(w.get("memoir", 0.5))
    has_c = conv is not None
    has_m = mem is not None
    if not has_c and not has_m:
        return None
    if has_c and has_m:
        return float(wc) * float(conv) + float(wm) * float(mem)
    if has_c:
        return float(conv)
    return float(mem)


def _utterances_for_case(case: EvalCase) -> list[str]:
    """Return the case's user utterances as clean, non-empty strings.

    Each entry of ``case.user_utterances`` is converted with ``str`` and
    stripped; entries that end up empty are dropped. ``None`` entries are
    skipped as well, because stringifying them would replay the literal text
    "None" as if the user had said it. A falsy ``user_utterances`` yields ``[]``.
    """
    raw = case.user_utterances or []
    stripped = (str(item).strip() for item in raw if item is not None)
    return [text for text in stripped if text]


def _trim_evidence_text(text: str, max_chars: int = _MAX_EVIDENCE_TRANSCRIPT_CHARS) -> str:
    """Strip the interview evidence text and limit it to ``max_chars`` characters.

    A falsy ``text`` becomes the empty string. Text that still exceeds
    ``max_chars`` after stripping is cut to its first ``max_chars`` characters
    and followed by a notice that the evidence was truncated.
    """
    evidence = (text or "").strip()
    overflow = len(evidence) > max_chars
    if not overflow:
        return evidence
    head = evidence[:max_chars]
    return head + "\n\n…（访谈证据已截断）"


async def execute_eval_run(
    db: AsyncSession,
    *,
    run: EvalRun,
    case: EvalCase,
    version: EvalVersion,
) -> None:
    """Execute one evaluation run: replay the case, judge it, persist the scores.

    Steps:
      1. Reload the run from ``db``; return if it no longer exists or is
         already ``completed``.
      2. Mark the run ``failed`` (and return) when execution is disabled, the
         case has no usable utterances, or the provider exposes no
         ``langchain_llm``.
      3. Replay the utterances with ``EvalCandidateRunner``; a replay
         exception marks the run ``failed``.
      4. Judge every answered turn, the whole conversation, and a synthetic
         memoir built from the replay transcript.
      5. When the case has a ``source_user_id``, additionally judge that
         user's library chapters and stories (best-effort: an exception is
         logged and the remaining items of that kind are skipped).
      6. Aggregate the scores and mark the run ``completed``.

    Every status change made here is committed before the function returns.

    Args:
        db: Session used for all reads and writes of this run.
        run: Run to execute; it is re-fetched by id before use.
        case: Case supplying the utterances, the optional reference memoir and
            the optional ``source_user_id``.
        version: Version whose ``config_json`` (when it is a dict) is passed
            to the replay as ``version_config``.
    """
    fresh_run = await db.get(EvalRun, run.id)
    if not fresh_run:
        return
    if (fresh_run.status or "").lower() == "completed":
        logger.info("eval run skip already completed run_id={}", fresh_run.id)
        return
    run = fresh_run

    if not settings.eval_execution_enabled:
        await eval_repo.update_run(
            db,
            run,
            status="failed",
            error_message="EVAL_EXECUTION_ENABLED=false",
            completed_at=datetime.now(timezone.utc),
        )
        # Commit like every other failure path so the status is persisted no
        # matter what the caller does with the session afterwards.
        await db.commit()
        return

    utterances = _utterances_for_case(case)
    if not utterances:
        await eval_repo.update_run(
            db,
            run,
            status="failed",
            error_message="empty user_utterances",
            completed_at=datetime.now(timezone.utc),
        )
        await db.commit()
        return

    await eval_repo.update_run(
        db,
        run,
        status="running",
        started_at=datetime.now(timezone.utc),
        error_message=None,
    )
    await db.commit()

    provider_llm = getattr(get_llm_provider(), "langchain_llm", None)
    if provider_llm is None:
        await eval_repo.update_run(
            db,
            run,
            status="failed",
            error_message="生产 LLM 未配置",
            completed_at=datetime.now(timezone.utc),
        )
        await db.commit()
        return

    judge_llm = get_eval_judge_langchain_llm()
    judge = EvalJudgeService(judge_llm)
    runner = EvalCandidateRunner(provider_llm)
    cfg = version.config_json if isinstance(version.config_json, dict) else None

    try:
        replies, latencies = await runner.replay_utterances(
            utterances,
            version_config=cfg,
            temperature=settings.eval_candidate_temperature,
        )
    except Exception as e:
        logger.exception("eval replay failed: {}", e)
        await eval_repo.update_run(
            db,
            run,
            status="failed",
            # Keep the stored message bounded.
            error_message=str(e)[:2000],
            completed_at=datetime.now(timezone.utc),
        )
        await db.commit()
        return

    # Full transcript for the conversation-level judge; only turns that
    # actually received a reply are included.
    transcript_parts: list[str] = []
    for i, u in enumerate(utterances):
        if i >= len(replies):
            break
        transcript_parts.append(
            format_eval_turn_block(i, u, assistant_text_for_eval_display(replies[i]))
        )

    # Per-turn judging: each turn is judged against the turns that precede it.
    prior_blocks: list[str] = []
    for idx, u in enumerate(utterances):
        if idx >= len(replies):
            break
        reply = assistant_text_for_eval_display(replies[idx])
        lat = latencies[idx] if idx < len(latencies) else None
        prior = "\n\n".join(prior_blocks)
        if len(prior) > 8000:
            # Keep only the most recent 8000 characters of context.
            prior = prior[-8000:]
        tj = await judge.judge_turn(
            prior_transcript=prior,
            user_utterance=u,
            assistant_reply=reply,
            turn_index_0=idx,
        )
        # A falsy judge result is stored as missing scores, not as zeros.
        scores = tj.model_dump() if tj else None
        rationale = tj.rationale if tj else None
        await eval_repo.add_turn(
            db,
            run_id=str(run.id),
            turn_index=idx,
            user_utterance=u,
            # The raw reply is persisted; the display form is only used for judging.
            assistant_reply=replies[idx],
            duration_ms=lat,
            judge_scores_json=scores,
            judge_rationale=rationale,
        )
        await db.commit()
        prior_blocks.append(format_eval_turn_block(idx, u, reply))

    full_transcript = "\n\n".join(transcript_parts)
    conv_out = await judge.judge_conversation(full_transcript=full_transcript)
    conv_total = conv_out.total_score if conv_out else None

    # Synthetic memoir: built from the replay itself and judged against the
    # (possibly truncated) replay transcript as its only evidence.
    memoir_md = simple_memoir_from_transcript(utterances, replies)
    source_transcript = _trim_evidence_text(full_transcript)
    reference_memoir = (case.reference_memoir_markdown or "").strip()
    synthetic_notes = (
        "本项为 replay 合成的短 memoir：证据闭包仅为重放对话 transcript（无 library artifact lineage）。"
        f" turns={len(utterances)}"
    )
    mem_out = await judge.judge_memoir(
        memoir_markdown=memoir_md,
        source_transcript=source_transcript,
        structured_evidence=(
            "（结构化记忆证据：自动化 replay 路径未绑定用户 memory chunk/fact/timeline/summary。）"
        ),
        reference_memoir_markdown=reference_memoir,
        evidence_notes=synthetic_notes,
    )

    chapter_entries: list[dict[str, Any]] = []
    story_entries: list[dict[str, Any]] = []
    uid = (case.source_user_id or "").strip()
    trace_svc = EvalTraceService(db)

    def _library_evidence_notes(
        lineage_tier: str,
        evidence_summary: str,
        truncated: bool,
        dropped: list[str],
    ) -> str:
        """Build the evidence note passed to the judge for one library artifact.

        At most the first 12 dropped section names are listed.
        """
        drops = ",".join(dropped[:12]) if dropped else ""
        return (
            "library artifact 评审：以证据闭包为准；若 lineage 为 fallback 或不足须保守打分。"
            f" lineage_tier={lineage_tier}；summary={evidence_summary}；"
            f" prompt_truncated={truncated}；dropped_sections={drops or 'none'}。"
            " 单章节/单故事节选；跨篇上下文不足写入 insufficient_evidence。"
        )

    if uid:
        # Imported lazily inside the branch, as in the original layout.
        from app.features.memoir.repo import get_chapters_for_memoir_list
        from app.features.story.repo import get_stories_for_user

        # Best-effort: any exception stops chapter judging but keeps the
        # entries collected so far and does not fail the run.
        try:
            chapters = await get_chapters_for_memoir_list(
                uid, db, active_only=True, is_new_only=None
            )
            for ch in chapters[:_MAX_EVAL_CHAPTERS]:
                body = (ch.canonical_markdown or "").strip()
                if not body:
                    continue
                md = f"# 章节：{ch.title}\n\n{_clip_md_for_judge(body)}"
                cb = await trace_svc.build_chapter_bundle(uid, ch)
                formatted, cb2 = await trace_svc.format_chapter_bundle(cb)
                fm = formatted.format_meta
                cj = await judge.judge_memoir(
                    memoir_markdown=md,
                    source_transcript=formatted.source_transcript,
                    structured_evidence=formatted.structured_evidence,
                    reference_memoir_markdown=reference_memoir,
                    evidence_notes=_library_evidence_notes(
                        cb2.lineage_tier,
                        formatted.evidence_summary,
                        fm.truncated,
                        fm.dropped_sections,
                    ),
                )
                chapter_entries.append(
                    {
                        "id": ch.id,
                        "title": ch.title,
                        "order_index": ch.order_index,
                        "lineage_tier": cb2.lineage_tier,
                        "evidence_summary": formatted.evidence_summary,
                        "evidence_trace": cb2.model_dump(),
                        "format_meta": fm.model_dump(),
                        "judge": cj.model_dump() if cj else None,
                    }
                )
        except Exception as e:
            logger.warning("eval chapter judges skipped: {}", e)

        # Same best-effort handling for stories.
        try:
            stories = await get_stories_for_user(db, uid, status="active")
            for st in stories[:_MAX_EVAL_STORIES]:
                body = (st.canonical_markdown or "").strip()
                if not body:
                    continue
                md = f"# 故事：{st.title}\n\n{_clip_md_for_judge(body)}"
                sb = await trace_svc.build_story_bundle(uid, str(st.id))
                formatted, sb2 = await trace_svc.format_story_bundle(sb)
                fm = formatted.format_meta
                sj = await judge.judge_memoir(
                    memoir_markdown=md,
                    source_transcript=formatted.source_transcript,
                    structured_evidence=formatted.structured_evidence,
                    reference_memoir_markdown=reference_memoir,
                    evidence_notes=_library_evidence_notes(
                        sb2.lineage_tier,
                        formatted.evidence_summary,
                        fm.truncated,
                        fm.dropped_sections,
                    ),
                )
                story_entries.append(
                    {
                        "id": st.id,
                        "title": st.title,
                        "stage": st.stage,
                        "lineage_tier": sb2.lineage_tier,
                        "evidence_summary": formatted.evidence_summary,
                        "evidence_trace": sb2.model_dump(),
                        "format_meta": fm.model_dump(),
                        "judge": sj.model_dump() if sj else None,
                    }
                )
        except Exception as e:
            logger.warning("eval story judges skipped: {}", e)

    synth_scores: list[float] = []
    if mem_out is not None:
        synth_scores.append(float(mem_out.total_score))

    # Library scores pool chapters and stories; entries without a judge
    # result or without a total score are ignored.
    library_scores: list[float] = []
    for row in chapter_entries:
        j = row.get("judge")
        if isinstance(j, dict) and j.get("total_score") is not None:
            library_scores.append(float(j["total_score"]))
    for row in story_entries:
        j = row.get("judge")
        if isinstance(j, dict) and j.get("total_score") is not None:
            library_scores.append(float(j["total_score"]))

    def _mean(xs: list[float]) -> float:
        return sum(xs) / len(xs) if xs else 0.0

    # The memoir total and the rule label are chosen together so the label
    # always describes how the total was actually derived.
    mem_total: float | None
    if synth_scores and library_scores:
        mem_total = 0.5 * _mean(synth_scores) + 0.5 * _mean(library_scores)
        aggregate_rule = "synthetic_memoir_judge_plus_library_memoir_judge_weighted_mean"
    elif synth_scores:
        mem_total = _mean(synth_scores)
        aggregate_rule = "synthetic_memoir_only"
    elif library_scores:
        mem_total = _mean(library_scores)
        aggregate_rule = "library_memoir_only"
    else:
        # No memoir score at all: do not claim a library-only aggregate.
        mem_total = None
        aggregate_rule = "no_memoir_scores"

    exp = await eval_repo.get_experiment(db, str(run.experiment_id))
    weights = (
        exp.composite_weights_json
        if exp and isinstance(exp.composite_weights_json, dict)
        else None
    )
    comp = _composite(conv_total, mem_total, weights)

    bundle: dict[str, Any] = {
        "conversation_judge": conv_out.model_dump() if conv_out else None,
        # NOTE(review): "memoir_judge" and "synthetic_memoir_judge" carry the
        # same payload; presumably one is kept for older consumers - confirm.
        "memoir_judge": mem_out.model_dump() if mem_out else None,
        "synthetic_memoir_judge": mem_out.model_dump() if mem_out else None,
        "chapters": chapter_entries,
        "stories": story_entries,
        "judge_meta": {
            "conversation_judge_ok": conv_out is not None,
            "memoir_synthetic_ok": mem_out is not None,
            "memoir_synth_scores_n": len(synth_scores),
            "memoir_library_scores_n": len(library_scores),
            "synthetic_memoir_lineage_tier": "replay_transcript_only",
            "synthetic_memoir_evidence_summary": (
                f"replay_turns={len(utterances)};structured_memory=unbound"
            ),
            "memoir_aggregate_rule": aggregate_rule,
        },
    }
    await eval_repo.update_run(
        db,
        run,
        status="completed",
        memoir_markdown=memoir_md,
        conversation_score_total=conv_total,
        memoir_score_total=mem_total,
        composite_score=comp,
        judge_bundle_json=bundle,
        completed_at=datetime.now(timezone.utc),
    )
    await db.commit()


async def _finalize_experiment_gate(db: AsyncSession, experiment_id: str) -> None:
    """Close out an experiment once all of its runs have a final status.

    Does nothing when the experiment cannot be found, or while any run has a
    status other than ``completed``/``failed``. If at least one run failed,
    the experiment is marked ``failed``. Otherwise the gate is computed from
    the cases and runs, its verdict is upserted, and the experiment is marked
    ``completed``. Both outcomes are committed.
    """
    all_runs = await eval_repo.list_runs_for_experiment(db, experiment_id)
    experiment = await eval_repo.get_experiment(db, experiment_id)
    if not experiment:
        return
    all_cases = await eval_repo.list_cases(db, str(experiment.regression_set_id))

    statuses = [str(item.status) for item in all_runs]
    final_statuses = ("completed", "failed")
    if any(status not in final_statuses for status in statuses):
        # Some run is still in flight; the gate needs a full set of results.
        return

    if "failed" in statuses:
        await eval_repo.update_experiment(
            db,
            experiment,
            status="failed",
            error_message="部分 run 失败",
            completed_at=datetime.now(timezone.utc),
        )
        await db.commit()
        return

    gate = compute_gate(cases=all_cases, runs=all_runs)
    verdict_fields = {
        "passed": gate.passed,
        "mean_composite_delta": gate.mean_delta,
        "protected_regressions_json": gate.protected_regressions,
        "details_json": gate_result_to_details(gate),
    }
    await eval_repo.upsert_gate_verdict(
        db,
        experiment_id=experiment_id,
        **verdict_fields,
    )
    await eval_repo.update_experiment(
        db,
        experiment,
        status="completed",
        completed_at=datetime.now(timezone.utc),
    )
    await db.commit()


async def execute_experiment_full(experiment_id: str) -> None:
    """Run every case of an experiment on both sides, then finalize its gate.

    A Redis lock keyed by the experiment id (TTL 7200 seconds) is acquired
    first; if it cannot be obtained, a warning is logged and nothing else
    happens. Inside the lock the experiment is marked ``running``, and for
    each case a ``baseline`` run and then a ``candidate`` run are looked up
    (or created) and executed. Finally the gate is finalized. The lock is
    released in all cases once acquired.

    If either version is missing, the experiment is marked ``failed`` and no
    run is executed.
    """
    from app.core.redis_lock import acquire_redis_lock, release_redis_lock

    handle = acquire_redis_lock(
        f"lock:eval_experiment:{experiment_id}", ttl_seconds=7200
    )
    if handle is None:
        logger.warning(
            "eval experiment already running or lock busy experiment_id={}",
            experiment_id,
        )
        return

    try:
        async with AsyncSessionLocal() as session:
            experiment = await eval_repo.get_experiment(session, experiment_id)
            if not experiment:
                return
            await eval_repo.update_experiment(session, experiment, status="running")
            await session.commit()

            case_list = await eval_repo.list_cases(
                session, str(experiment.regression_set_id)
            )
            baseline = await eval_repo.get_version(
                session, str(experiment.baseline_version_id)
            )
            candidate = await eval_repo.get_version(
                session, str(experiment.candidate_version_id)
            )
            if baseline is None or candidate is None:
                await eval_repo.update_experiment(
                    session,
                    experiment,
                    status="failed",
                    error_message="version 不存在",
                    completed_at=datetime.now(timezone.utc),
                )
                await session.commit()
                return

            # Baseline is always executed before candidate for a given case.
            sides = (("baseline", baseline), ("candidate", candidate))
            for case in case_list:
                for side_name, side_version in sides:
                    case_key = str(case.id)
                    side_run = await eval_repo.get_run(
                        session, experiment_id, case_key, side_name
                    )
                    if not side_run:
                        side_run = await eval_repo.create_run(
                            session,
                            experiment_id=experiment_id,
                            case_id=case_key,
                            side=side_name,
                        )
                    # Persist a freshly created run before executing it.
                    await session.commit()
                    await execute_eval_run(
                        session, run=side_run, case=case, version=side_version
                    )

            await _finalize_experiment_gate(session, experiment_id)
    finally:
        release_redis_lock(handle)
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								"""执行单次评测 run 与整实验（供 Celery / 内联调试）。"""
 								from __future__ import annotations
 								from datetime import datetime, timezone
 								from typing import Any
 								from sqlalchemy.ext.asyncio import AsyncSession
 								from app.core.config import settings
 								from app.core.db import AsyncSessionLocal
 								from app.core.dependencies import get_eval_judge_langchain_llm, get_llm_provider
 								from app.core.logging import get_logger
 								from app.features.evaluation import repo as eval_repo
 								from app.features.evaluation.candidate_runner import (
 								    EvalCandidateRunner,
 								    simple_memoir_from_transcript,
 								)
-												feat: 回忆录证据血缘与内部评测可追溯，顺带对齐本地评测台与 CI

数据库与模型：新增多版迁移（章节证据快照、对话血缘、记忆事实/时间线 lineage 等），把「成稿 ↔ 对话/记忆」的溯源信息落到表结构里。
业务链路：会话与 WS、回忆录/故事流水线、记忆写入与 enrichment 等跟着接上线索与快照；新增章节证据快照与评测侧 EvalTraceService 等模块，方便组评审用的证据包。
内部评测：自动化 run 与手工 memoir 评审共用可追溯证据；rubric/ judge 相关脚本与文档有配套调整。
app-eval-web：Memoir/实验详情里能展开看证据摘要与 evidence_trace（含对话轮次 id）；Vite 代理与 development.sh 注入的 API 端口与当前默认内部评测端口一致，避免改端口后页面连错服务。
工程杂项：GitHub Actions / 仓库说明有更新；各适配器与支付/配额/plan 等多处为小改动或跟随主改动的收尾；新增/扩充了?

											
										
										
											2026-04-08 15:37:09 +08:00
+								from app.features.evaluation.eval_trace_service import EvalTraceService
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								from app.features.evaluation.gate_report_service import gate_result_to_details
 								from app.features.evaluation.gating_service import compute_gate
 								from app.features.evaluation.judge_service import EvalJudgeService
 								from app.features.evaluation.models import EvalCase, EvalRun, EvalVersion
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								from app.features.evaluation.transcript_for_judge import (
 								    assistant_text_for_eval_display,
 								    format_eval_turn_block,
 								)
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
 								logger = get_logger(__name__)
-												feat(evaluation): session catalog, user export import, and eval web UI

- Extend evaluation API: schemas, router, repo, admin and execution services
- Improve user export markdown importer; add fixtures and importer tests
- Session catalog repo/service updates; internal app wiring and docs
- Add internal-eval.sh helper; refresh app-eval-web (App, styles, Vite)

											
										
										
											2026-04-06 13:45:04 +08:00
+								_MAX_JUDGE_MARKDOWN_CHARS = 20_000
 								_MAX_EVAL_CHAPTERS = 30
 								_MAX_EVAL_STORIES = 40
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
+								_MAX_EVIDENCE_TRANSCRIPT_CHARS = 16_000
-												feat(evaluation): session catalog, user export import, and eval web UI

- Extend evaluation API: schemas, router, repo, admin and execution services
- Improve user export markdown importer; add fixtures and importer tests
- Session catalog repo/service updates; internal app wiring and docs
- Add internal-eval.sh helper; refresh app-eval-web (App, styles, Vite)

											
										
										
											2026-04-06 13:45:04 +08:00
 								def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) -> str:
 								    s = (text or "").strip()
 								    if len(s) <= max_chars:
 								        return s
 								    return f"{s[:max_chars]}\n\n…（已截断供评审）"
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
 								def _composite(
 								    conv: float | None, mem: float | None, weights: dict[str, Any] | None
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								) -> float | None:
 								    """合成总分；缺失的一侧不计为 0，避免把评审失败误标为极差。
 								    仅一侧有分：返回该侧原始分（不乘权重），表示当前 run 仅完成了部分评审维度。
 								    """
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								    w = weights or {}
 								    wc = float(w.get("conversation", 0.5))
 								    wm = float(w.get("memoir", 0.5))
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								    has_c = conv is not None
 								    has_m = mem is not None
 								    if not has_c and not has_m:
 								        return None
 								    if has_c and has_m:
 								        return float(wc) * float(conv) + float(wm) * float(mem)
 								    if has_c:
 								        return float(conv)
 								    return float(mem)
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
 								def _utterances_for_case(case: EvalCase) -> list[str]:
 								    raw = case.user_utterances or []
 								    return [str(u).strip() for u in raw if str(u).strip()]
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
+								def _trim_evidence_text(text: str, max_chars: int = _MAX_EVIDENCE_TRANSCRIPT_CHARS) -> str:
 								    s = (text or "").strip()
 								    if len(s) <= max_chars:
 								        return s
 								    return f"{s[:max_chars]}\n\n…（访谈证据已截断）"
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								async def execute_eval_run(
 								    db: AsyncSession,
 								    *,
 								    run: EvalRun,
 								    case: EvalCase,
 								    version: EvalVersion,
 								) -> None:
-												feat: 回忆录证据血缘与内部评测可追溯，顺带对齐本地评测台与 CI

数据库与模型：新增多版迁移（章节证据快照、对话血缘、记忆事实/时间线 lineage 等），把「成稿 ↔ 对话/记忆」的溯源信息落到表结构里。
业务链路：会话与 WS、回忆录/故事流水线、记忆写入与 enrichment 等跟着接上线索与快照；新增章节证据快照与评测侧 EvalTraceService 等模块，方便组评审用的证据包。
内部评测：自动化 run 与手工 memoir 评审共用可追溯证据；rubric/ judge 相关脚本与文档有配套调整。
app-eval-web：Memoir/实验详情里能展开看证据摘要与 evidence_trace（含对话轮次 id）；Vite 代理与 development.sh 注入的 API 端口与当前默认内部评测端口一致，避免改端口后页面连错服务。
工程杂项：GitHub Actions / 仓库说明有更新；各适配器与支付/配额/plan 等多处为小改动或跟随主改动的收尾；新增/扩充了?

											
										
										
											2026-04-08 15:37:09 +08:00
+								    fresh_run = await db.get(EvalRun, run.id)
 								    if not fresh_run:
 								        return
 								    if (fresh_run.status or "").lower() == "completed":
 								        logger.info("eval run skip already completed run_id={}", fresh_run.id)
 								        return
 								    run = fresh_run
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								    if not settings.eval_execution_enabled:
 								        await eval_repo.update_run(
 								            db,
 								            run,
 								            status="failed",
 								            error_message="EVAL_EXECUTION_ENABLED=false",
 								            completed_at=datetime.now(timezone.utc),
 								        )
 								        return
 								    utterances = _utterances_for_case(case)
 								    if not utterances:
 								        await eval_repo.update_run(
 								            db,
 								            run,
 								            status="failed",
 								            error_message="empty user_utterances",
 								            completed_at=datetime.now(timezone.utc),
 								        )
 								        return
 								    await eval_repo.update_run(
 								        db,
 								        run,
 								        status="running",
 								        started_at=datetime.now(timezone.utc),
 								        error_message=None,
 								    )
 								    await db.commit()
 								    provider_llm = getattr(get_llm_provider(), "langchain_llm", None)
 								    if provider_llm is None:
 								        await eval_repo.update_run(
 								            db,
 								            run,
 								            status="failed",
 								            error_message="生产 LLM 未配置",
 								            completed_at=datetime.now(timezone.utc),
 								        )
 								        await db.commit()
 								        return
 								    judge_llm = get_eval_judge_langchain_llm()
 								    judge = EvalJudgeService(judge_llm)
 								    runner = EvalCandidateRunner(provider_llm)
 								    cfg = version.config_json if isinstance(version.config_json, dict) else None
 								    try:
 								        replies, latencies = await runner.replay_utterances(
 								            utterances,
 								            version_config=cfg,
 								            temperature=settings.eval_candidate_temperature,
 								        )
 								    except Exception as e:
 								        logger.exception("eval replay failed: {}", e)
 								        await eval_repo.update_run(
 								            db,
 								            run,
 								            status="failed",
 								            error_message=str(e)[:2000],
 								            completed_at=datetime.now(timezone.utc),
 								        )
 								        await db.commit()
 								        return
 								    transcript_parts: list[str] = []
 								    for i, u in enumerate(utterances):
 								        if i >= len(replies):
 								            break
-												refactor(chat): AI-native prompts, remove interview heuristics

- Drop interview_reply_length and utterance_substance; always run stage LLM
  and memory retrieval when enabled; trim Settings fields and .env.example.
- Replace guided/opening prompts with compact fact blocks plus unified
  behavior guidance; slim background_voice and persona to tone hints.
- InterviewAgent uses fixed chat_interview max_tokens/chars/segments.

Also includes stacked work: profile followup/extract path, evaluation rubric
and judge schema updates, transcript SPLIT handling in execution service,
user export markdown split tests, and golden case fixture.

											
										
										
											2026-04-06 22:22:50 +08:00
+								        transcript_parts.append(
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								            format_eval_turn_block(i, u, assistant_text_for_eval_display(replies[i]))
-												refactor(chat): AI-native prompts, remove interview heuristics

- Drop interview_reply_length and utterance_substance; always run stage LLM
  and memory retrieval when enabled; trim Settings fields and .env.example.
- Replace guided/opening prompts with compact fact blocks plus unified
  behavior guidance; slim background_voice and persona to tone hints.
- InterviewAgent uses fixed chat_interview max_tokens/chars/segments.

Also includes stacked work: profile followup/extract path, evaluation rubric
and judge schema updates, transcript SPLIT handling in execution service,
user export markdown split tests, and golden case fixture.

											
										
										
											2026-04-06 22:22:50 +08:00
+								        )
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								    prior_blocks: list[str] = []
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								    for idx, u in enumerate(utterances):
 								        if idx >= len(replies):
 								            break
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								        reply = assistant_text_for_eval_display(replies[idx])
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								        lat = latencies[idx] if idx < len(latencies) else None
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								        prior = "\n\n".join(prior_blocks)
 								        if len(prior) > 8000:
 								            prior = prior[-8000:]
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								        tj = await judge.judge_turn(
 								            prior_transcript=prior,
 								            user_utterance=u,
 								            assistant_reply=reply,
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								            turn_index_0=idx,
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								        )
 								        scores = tj.model_dump() if tj else None
 								        rationale = tj.rationale if tj else None
 								        await eval_repo.add_turn(
 								            db,
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
+								            run_id=str(run.id),
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								            turn_index=idx,
 								            user_utterance=u,
-												refactor(chat): AI-native prompts, remove interview heuristics

- Drop interview_reply_length and utterance_substance; always run stage LLM
  and memory retrieval when enabled; trim Settings fields and .env.example.
- Replace guided/opening prompts with compact fact blocks plus unified
  behavior guidance; slim background_voice and persona to tone hints.
- InterviewAgent uses fixed chat_interview max_tokens/chars/segments.

Also includes stacked work: profile followup/extract path, evaluation rubric
and judge schema updates, transcript SPLIT handling in execution service,
user export markdown split tests, and golden case fixture.

											
										
										
											2026-04-06 22:22:50 +08:00
+								            assistant_reply=replies[idx],
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								            duration_ms=lat,
 								            judge_scores_json=scores,
 								            judge_rationale=rationale,
 								        )
 								        await db.commit()
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								        prior_blocks.append(format_eval_turn_block(idx, u, reply))
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
 								    full_transcript = "\n\n".join(transcript_parts)
 								    conv_out = await judge.judge_conversation(full_transcript=full_transcript)
 								    conv_total = conv_out.total_score if conv_out else None
 								    memoir_md = simple_memoir_from_transcript(utterances, replies)
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
+								    source_transcript = _trim_evidence_text(full_transcript)
 								    reference_memoir = (case.reference_memoir_markdown or "").strip()
-												feat: 回忆录证据血缘与内部评测可追溯，顺带对齐本地评测台与 CI

数据库与模型：新增多版迁移（章节证据快照、对话血缘、记忆事实/时间线 lineage 等），把「成稿 ↔ 对话/记忆」的溯源信息落到表结构里。
业务链路：会话与 WS、回忆录/故事流水线、记忆写入与 enrichment 等跟着接上线索与快照；新增章节证据快照与评测侧 EvalTraceService 等模块，方便组评审用的证据包。
内部评测：自动化 run 与手工 memoir 评审共用可追溯证据；rubric/ judge 相关脚本与文档有配套调整。
app-eval-web：Memoir/实验详情里能展开看证据摘要与 evidence_trace（含对话轮次 id）；Vite 代理与 development.sh 注入的 API 端口与当前默认内部评测端口一致，避免改端口后页面连错服务。
工程杂项：GitHub Actions / 仓库说明有更新；各适配器与支付/配额/plan 等多处为小改动或跟随主改动的收尾；新增/扩充了?

											
										
										
											2026-04-08 15:37:09 +08:00
+								    synthetic_notes = (
 								        "本项为 replay 合成的短 memoir：证据闭包仅为重放对话 transcript（无 library artifact lineage）。"
 								        f" turns={len(utterances)}"
 								    )
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
+								    mem_out = await judge.judge_memoir(
 								        memoir_markdown=memoir_md,
 								        source_transcript=source_transcript,
-												feat: 回忆录证据血缘与内部评测可追溯，顺带对齐本地评测台与 CI

数据库与模型：新增多版迁移（章节证据快照、对话血缘、记忆事实/时间线 lineage 等），把「成稿 ↔ 对话/记忆」的溯源信息落到表结构里。
业务链路：会话与 WS、回忆录/故事流水线、记忆写入与 enrichment 等跟着接上线索与快照；新增章节证据快照与评测侧 EvalTraceService 等模块，方便组评审用的证据包。
内部评测：自动化 run 与手工 memoir 评审共用可追溯证据；rubric/ judge 相关脚本与文档有配套调整。
app-eval-web：Memoir/实验详情里能展开看证据摘要与 evidence_trace（含对话轮次 id）；Vite 代理与 development.sh 注入的 API 端口与当前默认内部评测端口一致，避免改端口后页面连错服务。
工程杂项：GitHub Actions / 仓库说明有更新；各适配器与支付/配额/plan 等多处为小改动或跟随主改动的收尾；新增/扩充了?

											
										
										
											2026-04-08 15:37:09 +08:00
+								        structured_evidence=(
 								            "（结构化记忆证据：自动化 replay 路径未绑定用户 memory chunk/fact/timeline/summary。）"
 								        ),
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
+								        reference_memoir_markdown=reference_memoir,
-												feat: 回忆录证据血缘与内部评测可追溯，顺带对齐本地评测台与 CI

数据库与模型：新增多版迁移（章节证据快照、对话血缘、记忆事实/时间线 lineage 等），把「成稿 ↔ 对话/记忆」的溯源信息落到表结构里。
业务链路：会话与 WS、回忆录/故事流水线、记忆写入与 enrichment 等跟着接上线索与快照；新增章节证据快照与评测侧 EvalTraceService 等模块，方便组评审用的证据包。
内部评测：自动化 run 与手工 memoir 评审共用可追溯证据；rubric/ judge 相关脚本与文档有配套调整。
app-eval-web：Memoir/实验详情里能展开看证据摘要与 evidence_trace（含对话轮次 id）；Vite 代理与 development.sh 注入的 API 端口与当前默认内部评测端口一致，避免改端口后页面连错服务。
工程杂项：GitHub Actions / 仓库说明有更新；各适配器与支付/配额/plan 等多处为小改动或跟随主改动的收尾；新增/扩充了?

											
										
										
											2026-04-08 15:37:09 +08:00
+								        evidence_notes=synthetic_notes,
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
+								    )
-												feat(evaluation): session catalog, user export import, and eval web UI

- Extend evaluation API: schemas, router, repo, admin and execution services
- Improve user export markdown importer; add fixtures and importer tests
- Session catalog repo/service updates; internal app wiring and docs
- Add internal-eval.sh helper; refresh app-eval-web (App, styles, Vite)

											
										
										
											2026-04-06 13:45:04 +08:00
 								    chapter_entries: list[dict[str, Any]] = []
 								    story_entries: list[dict[str, Any]] = []
 								    uid = (case.source_user_id or "").strip()
-												feat: 回忆录证据血缘与内部评测可追溯，顺带对齐本地评测台与 CI

数据库与模型：新增多版迁移（章节证据快照、对话血缘、记忆事实/时间线 lineage 等），把「成稿 ↔ 对话/记忆」的溯源信息落到表结构里。
业务链路：会话与 WS、回忆录/故事流水线、记忆写入与 enrichment 等跟着接上线索与快照；新增章节证据快照与评测侧 EvalTraceService 等模块，方便组评审用的证据包。
内部评测：自动化 run 与手工 memoir 评审共用可追溯证据；rubric/ judge 相关脚本与文档有配套调整。
app-eval-web：Memoir/实验详情里能展开看证据摘要与 evidence_trace（含对话轮次 id）；Vite 代理与 development.sh 注入的 API 端口与当前默认内部评测端口一致，避免改端口后页面连错服务。
工程杂项：GitHub Actions / 仓库说明有更新；各适配器与支付/配额/plan 等多处为小改动或跟随主改动的收尾；新增/扩充了?

											
										
										
											2026-04-08 15:37:09 +08:00
+								    trace_svc = EvalTraceService(db)
 								    def _library_evidence_notes(
 								        lineage_tier: str,
 								        evidence_summary: str,
 								        truncated: bool,
 								        dropped: list[str],
 								    ) -> str:
 								        drops = ",".join(dropped[:12]) if dropped else ""
 								        return (
 								            "library artifact 评审：以证据闭包为准；若 lineage 为 fallback 或不足须保守打分。"
 								            f" lineage_tier={lineage_tier}；summary={evidence_summary}；"
 								            f" prompt_truncated={truncated}；dropped_sections={drops or 'none'}。"
 								            " 单章节/单故事节选；跨篇上下文不足写入 insufficient_evidence。"
 								        )
-												feat(evaluation): session catalog, user export import, and eval web UI

- Extend evaluation API: schemas, router, repo, admin and execution services
- Improve user export markdown importer; add fixtures and importer tests
- Session catalog repo/service updates; internal app wiring and docs
- Add internal-eval.sh helper; refresh app-eval-web (App, styles, Vite)

											
										
										
											2026-04-06 13:45:04 +08:00
+								    if uid:
 								        from app.features.memoir.repo import get_chapters_for_memoir_list
 								        from app.features.story.repo import get_stories_for_user
 								        try:
 								            chapters = await get_chapters_for_memoir_list(
 								                uid, db, active_only=True, is_new_only=None
 								            )
 								            for ch in chapters[:_MAX_EVAL_CHAPTERS]:
 								                body = (ch.canonical_markdown or "").strip()
 								                if not body:
 								                    continue
 								                md = f"# 章节：{ch.title}\n\n{_clip_md_for_judge(body)}"
-												feat: 回忆录证据血缘与内部评测可追溯，顺带对齐本地评测台与 CI

数据库与模型：新增多版迁移（章节证据快照、对话血缘、记忆事实/时间线 lineage 等），把「成稿 ↔ 对话/记忆」的溯源信息落到表结构里。
业务链路：会话与 WS、回忆录/故事流水线、记忆写入与 enrichment 等跟着接上线索与快照；新增章节证据快照与评测侧 EvalTraceService 等模块，方便组评审用的证据包。
内部评测：自动化 run 与手工 memoir 评审共用可追溯证据；rubric/ judge 相关脚本与文档有配套调整。
app-eval-web：Memoir/实验详情里能展开看证据摘要与 evidence_trace（含对话轮次 id）；Vite 代理与 development.sh 注入的 API 端口与当前默认内部评测端口一致，避免改端口后页面连错服务。
工程杂项：GitHub Actions / 仓库说明有更新；各适配器与支付/配额/plan 等多处为小改动或跟随主改动的收尾；新增/扩充了?

											
										
										
											2026-04-08 15:37:09 +08:00
+								                cb = await trace_svc.build_chapter_bundle(uid, ch)
 								                formatted, cb2 = await trace_svc.format_chapter_bundle(cb)
 								                fm = formatted.format_meta
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
+								                cj = await judge.judge_memoir(
 								                    memoir_markdown=md,
-												feat: 回忆录证据血缘与内部评测可追溯，顺带对齐本地评测台与 CI

数据库与模型：新增多版迁移（章节证据快照、对话血缘、记忆事实/时间线 lineage 等），把「成稿 ↔ 对话/记忆」的溯源信息落到表结构里。
业务链路：会话与 WS、回忆录/故事流水线、记忆写入与 enrichment 等跟着接上线索与快照；新增章节证据快照与评测侧 EvalTraceService 等模块，方便组评审用的证据包。
内部评测：自动化 run 与手工 memoir 评审共用可追溯证据；rubric/ judge 相关脚本与文档有配套调整。
app-eval-web：Memoir/实验详情里能展开看证据摘要与 evidence_trace（含对话轮次 id）；Vite 代理与 development.sh 注入的 API 端口与当前默认内部评测端口一致，避免改端口后页面连错服务。
工程杂项：GitHub Actions / 仓库说明有更新；各适配器与支付/配额/plan 等多处为小改动或跟随主改动的收尾；新增/扩充了?

											
										
										
											2026-04-08 15:37:09 +08:00
+								                    source_transcript=formatted.source_transcript,
 								                    structured_evidence=formatted.structured_evidence,
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
+								                    reference_memoir_markdown=reference_memoir,
-												feat: 回忆录证据血缘与内部评测可追溯，顺带对齐本地评测台与 CI

数据库与模型：新增多版迁移（章节证据快照、对话血缘、记忆事实/时间线 lineage 等），把「成稿 ↔ 对话/记忆」的溯源信息落到表结构里。
业务链路：会话与 WS、回忆录/故事流水线、记忆写入与 enrichment 等跟着接上线索与快照；新增章节证据快照与评测侧 EvalTraceService 等模块，方便组评审用的证据包。
内部评测：自动化 run 与手工 memoir 评审共用可追溯证据；rubric/ judge 相关脚本与文档有配套调整。
app-eval-web：Memoir/实验详情里能展开看证据摘要与 evidence_trace（含对话轮次 id）；Vite 代理与 development.sh 注入的 API 端口与当前默认内部评测端口一致，避免改端口后页面连错服务。
工程杂项：GitHub Actions / 仓库说明有更新；各适配器与支付/配额/plan 等多处为小改动或跟随主改动的收尾；新增/扩充了?

											
										
										
											2026-04-08 15:37:09 +08:00
+								                    evidence_notes=_library_evidence_notes(
 								                        cb2.lineage_tier,
 								                        formatted.evidence_summary,
 								                        fm.truncated,
 								                        fm.dropped_sections,
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
+								                    ),
 								                )
-												feat(evaluation): session catalog, user export import, and eval web UI

- Extend evaluation API: schemas, router, repo, admin and execution services
- Improve user export markdown importer; add fixtures and importer tests
- Session catalog repo/service updates; internal app wiring and docs
- Add internal-eval.sh helper; refresh app-eval-web (App, styles, Vite)

											
										
										
											2026-04-06 13:45:04 +08:00
+								                chapter_entries.append(
 								                    {
 								                        "id": ch.id,
 								                        "title": ch.title,
 								                        "order_index": ch.order_index,
-												feat: 回忆录证据血缘与内部评测可追溯，顺带对齐本地评测台与 CI

数据库与模型：新增多版迁移（章节证据快照、对话血缘、记忆事实/时间线 lineage 等），把「成稿 ↔ 对话/记忆」的溯源信息落到表结构里。
业务链路：会话与 WS、回忆录/故事流水线、记忆写入与 enrichment 等跟着接上线索与快照；新增章节证据快照与评测侧 EvalTraceService 等模块，方便组评审用的证据包。
内部评测：自动化 run 与手工 memoir 评审共用可追溯证据；rubric/ judge 相关脚本与文档有配套调整。
app-eval-web：Memoir/实验详情里能展开看证据摘要与 evidence_trace（含对话轮次 id）；Vite 代理与 development.sh 注入的 API 端口与当前默认内部评测端口一致，避免改端口后页面连错服务。
工程杂项：GitHub Actions / 仓库说明有更新；各适配器与支付/配额/plan 等多处为小改动或跟随主改动的收尾；新增/扩充了?

											
										
										
											2026-04-08 15:37:09 +08:00
+								                        "lineage_tier": cb2.lineage_tier,
 								                        "evidence_summary": formatted.evidence_summary,
 								                        "evidence_trace": cb2.model_dump(),
 								                        "format_meta": fm.model_dump(),
-												feat(evaluation): session catalog, user export import, and eval web UI

- Extend evaluation API: schemas, router, repo, admin and execution services
- Improve user export markdown importer; add fixtures and importer tests
- Session catalog repo/service updates; internal app wiring and docs
- Add internal-eval.sh helper; refresh app-eval-web (App, styles, Vite)

											
										
										
											2026-04-06 13:45:04 +08:00
+								                        "judge": cj.model_dump() if cj else None,
 								                    }
 								                )
 								        except Exception as e:
 								            logger.warning("eval chapter judges skipped: {}", e)
 								        try:
 								            stories = await get_stories_for_user(db, uid, status="active")
 								            for st in stories[:_MAX_EVAL_STORIES]:
 								                body = (st.canonical_markdown or "").strip()
 								                if not body:
 								                    continue
 								                md = f"# 故事：{st.title}\n\n{_clip_md_for_judge(body)}"
-												feat: 回忆录证据血缘与内部评测可追溯，顺带对齐本地评测台与 CI

数据库与模型：新增多版迁移（章节证据快照、对话血缘、记忆事实/时间线 lineage 等），把「成稿 ↔ 对话/记忆」的溯源信息落到表结构里。
业务链路：会话与 WS、回忆录/故事流水线、记忆写入与 enrichment 等跟着接上线索与快照；新增章节证据快照与评测侧 EvalTraceService 等模块，方便组评审用的证据包。
内部评测：自动化 run 与手工 memoir 评审共用可追溯证据；rubric/ judge 相关脚本与文档有配套调整。
app-eval-web：Memoir/实验详情里能展开看证据摘要与 evidence_trace（含对话轮次 id）；Vite 代理与 development.sh 注入的 API 端口与当前默认内部评测端口一致，避免改端口后页面连错服务。
工程杂项：GitHub Actions / 仓库说明有更新；各适配器与支付/配额/plan 等多处为小改动或跟随主改动的收尾；新增/扩充了?

											
										
										
											2026-04-08 15:37:09 +08:00
+								                sb = await trace_svc.build_story_bundle(uid, str(st.id))
 								                formatted, sb2 = await trace_svc.format_story_bundle(sb)
 								                fm = formatted.format_meta
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
+								                sj = await judge.judge_memoir(
 								                    memoir_markdown=md,
-												feat: 回忆录证据血缘与内部评测可追溯，顺带对齐本地评测台与 CI

数据库与模型：新增多版迁移（章节证据快照、对话血缘、记忆事实/时间线 lineage 等），把「成稿 ↔ 对话/记忆」的溯源信息落到表结构里。
业务链路：会话与 WS、回忆录/故事流水线、记忆写入与 enrichment 等跟着接上线索与快照；新增章节证据快照与评测侧 EvalTraceService 等模块，方便组评审用的证据包。
内部评测：自动化 run 与手工 memoir 评审共用可追溯证据；rubric/ judge 相关脚本与文档有配套调整。
app-eval-web：Memoir/实验详情里能展开看证据摘要与 evidence_trace（含对话轮次 id）；Vite 代理与 development.sh 注入的 API 端口与当前默认内部评测端口一致，避免改端口后页面连错服务。
工程杂项：GitHub Actions / 仓库说明有更新；各适配器与支付/配额/plan 等多处为小改动或跟随主改动的收尾；新增/扩充了?

											
										
										
											2026-04-08 15:37:09 +08:00
+								                    source_transcript=formatted.source_transcript,
 								                    structured_evidence=formatted.structured_evidence,
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
+								                    reference_memoir_markdown=reference_memoir,
-												feat: 回忆录证据血缘与内部评测可追溯，顺带对齐本地评测台与 CI

数据库与模型：新增多版迁移（章节证据快照、对话血缘、记忆事实/时间线 lineage 等），把「成稿 ↔ 对话/记忆」的溯源信息落到表结构里。
业务链路：会话与 WS、回忆录/故事流水线、记忆写入与 enrichment 等跟着接上线索与快照；新增章节证据快照与评测侧 EvalTraceService 等模块，方便组评审用的证据包。
内部评测：自动化 run 与手工 memoir 评审共用可追溯证据；rubric/ judge 相关脚本与文档有配套调整。
app-eval-web：Memoir/实验详情里能展开看证据摘要与 evidence_trace（含对话轮次 id）；Vite 代理与 development.sh 注入的 API 端口与当前默认内部评测端口一致，避免改端口后页面连错服务。
工程杂项：GitHub Actions / 仓库说明有更新；各适配器与支付/配额/plan 等多处为小改动或跟随主改动的收尾；新增/扩充了?

											
										
										
											2026-04-08 15:37:09 +08:00
+								                    evidence_notes=_library_evidence_notes(
 								                        sb2.lineage_tier,
 								                        formatted.evidence_summary,
 								                        fm.truncated,
 								                        fm.dropped_sections,
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
+								                    ),
 								                )
-												feat(evaluation): session catalog, user export import, and eval web UI

- Extend evaluation API: schemas, router, repo, admin and execution services
- Improve user export markdown importer; add fixtures and importer tests
- Session catalog repo/service updates; internal app wiring and docs
- Add internal-eval.sh helper; refresh app-eval-web (App, styles, Vite)

											
										
										
											2026-04-06 13:45:04 +08:00
+								                story_entries.append(
 								                    {
 								                        "id": st.id,
 								                        "title": st.title,
 								                        "stage": st.stage,
-												feat: 回忆录证据血缘与内部评测可追溯，顺带对齐本地评测台与 CI

数据库与模型：新增多版迁移（章节证据快照、对话血缘、记忆事实/时间线 lineage 等），把「成稿 ↔ 对话/记忆」的溯源信息落到表结构里。
业务链路：会话与 WS、回忆录/故事流水线、记忆写入与 enrichment 等跟着接上线索与快照；新增章节证据快照与评测侧 EvalTraceService 等模块，方便组评审用的证据包。
内部评测：自动化 run 与手工 memoir 评审共用可追溯证据；rubric/ judge 相关脚本与文档有配套调整。
app-eval-web：Memoir/实验详情里能展开看证据摘要与 evidence_trace（含对话轮次 id）；Vite 代理与 development.sh 注入的 API 端口与当前默认内部评测端口一致，避免改端口后页面连错服务。
工程杂项：GitHub Actions / 仓库说明有更新；各适配器与支付/配额/plan 等多处为小改动或跟随主改动的收尾；新增/扩充了?

											
										
										
											2026-04-08 15:37:09 +08:00
+								                        "lineage_tier": sb2.lineage_tier,
 								                        "evidence_summary": formatted.evidence_summary,
 								                        "evidence_trace": sb2.model_dump(),
 								                        "format_meta": fm.model_dump(),
-												feat(evaluation): session catalog, user export import, and eval web UI

- Extend evaluation API: schemas, router, repo, admin and execution services
- Improve user export markdown importer; add fixtures and importer tests
- Session catalog repo/service updates; internal app wiring and docs
- Add internal-eval.sh helper; refresh app-eval-web (App, styles, Vite)

											
										
										
											2026-04-06 13:45:04 +08:00
+								                        "judge": sj.model_dump() if sj else None,
 								                    }
 								                )
 								        except Exception as e:
 								            logger.warning("eval story judges skipped: {}", e)
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								    synth_scores: list[float] = []
-												feat(evaluation): session catalog, user export import, and eval web UI

- Extend evaluation API: schemas, router, repo, admin and execution services
- Improve user export markdown importer; add fixtures and importer tests
- Session catalog repo/service updates; internal app wiring and docs
- Add internal-eval.sh helper; refresh app-eval-web (App, styles, Vite)

											
										
										
											2026-04-06 13:45:04 +08:00
+								    if mem_out is not None:
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								        synth_scores.append(float(mem_out.total_score))
 								    library_scores: list[float] = []
-												feat(evaluation): session catalog, user export import, and eval web UI

- Extend evaluation API: schemas, router, repo, admin and execution services
- Improve user export markdown importer; add fixtures and importer tests
- Session catalog repo/service updates; internal app wiring and docs
- Add internal-eval.sh helper; refresh app-eval-web (App, styles, Vite)

											
										
										
											2026-04-06 13:45:04 +08:00
+								    for row in chapter_entries:
 								        j = row.get("judge")
 								        if isinstance(j, dict) and j.get("total_score") is not None:
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								            library_scores.append(float(j["total_score"]))
-												feat(evaluation): session catalog, user export import, and eval web UI

- Extend evaluation API: schemas, router, repo, admin and execution services
- Improve user export markdown importer; add fixtures and importer tests
- Session catalog repo/service updates; internal app wiring and docs
- Add internal-eval.sh helper; refresh app-eval-web (App, styles, Vite)

											
										
										
											2026-04-06 13:45:04 +08:00
+								    for row in story_entries:
 								        j = row.get("judge")
 								        if isinstance(j, dict) and j.get("total_score") is not None:
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								            library_scores.append(float(j["total_score"]))
 								    def _mean(xs: list[float]) -> float:
 								        return sum(xs) / len(xs) if xs else 0.0
 								    if synth_scores and library_scores:
 								        mem_total = 0.5 * _mean(synth_scores) + 0.5 * _mean(library_scores)
 								    elif synth_scores:
 								        mem_total = _mean(synth_scores)
 								    elif library_scores:
 								        mem_total = _mean(library_scores)
 								    else:
 								        mem_total = None
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
+								    exp = await eval_repo.get_experiment(db, str(run.experiment_id))
 								    weights = (
 								        exp.composite_weights_json
 								        if exp and isinstance(exp.composite_weights_json, dict)
 								        else None
 								    )
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								    comp = _composite(conv_total, mem_total, weights)
 								    bundle: dict[str, Any] = {
 								        "conversation_judge": conv_out.model_dump() if conv_out else None,
 								        "memoir_judge": mem_out.model_dump() if mem_out else None,
-												feat: 回忆录证据血缘与内部评测可追溯，顺带对齐本地评测台与 CI

数据库与模型：新增多版迁移（章节证据快照、对话血缘、记忆事实/时间线 lineage 等），把「成稿 ↔ 对话/记忆」的溯源信息落到表结构里。
业务链路：会话与 WS、回忆录/故事流水线、记忆写入与 enrichment 等跟着接上线索与快照；新增章节证据快照与评测侧 EvalTraceService 等模块，方便组评审用的证据包。
内部评测：自动化 run 与手工 memoir 评审共用可追溯证据；rubric/ judge 相关脚本与文档有配套调整。
app-eval-web：Memoir/实验详情里能展开看证据摘要与 evidence_trace（含对话轮次 id）；Vite 代理与 development.sh 注入的 API 端口与当前默认内部评测端口一致，避免改端口后页面连错服务。
工程杂项：GitHub Actions / 仓库说明有更新；各适配器与支付/配额/plan 等多处为小改动或跟随主改动的收尾；新增/扩充了?

											
										
										
											2026-04-08 15:37:09 +08:00
+								        "synthetic_memoir_judge": mem_out.model_dump() if mem_out else None,
-												feat(evaluation): session catalog, user export import, and eval web UI

- Extend evaluation API: schemas, router, repo, admin and execution services
- Improve user export markdown importer; add fixtures and importer tests
- Session catalog repo/service updates; internal app wiring and docs
- Add internal-eval.sh helper; refresh app-eval-web (App, styles, Vite)

											
										
										
											2026-04-06 13:45:04 +08:00
+								        "chapters": chapter_entries,
 								        "stories": story_entries,
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								        "judge_meta": {
 								            "conversation_judge_ok": conv_out is not None,
 								            "memoir_synthetic_ok": mem_out is not None,
 								            "memoir_synth_scores_n": len(synth_scores),
 								            "memoir_library_scores_n": len(library_scores),
-												feat: 回忆录证据血缘与内部评测可追溯，顺带对齐本地评测台与 CI

数据库与模型：新增多版迁移（章节证据快照、对话血缘、记忆事实/时间线 lineage 等），把「成稿 ↔ 对话/记忆」的溯源信息落到表结构里。
业务链路：会话与 WS、回忆录/故事流水线、记忆写入与 enrichment 等跟着接上线索与快照；新增章节证据快照与评测侧 EvalTraceService 等模块，方便组评审用的证据包。
内部评测：自动化 run 与手工 memoir 评审共用可追溯证据；rubric/ judge 相关脚本与文档有配套调整。
app-eval-web：Memoir/实验详情里能展开看证据摘要与 evidence_trace（含对话轮次 id）；Vite 代理与 development.sh 注入的 API 端口与当前默认内部评测端口一致，避免改端口后页面连错服务。
工程杂项：GitHub Actions / 仓库说明有更新；各适配器与支付/配额/plan 等多处为小改动或跟随主改动的收尾；新增/扩充了?

											
										
										
											2026-04-08 15:37:09 +08:00
+								            "synthetic_memoir_lineage_tier": "replay_transcript_only",
 								            "synthetic_memoir_evidence_summary": (
 								                f"replay_turns={len(utterances)};structured_memory=unbound"
 								            ),
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								            "memoir_aggregate_rule": (
-												feat: 回忆录证据血缘与内部评测可追溯，顺带对齐本地评测台与 CI

数据库与模型：新增多版迁移（章节证据快照、对话血缘、记忆事实/时间线 lineage 等），把「成稿 ↔ 对话/记忆」的溯源信息落到表结构里。
业务链路：会话与 WS、回忆录/故事流水线、记忆写入与 enrichment 等跟着接上线索与快照；新增章节证据快照与评测侧 EvalTraceService 等模块，方便组评审用的证据包。
内部评测：自动化 run 与手工 memoir 评审共用可追溯证据；rubric/ judge 相关脚本与文档有配套调整。
app-eval-web：Memoir/实验详情里能展开看证据摘要与 evidence_trace（含对话轮次 id）；Vite 代理与 development.sh 注入的 API 端口与当前默认内部评测端口一致，避免改端口后页面连错服务。
工程杂项：GitHub Actions / 仓库说明有更新；各适配器与支付/配额/plan 等多处为小改动或跟随主改动的收尾；新增/扩充了?

											
										
										
											2026-04-08 15:37:09 +08:00
+								                "synthetic_memoir_judge_plus_library_memoir_judge_weighted_mean"
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								                if synth_scores and library_scores
-												feat: 回忆录证据血缘与内部评测可追溯，顺带对齐本地评测台与 CI

数据库与模型：新增多版迁移（章节证据快照、对话血缘、记忆事实/时间线 lineage 等），把「成稿 ↔ 对话/记忆」的溯源信息落到表结构里。
业务链路：会话与 WS、回忆录/故事流水线、记忆写入与 enrichment 等跟着接上线索与快照；新增章节证据快照与评测侧 EvalTraceService 等模块，方便组评审用的证据包。
内部评测：自动化 run 与手工 memoir 评审共用可追溯证据；rubric/ judge 相关脚本与文档有配套调整。
app-eval-web：Memoir/实验详情里能展开看证据摘要与 evidence_trace（含对话轮次 id）；Vite 代理与 development.sh 注入的 API 端口与当前默认内部评测端口一致，避免改端口后页面连错服务。
工程杂项：GitHub Actions / 仓库说明有更新；各适配器与支付/配额/plan 等多处为小改动或跟随主改动的收尾；新增/扩充了?

											
										
										
											2026-04-08 15:37:09 +08:00
+								                else ("synthetic_memoir_only" if synth_scores else "library_memoir_only")
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								            ),
 								        },
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								    }
 								    await eval_repo.update_run(
 								        db,
 								        run,
 								        status="completed",
 								        memoir_markdown=memoir_md,
 								        conversation_score_total=conv_total,
 								        memoir_score_total=mem_total,
 								        composite_score=comp,
 								        judge_bundle_json=bundle,
 								        completed_at=datetime.now(timezone.utc),
 								    )
 								    await db.commit()
 								async def _finalize_experiment_gate(db: AsyncSession, experiment_id: str) -> None:
 								    runs = await eval_repo.list_runs_for_experiment(db, experiment_id)
 								    exp = await eval_repo.get_experiment(db, experiment_id)
 								    if not exp:
 								        return
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
+								    cases = await eval_repo.list_cases(db, str(exp.regression_set_id))
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
+								    incomplete = [r for r in runs if str(r.status) not in ("completed", "failed")]
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								    if incomplete:
 								        return
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
+								    failed = [r for r in runs if str(r.status) == "failed"]
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								    if failed:
 								        await eval_repo.update_experiment(
 								            db,
 								            exp,
 								            status="failed",
 								            error_message="部分 run 失败",
 								            completed_at=datetime.now(timezone.utc),
 								        )
 								        await db.commit()
 								        return
 								    gr = compute_gate(cases=cases, runs=runs)
 								    await eval_repo.upsert_gate_verdict(
 								        db,
 								        experiment_id=experiment_id,
 								        passed=gr.passed,
 								        mean_composite_delta=gr.mean_delta,
 								        protected_regressions_json=gr.protected_regressions,
 								        details_json=gate_result_to_details(gr),
 								    )
 								    await eval_repo.update_experiment(
 								        db,
 								        exp,
 								        status="completed",
 								        completed_at=datetime.now(timezone.utc),
 								    )
 								    await db.commit()
 								async def execute_experiment_full(experiment_id: str) -> None:
-												feat: 回忆录证据血缘与内部评测可追溯，顺带对齐本地评测台与 CI

数据库与模型：新增多版迁移（章节证据快照、对话血缘、记忆事实/时间线 lineage 等），把「成稿 ↔ 对话/记忆」的溯源信息落到表结构里。
业务链路：会话与 WS、回忆录/故事流水线、记忆写入与 enrichment 等跟着接上线索与快照；新增章节证据快照与评测侧 EvalTraceService 等模块，方便组评审用的证据包。
内部评测：自动化 run 与手工 memoir 评审共用可追溯证据；rubric/ judge 相关脚本与文档有配套调整。
app-eval-web：Memoir/实验详情里能展开看证据摘要与 evidence_trace（含对话轮次 id）；Vite 代理与 development.sh 注入的 API 端口与当前默认内部评测端口一致，避免改端口后页面连错服务。
工程杂项：GitHub Actions / 仓库说明有更新；各适配器与支付/配额/plan 等多处为小改动或跟随主改动的收尾；新增/扩充了?

											
										
										
											2026-04-08 15:37:09 +08:00
+								    from app.core.redis_lock import acquire_redis_lock, release_redis_lock
 								    lock_key = f"lock:eval_experiment:{experiment_id}"
 								    lock_handle = acquire_redis_lock(lock_key, ttl_seconds=7200)
 								    if lock_handle is None:
 								        logger.warning(
 								            "eval experiment already running or lock busy experiment_id={}",
 								            experiment_id,
 								        )
 								        return
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
-												feat: 回忆录证据血缘与内部评测可追溯，顺带对齐本地评测台与 CI

数据库与模型：新增多版迁移（章节证据快照、对话血缘、记忆事实/时间线 lineage 等），把「成稿 ↔ 对话/记忆」的溯源信息落到表结构里。
业务链路：会话与 WS、回忆录/故事流水线、记忆写入与 enrichment 等跟着接上线索与快照；新增章节证据快照与评测侧 EvalTraceService 等模块，方便组评审用的证据包。
内部评测：自动化 run 与手工 memoir 评审共用可追溯证据；rubric/ judge 相关脚本与文档有配套调整。
app-eval-web：Memoir/实验详情里能展开看证据摘要与 evidence_trace（含对话轮次 id）；Vite 代理与 development.sh 注入的 API 端口与当前默认内部评测端口一致，避免改端口后页面连错服务。
工程杂项：GitHub Actions / 仓库说明有更新；各适配器与支付/配额/plan 等多处为小改动或跟随主改动的收尾；新增/扩充了?

											
										
										
											2026-04-08 15:37:09 +08:00
+								    try:
 								        async with AsyncSessionLocal() as db:
 								            exp = await eval_repo.get_experiment(db, experiment_id)
 								            if not exp:
 								                return
 								            await eval_repo.update_experiment(db, exp, status="running")
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								            await db.commit()
-												feat: 回忆录证据血缘与内部评测可追溯，顺带对齐本地评测台与 CI

数据库与模型：新增多版迁移（章节证据快照、对话血缘、记忆事实/时间线 lineage 等），把「成稿 ↔ 对话/记忆」的溯源信息落到表结构里。
业务链路：会话与 WS、回忆录/故事流水线、记忆写入与 enrichment 等跟着接上线索与快照；新增章节证据快照与评测侧 EvalTraceService 等模块，方便组评审用的证据包。
内部评测：自动化 run 与手工 memoir 评审共用可追溯证据；rubric/ judge 相关脚本与文档有配套调整。
app-eval-web：Memoir/实验详情里能展开看证据摘要与 evidence_trace（含对话轮次 id）；Vite 代理与 development.sh 注入的 API 端口与当前默认内部评测端口一致，避免改端口后页面连错服务。
工程杂项：GitHub Actions / 仓库说明有更新；各适配器与支付/配额/plan 等多处为小改动或跟随主改动的收尾；新增/扩充了?

											
										
										
											2026-04-08 15:37:09 +08:00
+								            cases = await eval_repo.list_cases(db, str(exp.regression_set_id))
 								            base_v = await eval_repo.get_version(db, str(exp.baseline_version_id))
 								            cand_v = await eval_repo.get_version(db, str(exp.candidate_version_id))
 								            if base_v is None or cand_v is None:
 								                await eval_repo.update_experiment(
 								                    db,
 								                    exp,
 								                    status="failed",
 								                    error_message="version 不存在",
 								                    completed_at=datetime.now(timezone.utc),
 								                )
 								                await db.commit()
 								                return
 								            for case in cases:
 								                for side, ver in ("baseline", base_v), ("candidate", cand_v):
 								                    run = await eval_repo.get_run(db, experiment_id, str(case.id), side)
 								                    if not run:
 								                        run = await eval_repo.create_run(
 								                            db,
 								                            experiment_id=experiment_id,
 								                            case_id=str(case.id),
 								                            side=side,
 								                        )
 								                    await db.commit()
 								                    await execute_eval_run(db, run=run, case=case, version=ver)
 								            await _finalize_experiment_gate(db, experiment_id)
 								    finally:
 								        release_redis_lock(lock_handle)