api/app/features/evaluation/judge_service.py

"""评测台评审：智谱 / DeepSeek 等 OpenAI 兼容端点（结构化 JSON）。"""

from __future__ import annotations

from collections.abc import AsyncIterator
from dataclasses import dataclass
from typing import Any, Generic, TypeVar

from app.core.config import settings
from app.core.eval_judge_spec import EvalJudgeProvider
from app.core.llm_call import LLMCallError, allm_json_call
from app.core.logging import get_logger
from app.features.evaluation.judge_schemas import (
    ConversationJudgeOutput,
    MemoirJudgeOutput,
    TurnJudgeOutput,
)
from app.features.evaluation.rubrics.conversation_v1 import (
    COMPARE_CONV_STREAM_HINT,
    CONV_JUDGE_INSTRUCTIONS,
    TURN_JUDGE_INSTRUCTIONS,
)
from app.features.evaluation.rubrics.memoir_v1 import MEMOIR_JUDGE_INSTRUCTIONS

# Module-level logger obtained from the project's logging helper.
logger = get_logger(__name__)

# Type variable constrained to the three judge output schemas; it is the type
# parameter of JudgeCallResult.
TJudgeOutput = TypeVar(
    "TJudgeOutput", TurnJudgeOutput, ConversationJudgeOutput, MemoirJudgeOutput
)

# max_tokens passed to the per-turn judge call (EvalJudgeService.judge_turn).
_TURN_MAX = 768
# max_tokens passed to the whole-conversation judge call.
_CONV_JUDGE_JSON_MAX = 2048
# Transcript header; in this module only its length is used, as fixed prompt
# overhead when the conversation transcript cap is computed.
_CONV_HEADER = "【完整对话】（每轮以 `[Turn k]` 开头）\n\n"
# max_tokens bound onto the LLM for the streamed A/B comparison text.
_COMPARE_STREAM_MAX = 6144


def _eval_judge_prompt_char_pool_for_context(context_window_tokens: int) -> int:
    """Return the character budget for one complete judge request.

    The budget is derived from the judge model's context window: the
    completion reserve and the safety margin (both read from ``settings``)
    are subtracted, the remainder is floored at one token, and the token
    count is converted to characters with the configured
    tokens-per-character ratio. The result is never smaller than 1.
    """
    usable_tokens = int(context_window_tokens)
    usable_tokens = usable_tokens - settings.eval_judge_completion_reserve_tokens
    usable_tokens = usable_tokens - settings.eval_judge_prompt_budget_safety_tokens
    if usable_tokens < 1:
        # Over-reserved configuration: keep at least one token of budget.
        usable_tokens = 1
    char_budget = int(usable_tokens / settings.eval_judge_approx_tokens_per_char)
    return char_budget if char_budget > 1 else 1


def _eval_judge_prompt_char_pool() -> int:
    """Return the request character budget for the context window in settings."""
    default_window_tokens = settings.eval_judge_context_window_tokens
    return _eval_judge_prompt_char_pool_for_context(default_window_tokens)


def eval_judge_conversation_transcript_max_chars() -> int:
    """Return the max transcript length (chars) for whole-conversation judging.

    Convenience wrapper that uses the context window configured in
    ``settings.eval_judge_context_window_tokens``. All budget logic lives in
    ``eval_judge_conversation_transcript_max_chars_for_context`` so the two
    entry points cannot drift apart.
    """
    return eval_judge_conversation_transcript_max_chars_for_context(
        settings.eval_judge_context_window_tokens
    )


def eval_judge_conversation_transcript_max_chars_for_context(
    context_window_tokens: int,
) -> int:
    """Return the max transcript length (chars) for whole-conversation judging.

    A positive ``settings.eval_judge_max_transcript_chars`` is returned as-is.
    Otherwise the cap is the request character budget for
    ``context_window_tokens`` minus the fixed prompt text (instructions,
    transcript header and 32 characters of slack), floored at 1.
    """
    explicit_cap = settings.eval_judge_max_transcript_chars
    if explicit_cap > 0:
        return explicit_cap
    fixed_chars = len(CONV_JUDGE_INSTRUCTIONS) + len(_CONV_HEADER) + 32
    request_budget = _eval_judge_prompt_char_pool_for_context(context_window_tokens)
    remaining = request_budget - fixed_chars
    return remaining if remaining > 1 else 1


def eval_judge_turn_prior_transcript_max_chars() -> int:
    """Return the max length (chars) of the prior-turns excerpt for turn judging.

    Convenience wrapper that uses the context window configured in
    ``settings.eval_judge_context_window_tokens``. All budget logic lives in
    ``eval_judge_turn_prior_transcript_max_chars_for_context`` so the two
    entry points cannot drift apart.
    """
    return eval_judge_turn_prior_transcript_max_chars_for_context(
        settings.eval_judge_context_window_tokens
    )


def eval_judge_turn_prior_transcript_max_chars_for_context(
    context_window_tokens: int,
) -> int:
    """Return the max length (chars) of the prior-turns excerpt for turn judging.

    A positive ``settings.eval_judge_max_transcript_chars`` is returned as-is.
    Otherwise the cap is the request character budget for
    ``context_window_tokens`` minus the turn instructions and a fixed
    8800-character reserve, floored at 1.
    """
    explicit_cap = settings.eval_judge_max_transcript_chars
    if explicit_cap > 0:
        return explicit_cap
    # NOTE(review): 8800 presumably covers the two 4000-char slices of
    # user/assistant text used by judge_turn plus template text — confirm.
    fixed_chars = len(TURN_JUDGE_INSTRUCTIONS) + 8800
    request_budget = _eval_judge_prompt_char_pool_for_context(context_window_tokens)
    remaining = request_budget - fixed_chars
    return remaining if remaining > 1 else 1


def eval_judge_compare_transcript_each_max_chars() -> int:
    """Return the per-side reference cap for the context window in settings."""
    default_window_tokens = settings.eval_judge_context_window_tokens
    return eval_judge_compare_transcript_each_max_chars_for_context(
        default_window_tokens
    )


def eval_judge_compare_transcript_pair_total_budget_for_context(
    context_window_tokens: int,
) -> int:
    """Return the combined character budget for both A/B transcripts.

    With a positive ``settings.eval_judge_max_compare_transcript_chars_each``
    the budget is twice that per-side value. Otherwise it is the request
    character budget for ``context_window_tokens`` minus
    ``settings.eval_judge_compare_prompt_overhead_chars``. The result is
    floored at 1 in both cases.
    """
    per_side_setting = settings.eval_judge_max_compare_transcript_chars_each
    if per_side_setting > 0:
        return max(1, 2 * int(per_side_setting))
    request_budget = _eval_judge_prompt_char_pool_for_context(context_window_tokens)
    template_overhead = int(settings.eval_judge_compare_prompt_overhead_chars)
    return max(1, request_budget - template_overhead)


def eval_judge_compare_transcript_each_max_chars_for_context(
    context_window_tokens: int,
) -> int:
    """Return the per-side reference cap for one A/B transcript.

    A positive ``settings.eval_judge_max_compare_transcript_chars_each`` is
    returned as an int. Otherwise the value is half of the combined pair
    budget for ``context_window_tokens`` (integer division), floored at 1.
    """
    per_side_setting = settings.eval_judge_max_compare_transcript_chars_each
    if per_side_setting > 0:
        return int(per_side_setting)
    pair_budget = eval_judge_compare_transcript_pair_total_budget_for_context(
        context_window_tokens
    )
    half_budget = pair_budget // 2
    return half_budget if half_budget > 1 else 1


def eval_judge_compare_bundle_caps(
    context_window_tokens: int,
) -> tuple[int, int | None]:
    """Return ``(total_cap, per_side_cap_or_None)`` for A/B transcript trimming.

    When a positive per-side cap is configured, the total is twice that cap
    and the cap itself is returned as the second item. Otherwise the total is
    the pair budget derived from ``context_window_tokens`` and the second
    item is ``None``.
    """
    per_side_cap = int(settings.eval_judge_max_compare_transcript_chars_each or 0)
    if per_side_cap <= 0:
        pair_budget = eval_judge_compare_transcript_pair_total_budget_for_context(
            context_window_tokens
        )
        return pair_budget, None
    return max(1, 2 * per_side_cap), per_side_cap


def trim_compare_transcript_pair(
    baseline: str,
    replay: str,
    *,
    total_max_chars: int,
    per_side_max_chars: int | None = None,
) -> tuple[str, str, bool, bool]:
    """Trim an A/B transcript pair so both fit into one comparison prompt.

    Both inputs are stripped first (``None``/empty is treated as ``""``).

    * If ``per_side_max_chars`` is a positive number, each side is cut to
      that many characters independently (hard per-side cap).
    * Otherwise the pair is kept whole when its combined length fits into
      ``total_max_chars`` (floored at 1). When it does not fit, characters
      are removed from the tail of the longer side until both sides are
      equally long, and any remaining excess is split between the two
      sides, with the baseline giving up the extra character when the
      excess is odd.

    Returns:
        ``(baseline_text, replay_text, baseline_truncated, replay_truncated)``.
    """
    base = (baseline or "").strip()
    rep = (replay or "").strip()

    if per_side_max_chars is not None and int(per_side_max_chars) > 0:
        side_cap = int(per_side_max_chars)
        return (
            base[:side_cap],
            rep[:side_cap],
            len(base) > side_cap,
            len(rep) > side_cap,
        )

    budget = max(1, int(total_max_chars))
    base_len, rep_len = len(base), len(rep)
    excess = base_len + rep_len - budget
    if excess <= 0:
        return base, rep, False, False

    # Work on lengths only and slice once at the end; trimming one character
    # per iteration would copy the string on every step.
    # Phase 1: shorten the longer side toward the length of the shorter one.
    levelling = min(excess, abs(base_len - rep_len))
    if base_len >= rep_len:
        base_len -= levelling
    else:
        rep_len -= levelling
    excess -= levelling

    # Phase 2: both sides are now equally long (or excess is 0). The rest is
    # shared alternately starting with the baseline, so the baseline takes
    # the rounded-up half. budget >= 1 guarantees neither length goes
    # negative.
    base_len -= (excess + 1) // 2
    rep_len -= excess // 2

    return (
        base[:base_len],
        rep[:rep_len],
        len(base) > base_len,
        len(rep) > rep_len,
    )


# Boundary note appended after a truncated whole-conversation transcript.
# `{n}` is filled with the number of characters that were kept. The text
# instructs the judge model to score long-range items conservatively and to
# record the truncation under insufficient_evidence.
_CONV_JUDGE_TRANSCRIPT_TRUNCATION_TAIL = (
    "\n\n【评审边界——输入已为截断稿】\n"
    "以上仅为全文前 {n} 个字符，其后未提供给模型。"
    "对依赖长程多轮轨迹的细项（尤其 context_memory、interview_structure、跨轮重复盘问）"
    "必须保守给分（倾向区间中低），并在 insufficient_evidence 写明「输入为截断稿，长程证据不足」；"
    "不得臆断未展示轮次中的行为；confidence 须显著降低；禁止因未见问题而默认高分或推断后半段无缺陷。\n"
)

# Boundary note appended after a truncated "up to the previous turn" excerpt
# in per-turn judging. `{n}` is filled with the number of characters kept.
_TURN_PRIOR_TRUNCATION_TAIL = (
    "\n\n【评审边界——上文节选已截断】\n"
    "「截至上一轮」节选可能仅为更长对话的前 {n} 字；跨轮重复、长程结构若无法从节选核实，"
    "须在 insufficient_evidence 说明，并对相关细项保守给分。\n"
)

# Note inserted into the streamed A/B comparison prompt when at least one of
# the two transcripts was truncated.
_COMPARE_STREAM_PAIR_TRUNCATION_NOTE = (
    "\n【评审边界】以下 A/B transcript 至少一侧为截断稿，请仅就**已展示片段**比较；"
    "不得断言未展示轮次的优劣；涉及跨轮重复盘问等须明确证据范围或说不足以判断。\n"
)


def conversation_judge_transcript_excerpt(full_transcript: str, cap: int) -> str:
    """Return the transcript text to embed in the whole-conversation prompt.

    The transcript is stripped; if it fits within ``cap`` characters (negative
    caps count as 0) it is returned unchanged. Otherwise the first ``cap``
    characters are kept and a boundary note is appended, telling the judge
    model that the input was truncated so it does not score as if it had seen
    the full text.
    """
    text = (full_transcript or "").strip()
    limit = int(cap)
    if limit < 0:
        limit = 0
    if len(text) > limit:
        return text[:limit] + _CONV_JUDGE_TRANSCRIPT_TRUNCATION_TAIL.format(n=limit)
    return text


def turn_judge_prior_excerpt(prior_transcript: str, cap: int) -> str:
    """Return the "up to the previous turn" excerpt for per-turn judging.

    The transcript is stripped; if it fits within ``cap`` characters (negative
    caps count as 0) it is returned unchanged. Otherwise the first ``cap``
    characters are kept and a boundary note about the truncation is appended.
    """
    text = (prior_transcript or "").strip()
    limit = int(cap)
    if limit < 0:
        limit = 0
    if len(text) > limit:
        return text[:limit] + _TURN_PRIOR_TRUNCATION_TAIL.format(n=limit)
    return text


@dataclass(slots=True)
class JudgeCallResult(Generic[TJudgeOutput]):
    """Outcome of one judge call: the judge output and/or an error message."""

    # Judge output; the service methods in this module set it to None when the
    # judge is not configured or the LLM call failed.
    output: TJudgeOutput | None
    # Human-readable failure description; None when no error was recorded.
    error: str | None = None


def _judge_error_message(e: LLMCallError) -> str:
    """Turn an ``LLMCallError`` into a short user-facing message.

    The message starts with a label chosen by ``e.kind`` ("invoke", "decode"
    or "validation"; any other kind gets a generic label). If ``str(e)`` is
    non-blank after stripping, it is appended after ``": "``.
    """
    labels_by_kind = {
        "invoke": "模型调用失败",
        "decode": "JSON 解析失败",
        "validation": "结果校验失败",
    }
    label = labels_by_kind.get(e.kind, "评审失败")
    reason = str(e).strip()
    if not reason:
        return label
    return f"{label}: {reason}"


def _build_memoir_judge_prompt(
    *,
    memoir_markdown: str,
    source_transcript: str = "",
    structured_evidence: str = "",
    reference_memoir_markdown: str = "",
    evidence_notes: str = "",
) -> str:
    """Build the prompt text for evidence-aware memoir judging.

    Sections are emitted in a fixed order and joined with newlines: judge
    instructions, an ordering hint, optional reviewer notes (first 1200
    characters), the interview/dialogue evidence, the structured memory
    evidence, an optional reference baseline, and finally the memoir body.
    The two evidence sections are always present; when their input is blank a
    fixed placeholder sentence is used instead. Evidence and reference texts
    are cut to ``settings.eval_judge_memoir_evidence_max_chars`` and the
    memoir body to ``settings.eval_judge_memoir_body_max_chars`` (each
    floored at 1).
    """
    transcript_text = (source_transcript or "").strip()
    structured_text = (structured_evidence or "").strip()
    baseline_text = (reference_memoir_markdown or "").strip()
    reviewer_notes = (evidence_notes or "").strip()

    evidence_limit = max(1, int(settings.eval_judge_memoir_evidence_max_chars))
    body_limit = max(1, int(settings.eval_judge_memoir_body_max_chars))

    ordering_hint = (
        "【证据与输入顺序】以下区块按优先级给出："
        "评审说明（若有）→ 原始访谈/对话证据（segment 绑定）→ 结构化记忆证据（chunk/fact/timeline/summary）"
        "→ 参考基线（若有）→ 待评成稿。**真实性、覆盖率、可追溯性以「artifact 绑定证据闭包」为准**；"
        "若证据不足，须保守打分并写 `insufficient_evidence`。"
    )
    parts: list[str] = [MEMOIR_JUDGE_INSTRUCTIONS, "", ordering_hint, ""]

    if reviewer_notes:
        parts += ["【评审说明】", reviewer_notes[:1200], ""]

    # The dialogue-evidence section is always emitted; a missing transcript is
    # replaced by an explicit "score conservatively" placeholder.
    if transcript_text:
        transcript_block = transcript_text[:evidence_limit]
    else:
        transcript_block = "无可用局部对话证据。对于记忆忠实度、事实准确性、事实覆盖率、记忆可追溯性，必须保守打分，不得凭空高分。"
    parts += ["【原始访谈/对话证据】", transcript_block, ""]

    # Same for the structured-memory section.
    if structured_text:
        structured_block = structured_text[:evidence_limit]
    else:
        structured_block = "（本 artifact 未绑定或未解析到 chunk/fact/timeline/summary 证据。）"
    parts += ["【结构化记忆证据】", structured_block, ""]

    if baseline_text:
        parts += ["【参考基线/导出成稿】", baseline_text[:evidence_limit], ""]

    parts += ["【当前回忆录正文】", memoir_markdown[:body_limit]]
    return "\n".join(parts)


class EvalJudgeService:
    """LLM-backed judge for interview turns, whole conversations and memoirs.

    Structured judgements go through ``allm_json_call`` with the matching
    output schema; the A/B comparison is streamed as free text from the LLM.
    Every public method degrades gracefully (``None`` / error result / error
    text) when no judge LLM was supplied.
    """

    def __init__(
        self,
        judge_llm: Any | None,
        *,
        context_window_tokens: int | None = None,
        http_error_vendor: EvalJudgeProvider = "deepseek",
    ) -> None:
        """Store the judge LLM handle and the context window used for budgets.

        Args:
            judge_llm: LLM object passed to ``allm_json_call`` and used for
                streaming; ``None`` (or any falsy value) disables judging.
            context_window_tokens: Context window used to size transcript
                caps; ``None`` or 0 falls back to
                ``settings.eval_judge_context_window_tokens``.
            http_error_vendor: Forwarded to ``allm_json_call`` as
                ``http_error_vendor``.
        """
        self._llm = judge_llm
        self._http_error_vendor: EvalJudgeProvider = http_error_vendor
        # `or` means both None and 0 select the configured default window.
        self._ctx_tokens = int(
            context_window_tokens or settings.eval_judge_context_window_tokens
        )

    def _conv_transcript_cap(self) -> int:
        """Return the whole-conversation transcript cap for this instance's window."""
        return eval_judge_conversation_transcript_max_chars_for_context(
            self._ctx_tokens
        )

    def _turn_prior_cap(self) -> int:
        """Return the prior-turns excerpt cap for this instance's window."""
        return eval_judge_turn_prior_transcript_max_chars_for_context(self._ctx_tokens)

    async def judge_turn(
        self,
        *,
        prior_transcript: str,
        user_utterance: str,
        assistant_reply: str,
        turn_index_0: int = 0,
    ) -> TurnJudgeOutput | None:
        """Judge a single turn given the conversation up to the previous turn.

        Args:
            prior_transcript: Transcript up to the previous turn; embedded via
                ``turn_judge_prior_excerpt`` (truncated to the prior cap).
            user_utterance: User text of this turn; first 4000 chars are used.
            assistant_reply: AI text of this turn; first 4000 chars are used.
            turn_index_0: Zero-based turn index; negative values are clamped
                to 0 and the prompt shows it one-based.

        Returns:
            The result of ``allm_json_call`` for ``TurnJudgeOutput``, or
            ``None`` when no LLM is configured or the call raised
            ``LLMCallError`` (which is logged as a warning).
        """
        if not self._llm:
            return None
        t = max(0, int(turn_index_0))
        # The prompt text below is runtime data sent to the model.
        prompt = f"""{TURN_JUDGE_INSTRUCTIONS}

【本轮位置】完整对话中当前轮次为 Turn {t + 1}（与下方节选及全量 transcript 的 `[Turn ...]` 编号一致）。evidence_refs.turn_index 请使用该编号。

【截至上一轮的对话节选】（含 `[Turn k]` 标签）
{turn_judge_prior_excerpt(prior_transcript, self._turn_prior_cap())}

【本轮用户】
{user_utterance[:4000]}

【本轮 AI】
{assistant_reply[:4000]}
"""
        try:
            return await allm_json_call(
                self._llm,
                prompt,
                TurnJudgeOutput,
                max_tokens=_TURN_MAX,
                agent="EvalJudgeService.judge_turn",
                http_error_vendor=self._http_error_vendor,
            )
        except LLMCallError as e:
            # Best-effort: a failed turn judgement is logged, not raised.
            logger.warning("turn judge failed: {}", e)
            return None

    async def judge_conversation_result(
        self, *, full_transcript: str
    ) -> JudgeCallResult[ConversationJudgeOutput]:
        """Judge a whole conversation and report failures as an error string.

        Args:
            full_transcript: Complete transcript; embedded via
                ``conversation_judge_transcript_excerpt`` (truncated to the
                conversation cap, with a boundary note when cut).

        Returns:
            ``JudgeCallResult`` with ``output`` set on success, or with
            ``output=None`` and ``error`` set when no LLM is configured or
            the call raised ``LLMCallError``.
        """
        if not self._llm:
            return JudgeCallResult(
                output=None,
                error="评审模型未配置（智谱或 DeepSeek 密钥）",
            )
        prompt = f"""{CONV_JUDGE_INSTRUCTIONS}

【完整对话】（每轮以 `[Turn k]` 开头）
{conversation_judge_transcript_excerpt(full_transcript, self._conv_transcript_cap())}
"""
        try:
            out = await allm_json_call(
                self._llm,
                prompt,
                ConversationJudgeOutput,
                max_tokens=_CONV_JUDGE_JSON_MAX,
                agent="EvalJudgeService.judge_conversation",
                http_error_vendor=self._http_error_vendor,
            )
            return JudgeCallResult(output=out)
        except LLMCallError as e:
            error = _judge_error_message(e)
            logger.warning("conversation judge failed: {}", error)
            return JudgeCallResult(output=None, error=error)

    async def judge_conversation(
        self, *, full_transcript: str
    ) -> ConversationJudgeOutput | None:
        """Judge a whole conversation and return only the output (or ``None``).

        Thin wrapper over ``judge_conversation_result`` that drops the error
        message.
        """
        result = await self.judge_conversation_result(full_transcript=full_transcript)
        return result.output

    async def stream_conversation_compare(
        self,
        *,
        baseline_transcript: str,
        replay_transcript: str,
        baseline_judge: ConversationJudgeOutput | None,
        replay_judge: ConversationJudgeOutput | None,
    ) -> AsyncIterator[str]:
        """Stream a Chinese comparison and suggestions as plain text (not JSON).

        Behaviour by inputs:
          * No LLM configured: yields one error line and stops.
          * Both judge outputs present: builds an A/B prompt containing both
            transcripts (trimmed together via ``trim_compare_transcript_pair``)
            and both score JSONs; a boundary note is included when either
            side was truncated.
          * Only ``replay_judge`` present: builds a single-transcript prompt
            from ``COMPARE_CONV_STREAM_HINT``, the replay transcript excerpt
            and its score JSON.
          * ``replay_judge`` missing: yields one error line and stops.

        Yields:
            The truthy ``content`` attribute of each streamed chunk. If
            streaming raises, the exception is logged and a final
            "stream interrupted" marker containing the error is yielded.
        """
        if not self._llm:
            yield "[错误] 未配置评审模型 API Key（智谱：eval_judge_api_key / zhipu_api_key；DeepSeek：deepseek_api_key）"
            return
        cap_total, per_side = eval_judge_compare_bundle_caps(self._ctx_tokens)
        # Cap used only by the replay-only branch below.
        cap_single = self._conv_transcript_cap()
        b_tr, r_tr, b_cmp_trunc, r_cmp_trunc = trim_compare_transcript_pair(
            baseline_transcript or "",
            replay_transcript or "",
            total_max_chars=cap_total,
            per_side_max_chars=per_side,
        )
        compare_pair_truncated = b_cmp_trunc or r_cmp_trunc
        # Missing judge outputs are rendered as the JSON literal "null".
        b_json = (
            baseline_judge.model_dump_json(ensure_ascii=False)
            if baseline_judge
            else "null"
        )
        r_json = (
            replay_judge.model_dump_json(ensure_ascii=False) if replay_judge else "null"
        )
        if baseline_judge and replay_judge:
            trunc_line = (
                _COMPARE_STREAM_PAIR_TRUNCATION_NOTE if compare_pair_truncated else ""
            )
            prompt = f"""你是访谈对话评测专家。下面给出两份对话 transcript 及各自的整体打分（JSON）。请用中文直接写正文（不要用 JSON、不要用 Markdown 代码块）：

【A：导出基准对话】（历史快照：用户与当时导出的线上 AI，多轮合并为一篇）
{b_tr}

【B：本次回放/新测对话】（用户句与基准对齐，AI 为当前后端重新生成）
{r_tr}
{trunc_line}
【A 的整体评分 JSON】
{b_json}

【B 的整体评分 JSON】
{r_json}

请依次撰写：
1) 两段对话在整体体验上的主要差异（情绪承接、信息挖掘、人物建模、访谈结构、提问质量、上下文与重复盘问等）；
2) B 相对 A 的优点与不足；
3) 若 B 在关键维度明显弱于 A，给出可操作的改进方向（系统提示、访谈策略、模型或温度等）。

笔调简洁、偏执行清单。"""
        elif replay_judge:
            # Replay-only: the single transcript gets the whole-conversation cap.
            r_one = conversation_judge_transcript_excerpt(
                replay_transcript or "", cap_single
            )
            prompt = f"""{COMPARE_CONV_STREAM_HINT}

【回放/新测 transcript】
{r_one}

【整体评分 JSON】
{r_json}
"""
        else:
            yield "[错误] 缺少回放对话评分，无法生成建议"
            return

        llm = self._llm
        # Only LLM objects exposing `bind` get the completion limit applied.
        if hasattr(llm, "bind"):
            llm = llm.bind(max_tokens=_COMPARE_STREAM_MAX)
        try:
            async for chunk in llm.astream(prompt):
                # Chunks without a (truthy) `content` attribute are skipped.
                piece = getattr(chunk, "content", None)
                if piece:
                    yield piece
        except Exception as e:
            # Boundary handler: surface the failure in the stream instead of
            # raising into the consumer.
            logger.warning("conversation compare stream failed: {}", e)
            yield f"\n\n[流式输出中断：{e}]"

    async def judge_memoir(
        self,
        *,
        memoir_markdown: str,
        source_transcript: str = "",
        structured_evidence: str = "",
        reference_memoir_markdown: str = "",
        evidence_notes: str = "",
    ) -> MemoirJudgeOutput | None:
        """Judge a memoir and return only the output (or ``None``).

        Thin wrapper over ``judge_memoir_result`` that forwards all arguments
        and drops the error message.
        """
        result = await self.judge_memoir_result(
            memoir_markdown=memoir_markdown,
            source_transcript=source_transcript,
            structured_evidence=structured_evidence,
            reference_memoir_markdown=reference_memoir_markdown,
            evidence_notes=evidence_notes,
        )
        return result.output

    async def judge_memoir_result(
        self,
        *,
        memoir_markdown: str,
        source_transcript: str = "",
        structured_evidence: str = "",
        reference_memoir_markdown: str = "",
        evidence_notes: str = "",
    ) -> JudgeCallResult[MemoirJudgeOutput]:
        """Judge a memoir against its evidence and report failures as text.

        The prompt is assembled by ``_build_memoir_judge_prompt`` from the
        given arguments. The completion limit is
        ``settings.eval_judge_memoir_completion_max_tokens`` with a floor of
        512.

        Returns:
            ``JudgeCallResult`` with ``output`` set on success, or with
            ``output=None`` and ``error`` set when no LLM is configured or
            the call raised ``LLMCallError``.
        """
        if not self._llm:
            return JudgeCallResult(
                output=None,
                error="评审模型未配置（智谱或 DeepSeek 密钥）",
            )
        prompt = _build_memoir_judge_prompt(
            memoir_markdown=memoir_markdown,
            source_transcript=source_transcript,
            structured_evidence=structured_evidence,
            reference_memoir_markdown=reference_memoir_markdown,
            evidence_notes=evidence_notes,
        )
        try:
            out = await allm_json_call(
                self._llm,
                prompt,
                MemoirJudgeOutput,
                max_tokens=max(
                    512, int(settings.eval_judge_memoir_completion_max_tokens)
                ),
                agent="EvalJudgeService.judge_memoir",
                http_error_vendor=self._http_error_vendor,
            )
            return JudgeCallResult(output=out)
        except LLMCallError as e:
            error = _judge_error_message(e)
            # Memoir judge failures must be visible at INFO level too (eval-web
            # troubleshooting); this is an expected path, so no stack trace.
            logger.info(
                "event=eval_memoir_judge_llm_call_failed agent=EvalJudgeService.judge_memoir msg={}",
                error,
            )
            return JudgeCallResult(output=None, error=error)
-												refactor(eval+memoir)：精简内部评测路由与服务，composite/对话摘要与 judge 能力补强

- 访谈：新增 interview_state_hints，联动 orchestrator 与提示词
- 回忆录：story_pipeline_sync/state/memory/post_commit 与 Celery 任务调整
- 基建：开发用 celery broker、compose/development 脚本、依赖注入
- eval-web：移除数据集/实验/版本等页面与流式轮询，突出 Playground
- 文档与单测同步

											
										
										
											2026-04-08 21:36:12 +08:00
+								"""评测台评审：智谱 / DeepSeek 等 OpenAI 兼容端点（结构化 JSON）。"""
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
 								from __future__ import annotations
-												feat/ eval

											
										
										
											2026-04-06 23:19:20 +08:00
+								from collections.abc import AsyncIterator
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								from dataclasses import dataclass
 								from typing import Any, Generic, TypeVar
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
-												feat(evaluation): memoir readiness, judge/replay updates, eval web playground

Add memoir_readiness_service and router tests; extend judge schemas/services, replay_service, and conversation rubric; align story route agent, payload, prompts, and story_pipeline_sync; update agent logging, config, and DI. Document internal-eval; add replayDraft util and PlaygroundPage changes in app-eval-web.

											
										
										
											2026-04-08 09:38:07 +08:00
+								from app.core.config import settings
-												feat(api): DeepSeek V4 Flash 默认、HTTP 错讯与多供应商分层

- 主链路默认 deepseek-v4-flash，DEEPSEEK_THINKING_ENABLED 对齐旧非思考 chat
- 评测台评审装配迁入 adapters/llm（deepseek_eval_judge、zhipu_eval_judge）与 eval_judge_spec
- 拆分 llm_http_openai_chat_errors 与 llm_errors（DeepSeek/智谱品牌与文档链），llm_call 支持 http_error_vendor
- EvalJudgeService 按 spec.provider 传入 allm_json_call；评测台前端文案改为 V4 Flash
- 更新 .env 示例与 staging/production 的 DEEPSEEK_MODEL；补充 openai/供应商错讯测试

Made-with: Cursor

											
										
										
											2026-04-27 14:34:30 +08:00
+								from app.core.eval_judge_spec import EvalJudgeProvider
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								from app.core.llm_call import LLMCallError, allm_json_call
 								from app.core.logging import get_logger
 								from app.features.evaluation.judge_schemas import (
 								    ConversationJudgeOutput,
 								    MemoirJudgeOutput,
 								    TurnJudgeOutput,
 								)
 								from app.features.evaluation.rubrics.conversation_v1 import (
-												feat/ eval

											
										
										
											2026-04-06 23:19:20 +08:00
+								    COMPARE_CONV_STREAM_HINT,
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								    CONV_JUDGE_INSTRUCTIONS,
 								    TURN_JUDGE_INSTRUCTIONS,
 								)
 								from app.features.evaluation.rubrics.memoir_v1 import MEMOIR_JUDGE_INSTRUCTIONS
 								logger = get_logger(__name__)
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								TJudgeOutput = TypeVar(
 								    "TJudgeOutput", TurnJudgeOutput, ConversationJudgeOutput, MemoirJudgeOutput
 								)
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								_TURN_MAX = 768
-												feat/ eval

											
										
										
											2026-04-06 23:19:20 +08:00
+								_CONV_JUDGE_JSON_MAX = 2048
-												feat(evaluation): memoir readiness, judge/replay updates, eval web playground

Add memoir_readiness_service and router tests; extend judge schemas/services, replay_service, and conversation rubric; align story route agent, payload, prompts, and story_pipeline_sync; update agent logging, config, and DI. Document internal-eval; add replayDraft util and PlaygroundPage changes in app-eval-web.

											
										
										
											2026-04-08 09:38:07 +08:00
+								_CONV_HEADER = "【完整对话】（每轮以 `[Turn k]` 开头）\n\n"
-												feat/ eval

											
										
										
											2026-04-06 23:19:20 +08:00
+								_COMPARE_STREAM_MAX = 6144
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
-												refactor(eval+memoir)：精简内部评测路由与服务，composite/对话摘要与 judge 能力补强

- 访谈：新增 interview_state_hints，联动 orchestrator 与提示词
- 回忆录：story_pipeline_sync/state/memory/post_commit 与 Celery 任务调整
- 基建：开发用 celery broker、compose/development 脚本、依赖注入
- eval-web：移除数据集/实验/版本等页面与流式轮询，突出 Playground
- 文档与单测同步

											
										
										
											2026-04-08 21:36:12 +08:00
+								def _eval_judge_prompt_char_pool_for_context(context_window_tokens: int) -> int:
-												feat(evaluation): memoir readiness, judge/replay updates, eval web playground

Add memoir_readiness_service and router tests; extend judge schemas/services, replay_service, and conversation rubric; align story route agent, payload, prompts, and story_pipeline_sync; update agent logging, config, and DI. Document internal-eval; add replayDraft util and PlaygroundPage changes in app-eval-web.

											
										
										
											2026-04-08 09:38:07 +08:00
+								    """整段请求的字符预算（由评审模型 context window 推导，保守）。"""
 								    toks = (
-												refactor(eval+memoir)：精简内部评测路由与服务，composite/对话摘要与 judge 能力补强

- 访谈：新增 interview_state_hints，联动 orchestrator 与提示词
- 回忆录：story_pipeline_sync/state/memory/post_commit 与 Celery 任务调整
- 基建：开发用 celery broker、compose/development 脚本、依赖注入
- eval-web：移除数据集/实验/版本等页面与流式轮询，突出 Playground
- 文档与单测同步

											
										
										
											2026-04-08 21:36:12 +08:00
+								        int(context_window_tokens)
-												feat(evaluation): memoir readiness, judge/replay updates, eval web playground

Add memoir_readiness_service and router tests; extend judge schemas/services, replay_service, and conversation rubric; align story route agent, payload, prompts, and story_pipeline_sync; update agent logging, config, and DI. Document internal-eval; add replayDraft util and PlaygroundPage changes in app-eval-web.

											
										
										
											2026-04-08 09:38:07 +08:00
+								        - settings.eval_judge_completion_reserve_tokens
 								        - settings.eval_judge_prompt_budget_safety_tokens
 								    )
 								    toks = max(1, toks)
 								    return max(1, int(toks / settings.eval_judge_approx_tokens_per_char))
-												refactor(eval+memoir)：精简内部评测路由与服务，composite/对话摘要与 judge 能力补强

- 访谈：新增 interview_state_hints，联动 orchestrator 与提示词
- 回忆录：story_pipeline_sync/state/memory/post_commit 与 Celery 任务调整
- 基建：开发用 celery broker、compose/development 脚本、依赖注入
- eval-web：移除数据集/实验/版本等页面与流式轮询，突出 Playground
- 文档与单测同步

											
										
										
											2026-04-08 21:36:12 +08:00
+								def _eval_judge_prompt_char_pool() -> int:
 								    return _eval_judge_prompt_char_pool_for_context(
 								        settings.eval_judge_context_window_tokens
 								    )
-												feat(evaluation): memoir readiness, judge/replay updates, eval web playground

Add memoir_readiness_service and router tests; extend judge schemas/services, replay_service, and conversation rubric; align story route agent, payload, prompts, and story_pipeline_sync; update agent logging, config, and DI. Document internal-eval; add replayDraft util and PlaygroundPage changes in app-eval-web.

											
										
										
											2026-04-08 09:38:07 +08:00
+								def eval_judge_conversation_transcript_max_chars() -> int:
-												refactor(eval+memoir)：精简内部评测路由与服务，composite/对话摘要与 judge 能力补强

- 访谈：新增 interview_state_hints，联动 orchestrator 与提示词
- 回忆录：story_pipeline_sync/state/memory/post_commit 与 Celery 任务调整
- 基建：开发用 celery broker、compose/development 脚本、依赖注入
- eval-web：移除数据集/实验/版本等页面与流式轮询，突出 Playground
- 文档与单测同步

											
										
										
											2026-04-08 21:36:12 +08:00
+								    """整段对话评审：【完整对话】transcript 最大字符数（默认 GLM 上下文）。"""
-												feat(evaluation): memoir readiness, judge/replay updates, eval web playground

Add memoir_readiness_service and router tests; extend judge schemas/services, replay_service, and conversation rubric; align story route agent, payload, prompts, and story_pipeline_sync; update agent logging, config, and DI. Document internal-eval; add replayDraft util and PlaygroundPage changes in app-eval-web.

											
										
										
											2026-04-08 09:38:07 +08:00
+								    if settings.eval_judge_max_transcript_chars > 0:
 								        return settings.eval_judge_max_transcript_chars
 								    overhead = len(CONV_JUDGE_INSTRUCTIONS) + len(_CONV_HEADER) + 32
 								    return max(1, _eval_judge_prompt_char_pool() - overhead)
-												refactor(eval+memoir)：精简内部评测路由与服务，composite/对话摘要与 judge 能力补强

- 访谈：新增 interview_state_hints，联动 orchestrator 与提示词
- 回忆录：story_pipeline_sync/state/memory/post_commit 与 Celery 任务调整
- 基建：开发用 celery broker、compose/development 脚本、依赖注入
- eval-web：移除数据集/实验/版本等页面与流式轮询，突出 Playground
- 文档与单测同步

											
										
										
											2026-04-08 21:36:12 +08:00
+								def eval_judge_conversation_transcript_max_chars_for_context(
 								    context_window_tokens: int,
 								) -> int:
 								    if settings.eval_judge_max_transcript_chars > 0:
 								        return settings.eval_judge_max_transcript_chars
 								    overhead = len(CONV_JUDGE_INSTRUCTIONS) + len(_CONV_HEADER) + 32
 								    pool = _eval_judge_prompt_char_pool_for_context(context_window_tokens)
 								    return max(1, pool - overhead)
-												feat(evaluation): memoir readiness, judge/replay updates, eval web playground

Add memoir_readiness_service and router tests; extend judge schemas/services, replay_service, and conversation rubric; align story route agent, payload, prompts, and story_pipeline_sync; update agent logging, config, and DI. Document internal-eval; add replayDraft util and PlaygroundPage changes in app-eval-web.

											
										
										
											2026-04-08 09:38:07 +08:00
+								def eval_judge_turn_prior_transcript_max_chars() -> int:
-												refactor(eval+memoir)：精简内部评测路由与服务，composite/对话摘要与 judge 能力补强

- 访谈：新增 interview_state_hints，联动 orchestrator 与提示词
- 回忆录：story_pipeline_sync/state/memory/post_commit 与 Celery 任务调整
- 基建：开发用 celery broker、compose/development 脚本、依赖注入
- eval-web：移除数据集/实验/版本等页面与流式轮询，突出 Playground
- 文档与单测同步

											
										
										
											2026-04-08 21:36:12 +08:00
+								    """逐轮评审：截至上一轮的 transcript 节选上限（默认 GLM 上下文）。"""
-												feat(evaluation): memoir readiness, judge/replay updates, eval web playground

Add memoir_readiness_service and router tests; extend judge schemas/services, replay_service, and conversation rubric; align story route agent, payload, prompts, and story_pipeline_sync; update agent logging, config, and DI. Document internal-eval; add replayDraft util and PlaygroundPage changes in app-eval-web.

											
										
										
											2026-04-08 09:38:07 +08:00
+								    if settings.eval_judge_max_transcript_chars > 0:
 								        return settings.eval_judge_max_transcript_chars
 								    static = len(TURN_JUDGE_INSTRUCTIONS) + 8800
 								    return max(1, _eval_judge_prompt_char_pool() - static)
-												refactor(eval+memoir)：精简内部评测路由与服务，composite/对话摘要与 judge 能力补强

- 访谈：新增 interview_state_hints，联动 orchestrator 与提示词
- 回忆录：story_pipeline_sync/state/memory/post_commit 与 Celery 任务调整
- 基建：开发用 celery broker、compose/development 脚本、依赖注入
- eval-web：移除数据集/实验/版本等页面与流式轮询，突出 Playground
- 文档与单测同步

											
										
										
											2026-04-08 21:36:12 +08:00
+								def eval_judge_turn_prior_transcript_max_chars_for_context(
 								    context_window_tokens: int,
 								) -> int:
 								    if settings.eval_judge_max_transcript_chars > 0:
 								        return settings.eval_judge_max_transcript_chars
 								    static = len(TURN_JUDGE_INSTRUCTIONS) + 8800
 								    pool = _eval_judge_prompt_char_pool_for_context(context_window_tokens)
 								    return max(1, pool - static)
-												feat(evaluation): memoir readiness, judge/replay updates, eval web playground

Add memoir_readiness_service and router tests; extend judge schemas/services, replay_service, and conversation rubric; align story route agent, payload, prompts, and story_pipeline_sync; update agent logging, config, and DI. Document internal-eval; add replayDraft util and PlaygroundPage changes in app-eval-web.

											
										
										
											2026-04-08 09:38:07 +08:00
def eval_judge_compare_transcript_each_max_chars() -> int:
    """Return the symmetric per-side reference cap for the default context window.

    Delegates to ``eval_judge_compare_transcript_each_max_chars_for_context``
    with ``settings.eval_judge_context_window_tokens``.
    """
    default_window = settings.eval_judge_context_window_tokens
    return eval_judge_compare_transcript_each_max_chars_for_context(default_window)
-												feat:
1. 建立问题库大纲，对应每个人生阶段槽位
2. 鼓励使用更生活化的交流语言共情与总结
3. 降低评审模型可能发生截断的概率
4. 成稿质量维度强化情感表达和上下文连贯性

											
										
										
											2026-04-09 15:32:35 +08:00
 								def eval_judge_compare_transcript_pair_total_budget_for_context(
 								    context_window_tokens: int,
 								) -> int:
 								    """A/B 同 prompt 时，两份 transcript 合计最大字符数（已扣对比模板与双份 JSON 等开销）。"""
 								    if settings.eval_judge_max_compare_transcript_chars_each > 0:
 								        return max(1, 2 * int(settings.eval_judge_max_compare_transcript_chars_each))
 								    pool = _eval_judge_prompt_char_pool_for_context(context_window_tokens)
 								    return max(1, pool - int(settings.eval_judge_compare_prompt_overhead_chars))
-												refactor(eval+memoir)：精简内部评测路由与服务，composite/对话摘要与 judge 能力补强

- 访谈：新增 interview_state_hints，联动 orchestrator 与提示词
- 回忆录：story_pipeline_sync/state/memory/post_commit 与 Celery 任务调整
- 基建：开发用 celery broker、compose/development 脚本、依赖注入
- eval-web：移除数据集/实验/版本等页面与流式轮询，突出 Playground
- 文档与单测同步

											
										
										
											2026-04-08 21:36:12 +08:00
 								def eval_judge_compare_transcript_each_max_chars_for_context(
 								    context_window_tokens: int,
 								) -> int:
-												feat:
1. 建立问题库大纲，对应每个人生阶段槽位
2. 鼓励使用更生活化的交流语言共情与总结
3. 降低评审模型可能发生截断的概率
4. 成稿质量维度强化情感表达和上下文连贯性

											
										
										
											2026-04-09 15:32:35 +08:00
+								    """单侧对称上限的参考值（auto 模式下约为合计预算的一半；供兼容与展示）。"""
-												refactor(eval+memoir)：精简内部评测路由与服务，composite/对话摘要与 judge 能力补强

- 访谈：新增 interview_state_hints，联动 orchestrator 与提示词
- 回忆录：story_pipeline_sync/state/memory/post_commit 与 Celery 任务调整
- 基建：开发用 celery broker、compose/development 脚本、依赖注入
- eval-web：移除数据集/实验/版本等页面与流式轮询，突出 Playground
- 文档与单测同步

											
										
										
											2026-04-08 21:36:12 +08:00
+								    if settings.eval_judge_max_compare_transcript_chars_each > 0:
-												feat:
1. 建立问题库大纲，对应每个人生阶段槽位
2. 鼓励使用更生活化的交流语言共情与总结
3. 降低评审模型可能发生截断的概率
4. 成稿质量维度强化情感表达和上下文连贯性

											
										
										
											2026-04-09 15:32:35 +08:00
+								        return int(settings.eval_judge_max_compare_transcript_chars_each)
 								    total = eval_judge_compare_transcript_pair_total_budget_for_context(
 								        context_window_tokens
-												refactor(eval+memoir)：精简内部评测路由与服务，composite/对话摘要与 judge 能力补强

- 访谈：新增 interview_state_hints，联动 orchestrator 与提示词
- 回忆录：story_pipeline_sync/state/memory/post_commit 与 Celery 任务调整
- 基建：开发用 celery broker、compose/development 脚本、依赖注入
- eval-web：移除数据集/实验/版本等页面与流式轮询，突出 Playground
- 文档与单测同步

											
										
										
											2026-04-08 21:36:12 +08:00
+								    )
-												feat:
1. 建立问题库大纲，对应每个人生阶段槽位
2. 鼓励使用更生活化的交流语言共情与总结
3. 降低评审模型可能发生截断的概率
4. 成稿质量维度强化情感表达和上下文连贯性

											
										
										
											2026-04-09 15:32:35 +08:00
+								    return max(1, total // 2)
 								def eval_judge_compare_bundle_caps(
 								    context_window_tokens: int,
 								) -> tuple[int, int | None]:
 								    """返回 (compare_cap_total, per_side_cap|None)，供 Playground 摘要与流式对比共用。"""
 								    per = int(settings.eval_judge_max_compare_transcript_chars_each or 0)
 								    if per > 0:
 								        return max(1, 2 * per), per
 								    return eval_judge_compare_transcript_pair_total_budget_for_context(
 								        context_window_tokens
 								    ), None
 								def trim_compare_transcript_pair(
 								    baseline: str,
 								    replay: str,
 								    *,
 								    total_max_chars: int,
 								    per_side_max_chars: int | None = None,
 								) -> tuple[str, str, bool, bool]:
 								    """A/B 对比 prompt 用：在合计预算内尽量保留全文；仅超长时优先从较长的一侧裁尾部。
 								    若配置了 eval_judge_max_compare_transcript_chars_each，则仍按单侧硬顶（与旧行为一致）。
 								    """
 								    b = (baseline or "").strip()
 								    r = (replay or "").strip()
 								    if per_side_max_chars is not None and int(per_side_max_chars) > 0:
 								        cap = int(per_side_max_chars)
 								        return b[:cap], r[:cap], len(b) > cap, len(r) > cap
 								    cap_total = max(1, int(total_max_chars))
 								    if len(b) + len(r) <= cap_total:
 								        return b, r, False, False
 								    need_drop = len(b) + len(r) - cap_total
 								    b2, r2 = b, r
 								    while need_drop > 0 and (b2 or r2):
 								        if len(b2) >= len(r2):
 								            if b2:
 								                b2 = b2[:-1]
 								                need_drop -= 1
 								            elif r2:
 								                r2 = r2[:-1]
 								                need_drop -= 1
 								            else:
 								                break
 								        else:
 								            if r2:
 								                r2 = r2[:-1]
 								                need_drop -= 1
 								            elif b2:
 								                b2 = b2[:-1]
 								                need_drop -= 1
 								            else:
 								                break
 								    return b2, r2, len(b) > len(b2), len(r) > len(r2)
# Boundary notice appended by conversation_judge_transcript_excerpt when the
# whole-conversation transcript had to be cut; ``{n}`` is filled with the
# number of characters that were kept.
_CONV_JUDGE_TRANSCRIPT_TRUNCATION_TAIL = (
    "\n\n【评审边界——输入已为截断稿】\n"
    "以上仅为全文前 {n} 个字符，其后未提供给模型。"
    "对依赖长程多轮轨迹的细项（尤其 context_memory、interview_structure、跨轮重复盘问）"
    "必须保守给分（倾向区间中低），并在 insufficient_evidence 写明「输入为截断稿，长程证据不足」；"
    "不得臆断未展示轮次中的行为；confidence 须显著降低；禁止因未见问题而默认高分或推断后半段无缺陷。\n"
)
# Boundary notice appended by turn_judge_prior_excerpt when the
# "up to the previous turn" excerpt was cut; ``{n}`` is the kept length.
_TURN_PRIOR_TRUNCATION_TAIL = (
    "\n\n【评审边界——上文节选已截断】\n"
    "「截至上一轮」节选可能仅为更长对话的前 {n} 字；跨轮重复、长程结构若无法从节选核实，"
    "须在 insufficient_evidence 说明，并对相关细项保守给分。\n"
)
# Notice for A/B comparison prompts where at least one transcript is truncated.
# NOTE(review): its call site is not in this part of the file — presumably the
# streaming compare prompt; confirm.
_COMPARE_STREAM_PAIR_TRUNCATION_NOTE = (
    "\n【评审边界】以下 A/B transcript 至少一侧为截断稿，请仅就**已展示片段**比较；"
    "不得断言未展示轮次的优劣；涉及跨轮重复盘问等须明确证据范围或说不足以判断。\n"
)
 								def conversation_judge_transcript_excerpt(full_transcript: str, cap: int) -> str:
 								    """整段评审：在 cap 内截断时在正文后附加边界说明，减少「假装看了全文」的幻觉打分。"""
 								    raw = (full_transcript or "").strip()
 								    c = max(0, int(cap))
 								    if len(raw) <= c:
 								        return raw
 								    return raw[:c] + _CONV_JUDGE_TRANSCRIPT_TRUNCATION_TAIL.format(n=c)
 								def turn_judge_prior_excerpt(prior_transcript: str, cap: int) -> str:
 								    """逐轮评审里「截至上一轮」节选；截断时附加边界说明。"""
 								    raw = (prior_transcript or "").strip()
 								    c = max(0, int(cap))
 								    if len(raw) <= c:
 								        return raw
 								    return raw[:c] + _TURN_PRIOR_TRUNCATION_TAIL.format(n=c)
-												feat(evaluation): memoir readiness, judge/replay updates, eval web playground

Add memoir_readiness_service and router tests; extend judge schemas/services, replay_service, and conversation rubric; align story route agent, payload, prompts, and story_pipeline_sync; update agent logging, config, and DI. Document internal-eval; add replayDraft util and PlaygroundPage changes in app-eval-web.

											
										
										
											2026-04-08 09:38:07 +08:00
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
@dataclass(slots=True)
class JudgeCallResult(Generic[TJudgeOutput]):
    """Outcome of a judge call: a parsed output and/or an error message."""

    # Parsed judge output; None when no result was produced.
    output: TJudgeOutput | None
    # Human-readable failure reason; None by default.
    error: str | None = None
 								def _judge_error_message(e: LLMCallError) -> str:
 								    prefix = {
 								        "invoke": "模型调用失败",
 								        "decode": "JSON 解析失败",
 								        "validation": "结果校验失败",
 								    }.get(e.kind, "评审失败")
 								    detail = str(e).strip()
 								    return f"{prefix}: {detail}" if detail else prefix
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
def _build_memoir_judge_prompt(
    *,
    memoir_markdown: str,
    source_transcript: str = "",
    structured_evidence: str = "",
    reference_memoir_markdown: str = "",
    evidence_notes: str = "",
) -> str:
    """Assemble the evidence-aware prompt for judging a memoir draft.

    Sections are emitted in this order and joined with newlines: rubric,
    ordering hint, optional reviewer notes (first 1200 characters), raw
    interview evidence or a "no evidence" warning, structured memory evidence
    or a "nothing bound" placeholder, optional reference baseline, and the
    memoir body. Evidence sections are cut to
    ``settings.eval_judge_memoir_evidence_max_chars`` and the body to
    ``settings.eval_judge_memoir_body_max_chars`` (each at least 1).
    """
    transcript_text = (source_transcript or "").strip()
    structured_text = (structured_evidence or "").strip()
    reference_text = (reference_memoir_markdown or "").strip()
    notes_text = (evidence_notes or "").strip()

    # Adjacent literals form one single-line hint inside the prompt.
    ordering_hint = (
        "【证据与输入顺序】以下区块按优先级给出："
        "评审说明（若有）→ 原始访谈/对话证据（segment 绑定）→ 结构化记忆证据（chunk/fact/timeline/summary）"
        "→ 参考基线（若有）→ 待评成稿。**真实性、覆盖率、可追溯性以「artifact 绑定证据闭包」为准**；"
        "若证据不足，须保守打分并写 `insufficient_evidence`。"
    )
    parts = [MEMOIR_JUDGE_INSTRUCTIONS, "", ordering_hint, ""]

    evidence_limit = max(1, int(settings.eval_judge_memoir_evidence_max_chars))
    body_limit = max(1, int(settings.eval_judge_memoir_body_max_chars))

    if notes_text:
        parts += ["【评审说明】", notes_text[:1200], ""]

    # The raw-evidence heading is always present; only its content varies.
    parts.append("【原始访谈/对话证据】")
    if transcript_text:
        parts.append(transcript_text[:evidence_limit])
    else:
        parts.append(
            "无可用局部对话证据。对于记忆忠实度、事实准确性、事实覆盖率、记忆可追溯性，必须保守打分，不得凭空高分。"
        )
    parts.append("")

    parts.append("【结构化记忆证据】")
    if structured_text:
        parts.append(structured_text[:evidence_limit])
    else:
        parts.append("（本 artifact 未绑定或未解析到 chunk/fact/timeline/summary 证据。）")
    parts.append("")

    if reference_text:
        parts += ["【参考基线/导出成稿】", reference_text[:evidence_limit], ""]

    parts += ["【当前回忆录正文】", memoir_markdown[:body_limit]]
    return "\n".join(parts)
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
 								class EvalJudgeService:
-												refactor(eval+memoir)：精简内部评测路由与服务，composite/对话摘要与 judge 能力补强

- 访谈：新增 interview_state_hints，联动 orchestrator 与提示词
- 回忆录：story_pipeline_sync/state/memory/post_commit 与 Celery 任务调整
- 基建：开发用 celery broker、compose/development 脚本、依赖注入
- eval-web：移除数据集/实验/版本等页面与流式轮询，突出 Playground
- 文档与单测同步

											
										
										
											2026-04-08 21:36:12 +08:00
+								    def __init__(
 								        self,
 								        judge_llm: Any | None,
 								        *,
 								        context_window_tokens: int | None = None,
-												feat(api): DeepSeek V4 Flash 默认、HTTP 错讯与多供应商分层

- 主链路默认 deepseek-v4-flash，DEEPSEEK_THINKING_ENABLED 对齐旧非思考 chat
- 评测台评审装配迁入 adapters/llm（deepseek_eval_judge、zhipu_eval_judge）与 eval_judge_spec
- 拆分 llm_http_openai_chat_errors 与 llm_errors（DeepSeek/智谱品牌与文档链），llm_call 支持 http_error_vendor
- EvalJudgeService 按 spec.provider 传入 allm_json_call；评测台前端文案改为 V4 Flash
- 更新 .env 示例与 staging/production 的 DEEPSEEK_MODEL；补充 openai/供应商错讯测试

Made-with: Cursor

											
										
										
											2026-04-27 14:34:30 +08:00
+								        http_error_vendor: EvalJudgeProvider = "deepseek",
-												refactor(eval+memoir)：精简内部评测路由与服务，composite/对话摘要与 judge 能力补强

- 访谈：新增 interview_state_hints，联动 orchestrator 与提示词
- 回忆录：story_pipeline_sync/state/memory/post_commit 与 Celery 任务调整
- 基建：开发用 celery broker、compose/development 脚本、依赖注入
- eval-web：移除数据集/实验/版本等页面与流式轮询，突出 Playground
- 文档与单测同步

											
										
										
											2026-04-08 21:36:12 +08:00
+								    ) -> None:
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								        self._llm = judge_llm
-												feat(api): DeepSeek V4 Flash 默认、HTTP 错讯与多供应商分层

- 主链路默认 deepseek-v4-flash，DEEPSEEK_THINKING_ENABLED 对齐旧非思考 chat
- 评测台评审装配迁入 adapters/llm（deepseek_eval_judge、zhipu_eval_judge）与 eval_judge_spec
- 拆分 llm_http_openai_chat_errors 与 llm_errors（DeepSeek/智谱品牌与文档链），llm_call 支持 http_error_vendor
- EvalJudgeService 按 spec.provider 传入 allm_json_call；评测台前端文案改为 V4 Flash
- 更新 .env 示例与 staging/production 的 DEEPSEEK_MODEL；补充 openai/供应商错讯测试

Made-with: Cursor

											
										
										
											2026-04-27 14:34:30 +08:00
+								        self._http_error_vendor: EvalJudgeProvider = http_error_vendor
-												refactor(eval+memoir)：精简内部评测路由与服务，composite/对话摘要与 judge 能力补强

- 访谈：新增 interview_state_hints，联动 orchestrator 与提示词
- 回忆录：story_pipeline_sync/state/memory/post_commit 与 Celery 任务调整
- 基建：开发用 celery broker、compose/development 脚本、依赖注入
- eval-web：移除数据集/实验/版本等页面与流式轮询，突出 Playground
- 文档与单测同步

											
										
										
											2026-04-08 21:36:12 +08:00
+								        self._ctx_tokens = int(
 								            context_window_tokens or settings.eval_judge_context_window_tokens
 								        )
 								    def _conv_transcript_cap(self) -> int:
 								        return eval_judge_conversation_transcript_max_chars_for_context(
 								            self._ctx_tokens
 								        )
 								    def _turn_prior_cap(self) -> int:
-												feat:
1. 建立问题库大纲，对应每个人生阶段槽位
2. 鼓励使用更生活化的交流语言共情与总结
3. 降低评审模型可能发生截断的概率
4. 成稿质量维度强化情感表达和上下文连贯性

											
										
										
											2026-04-09 15:32:35 +08:00
+								        return eval_judge_turn_prior_transcript_max_chars_for_context(self._ctx_tokens)
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
    async def judge_turn(
        self,
        *,
        prior_transcript: str,
        user_utterance: str,
        assistant_reply: str,
        turn_index_0: int = 0,
    ) -> TurnJudgeOutput | None:
        """Judge a single dialogue turn with the configured judge LLM.

        Args:
            prior_transcript: Conversation up to the previous turn; embedded
                through ``turn_judge_prior_excerpt`` with this instance's cap.
            user_utterance: Current user message; only the first 4000
                characters are sent.
            assistant_reply: Current assistant reply; only the first 4000
                characters are sent.
            turn_index_0: Zero-based turn index; negative values are clamped
                to 0 and the prompt shows it one-based.

        Returns:
            The parsed ``TurnJudgeOutput``, or ``None`` when no judge LLM is
            configured or the call raises ``LLMCallError``.
        """
        if not self._llm:
            return None
        t = max(0, int(turn_index_0))
        # The prompt text below is runtime content; keep it unchanged.
        prompt = f"""{TURN_JUDGE_INSTRUCTIONS}
【本轮位置】完整对话中当前轮次为 Turn {t + 1}（与下方节选及全量 transcript 的 `[Turn ...]` 编号一致）。evidence_refs.turn_index 请使用该编号。
【截至上一轮的对话节选】（含 `[Turn k]` 标签）
{turn_judge_prior_excerpt(prior_transcript, self._turn_prior_cap())}
【本轮用户】
{user_utterance[:4000]}
【本轮 AI】
{assistant_reply[:4000]}
"""
        try:
            return await allm_json_call(
                self._llm,
                prompt,
                TurnJudgeOutput,
                max_tokens=_TURN_MAX,
                agent="EvalJudgeService.judge_turn",
                http_error_vendor=self._http_error_vendor,
            )
        except LLMCallError as e:
            # Best effort: a failed turn judgement is logged and reported as None.
            logger.warning("turn judge failed: {}", e)
            return None
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								    async def judge_conversation_result(
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								        self, *, full_transcript: str
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								    ) -> JudgeCallResult[ConversationJudgeOutput]:
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								        if not self._llm:
-												refactor(eval+memoir)：精简内部评测路由与服务，composite/对话摘要与 judge 能力补强

- 访谈：新增 interview_state_hints，联动 orchestrator 与提示词
- 回忆录：story_pipeline_sync/state/memory/post_commit 与 Celery 任务调整
- 基建：开发用 celery broker、compose/development 脚本、依赖注入
- eval-web：移除数据集/实验/版本等页面与流式轮询，突出 Playground
- 文档与单测同步

											
										
										
											2026-04-08 21:36:12 +08:00
+								            return JudgeCallResult(
 								                output=None,
 								                error="评审模型未配置（智谱或 DeepSeek 密钥）",
 								            )
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								        prompt = f"""{CONV_JUDGE_INSTRUCTIONS}
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								【完整对话】（每轮以 `[Turn k]` 开头）
-												feat:
1. 建立问题库大纲，对应每个人生阶段槽位
2. 鼓励使用更生活化的交流语言共情与总结
3. 降低评审模型可能发生截断的概率
4. 成稿质量维度强化情感表达和上下文连贯性

											
										
										
											2026-04-09 15:32:35 +08:00
+								{conversation_judge_transcript_excerpt(full_transcript, self._conv_transcript_cap())}
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								"""
 								        try:
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								            out = await allm_json_call(
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								                self._llm,
 								                prompt,
 								                ConversationJudgeOutput,
-												feat/ eval

											
										
										
											2026-04-06 23:19:20 +08:00
+								                max_tokens=_CONV_JUDGE_JSON_MAX,
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								                agent="EvalJudgeService.judge_conversation",
-												feat(api): DeepSeek V4 Flash 默认、HTTP 错讯与多供应商分层

- 主链路默认 deepseek-v4-flash，DEEPSEEK_THINKING_ENABLED 对齐旧非思考 chat
- 评测台评审装配迁入 adapters/llm（deepseek_eval_judge、zhipu_eval_judge）与 eval_judge_spec
- 拆分 llm_http_openai_chat_errors 与 llm_errors（DeepSeek/智谱品牌与文档链），llm_call 支持 http_error_vendor
- EvalJudgeService 按 spec.provider 传入 allm_json_call；评测台前端文案改为 V4 Flash
- 更新 .env 示例与 staging/production 的 DEEPSEEK_MODEL；补充 openai/供应商错讯测试

Made-with: Cursor

											
										
										
											2026-04-27 14:34:30 +08:00
+								                http_error_vendor=self._http_error_vendor,
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								            )
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								            return JudgeCallResult(output=out)
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								        except LLMCallError as e:
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								            error = _judge_error_message(e)
 								            logger.warning("conversation judge failed: {}", error)
 								            return JudgeCallResult(output=None, error=error)
 								    async def judge_conversation(
 								        self, *, full_transcript: str
 								    ) -> ConversationJudgeOutput | None:
 								        """Judge a whole conversation and return only the structured output.

 								        Thin convenience wrapper over ``judge_conversation_result``: the
 								        call result's ``output`` is returned as-is and any other field on
 								        the result (such as its error message) is discarded, so a ``None``
 								        output is passed through unchanged.

 								        Args:
 								            full_transcript: Transcript text forwarded unchanged to
 								                ``judge_conversation_result``.

 								        Returns:
 								            The ``output`` attribute of the awaited call result.
 								        """
 								        return (
 								            await self.judge_conversation_result(full_transcript=full_transcript)
 								        ).output
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
-												feat/ eval

											
										
										
											2026-04-06 23:19:20 +08:00
+								    async def stream_conversation_compare(
 								        self,
 								        *,
 								        baseline_transcript: str,
 								        replay_transcript: str,
 								        baseline_judge: ConversationJudgeOutput | None,
 								        replay_judge: ConversationJudgeOutput | None,
 								    ) -> AsyncIterator[str]:
 								        """流式输出中文对比与建议（非 JSON）。"""
 								        if not self._llm:
-												refactor(eval+memoir)：精简内部评测路由与服务，composite/对话摘要与 judge 能力补强

- 访谈：新增 interview_state_hints，联动 orchestrator 与提示词
- 回忆录：story_pipeline_sync/state/memory/post_commit 与 Celery 任务调整
- 基建：开发用 celery broker、compose/development 脚本、依赖注入
- eval-web：移除数据集/实验/版本等页面与流式轮询，突出 Playground
- 文档与单测同步

											
										
										
											2026-04-08 21:36:12 +08:00
+								            yield "[错误] 未配置评审模型 API Key（智谱：eval_judge_api_key / zhipu_api_key；DeepSeek：deepseek_api_key）"
-												feat/ eval

											
										
										
											2026-04-06 23:19:20 +08:00
+								            return
-												feat:
1. 建立问题库大纲，对应每个人生阶段槽位
2. 鼓励使用更生活化的交流语言共情与总结
3. 降低评审模型可能发生截断的概率
4. 成稿质量维度强化情感表达和上下文连贯性

											
										
										
											2026-04-09 15:32:35 +08:00
+								        cap_total, per_side = eval_judge_compare_bundle_caps(self._ctx_tokens)
-												refactor(eval+memoir)：精简内部评测路由与服务，composite/对话摘要与 judge 能力补强

- 访谈：新增 interview_state_hints，联动 orchestrator 与提示词
- 回忆录：story_pipeline_sync/state/memory/post_commit 与 Celery 任务调整
- 基建：开发用 celery broker、compose/development 脚本、依赖注入
- eval-web：移除数据集/实验/版本等页面与流式轮询，突出 Playground
- 文档与单测同步

											
										
										
											2026-04-08 21:36:12 +08:00
+								        cap_single = self._conv_transcript_cap()
-												feat:
1. 建立问题库大纲，对应每个人生阶段槽位
2. 鼓励使用更生活化的交流语言共情与总结
3. 降低评审模型可能发生截断的概率
4. 成稿质量维度强化情感表达和上下文连贯性

											
										
										
											2026-04-09 15:32:35 +08:00
+								        b_tr, r_tr, b_cmp_trunc, r_cmp_trunc = trim_compare_transcript_pair(
 								            baseline_transcript or "",
 								            replay_transcript or "",
 								            total_max_chars=cap_total,
 								            per_side_max_chars=per_side,
 								        )
 								        compare_pair_truncated = b_cmp_trunc or r_cmp_trunc
-												feat/ eval

											
										
										
											2026-04-06 23:19:20 +08:00
+								        b_json = (
 								            baseline_judge.model_dump_json(ensure_ascii=False)
 								            if baseline_judge
 								            else "null"
 								        )
 								        r_json = (
 								            replay_judge.model_dump_json(ensure_ascii=False) if replay_judge else "null"
 								        )
 								        if baseline_judge and replay_judge:
-												feat:
1. 建立问题库大纲，对应每个人生阶段槽位
2. 鼓励使用更生活化的交流语言共情与总结
3. 降低评审模型可能发生截断的概率
4. 成稿质量维度强化情感表达和上下文连贯性

											
										
										
											2026-04-09 15:32:35 +08:00
+								            trunc_line = (
 								                _COMPARE_STREAM_PAIR_TRUNCATION_NOTE if compare_pair_truncated else ""
 								            )
 								            prompt = f"""你是访谈对话评测专家。下面给出两份对话 transcript 及各自的整体打分（JSON）。请用中文直接写正文（不要用 JSON、不要用 Markdown 代码块）：
-												feat/ eval

											
										
										
											2026-04-06 23:19:20 +08:00
 								【A：导出基准对话】（历史快照：用户与当时导出的线上 AI，多轮合并为一篇）
 								{b_tr}
 								【B：本次回放/新测对话】（用户句与基准对齐，AI 为当前后端重新生成）
 								{r_tr}
-												feat:
1. 建立问题库大纲，对应每个人生阶段槽位
2. 鼓励使用更生活化的交流语言共情与总结
3. 降低评审模型可能发生截断的概率
4. 成稿质量维度强化情感表达和上下文连贯性

											
										
										
											2026-04-09 15:32:35 +08:00
+								{trunc_line}
-												feat/ eval

											
										
										
											2026-04-06 23:19:20 +08:00
+								【A 的整体评分 JSON】
 								{b_json}
 								【B 的整体评分 JSON】
 								{r_json}
 								请依次撰写：
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
+) 两段对话在整体体验上的主要差异（情绪承接、信息挖掘、人物建模、访谈结构、提问质量、上下文与重复盘问等）；
-												feat/ eval

											
										
										
											2026-04-06 23:19:20 +08:00
+) B 相对 A 的优点与不足；
 ) 若 B 在关键维度明显弱于 A，给出可操作的改进方向（系统提示、访谈策略、模型或温度等）。
 								笔调简洁、偏执行清单。"""
 								        elif replay_judge:
-												feat:
1. 建立问题库大纲，对应每个人生阶段槽位
2. 鼓励使用更生活化的交流语言共情与总结
3. 降低评审模型可能发生截断的概率
4. 成稿质量维度强化情感表达和上下文连贯性

											
										
										
											2026-04-09 15:32:35 +08:00
+								            r_one = conversation_judge_transcript_excerpt(
 								                replay_transcript or "", cap_single
 								            )
-												feat/ eval

											
										
										
											2026-04-06 23:19:20 +08:00
+								            prompt = f"""{COMPARE_CONV_STREAM_HINT}
 								【回放/新测 transcript】
-												feat(evaluation): memoir readiness, judge/replay updates, eval web playground

Add memoir_readiness_service and router tests; extend judge schemas/services, replay_service, and conversation rubric; align story route agent, payload, prompts, and story_pipeline_sync; update agent logging, config, and DI. Document internal-eval; add replayDraft util and PlaygroundPage changes in app-eval-web.

											
										
										
											2026-04-08 09:38:07 +08:00
+								{r_one}
-												feat/ eval

											
										
										
											2026-04-06 23:19:20 +08:00
 								【整体评分 JSON】
 								{r_json}
 								"""
 								        else:
 								            yield "[错误] 缺少回放对话评分，无法生成建议"
 								            return
 								        llm = self._llm
 								        if hasattr(llm, "bind"):
 								            llm = llm.bind(max_tokens=_COMPARE_STREAM_MAX)
 								        try:
 								            async for chunk in llm.astream(prompt):
 								                piece = getattr(chunk, "content", None)
 								                if piece:
 								                    yield piece
 								        except Exception as e:
 								            logger.warning("conversation compare stream failed: {}", e)
 								            yield f"\n\n[流式输出中断：{e}]"
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
+								    async def judge_memoir(
 								        self,
 								        *,
 								        memoir_markdown: str,
 								        source_transcript: str = "",
-												feat: 回忆录证据血缘与内部评测可追溯，顺带对齐本地评测台与 CI

数据库与模型：新增多版迁移（章节证据快照、对话血缘、记忆事实/时间线 lineage 等），把「成稿 ↔ 对话/记忆」的溯源信息落到表结构里。
业务链路：会话与 WS、回忆录/故事流水线、记忆写入与 enrichment 等跟着接上线索与快照；新增章节证据快照与评测侧 EvalTraceService 等模块，方便组评审用的证据包。
内部评测：自动化 run 与手工 memoir 评审共用可追溯证据；rubric/ judge 相关脚本与文档有配套调整。
app-eval-web：Memoir/实验详情里能展开看证据摘要与 evidence_trace（含对话轮次 id）；Vite 代理与 development.sh 注入的 API 端口与当前默认内部评测端口一致，避免改端口后页面连错服务。
工程杂项：GitHub Actions / 仓库说明有更新；各适配器与支付/配额/plan 等多处为小改动或跟随主改动的收尾；新增/扩充了?

											
										
										
											2026-04-08 15:37:09 +08:00
+								        structured_evidence: str = "",
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
+								        reference_memoir_markdown: str = "",
 								        evidence_notes: str = "",
 								    ) -> MemoirJudgeOutput | None:
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								        result = await self.judge_memoir_result(
 								            memoir_markdown=memoir_markdown,
 								            source_transcript=source_transcript,
-												feat: 回忆录证据血缘与内部评测可追溯，顺带对齐本地评测台与 CI

数据库与模型：新增多版迁移（章节证据快照、对话血缘、记忆事实/时间线 lineage 等），把「成稿 ↔ 对话/记忆」的溯源信息落到表结构里。
业务链路：会话与 WS、回忆录/故事流水线、记忆写入与 enrichment 等跟着接上线索与快照；新增章节证据快照与评测侧 EvalTraceService 等模块，方便组评审用的证据包。
内部评测：自动化 run 与手工 memoir 评审共用可追溯证据；rubric/ judge 相关脚本与文档有配套调整。
app-eval-web：Memoir/实验详情里能展开看证据摘要与 evidence_trace（含对话轮次 id）；Vite 代理与 development.sh 注入的 API 端口与当前默认内部评测端口一致，避免改端口后页面连错服务。
工程杂项：GitHub Actions / 仓库说明有更新；各适配器与支付/配额/plan 等多处为小改动或跟随主改动的收尾；新增/扩充了?

											
										
										
											2026-04-08 15:37:09 +08:00
+								            structured_evidence=structured_evidence,
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								            reference_memoir_markdown=reference_memoir_markdown,
 								            evidence_notes=evidence_notes,
 								        )
 								        return result.output
 								    async def judge_memoir_result(
 								        self,
 								        *,
 								        memoir_markdown: str,
 								        source_transcript: str = "",
-												feat: 回忆录证据血缘与内部评测可追溯，顺带对齐本地评测台与 CI

数据库与模型：新增多版迁移（章节证据快照、对话血缘、记忆事实/时间线 lineage 等），把「成稿 ↔ 对话/记忆」的溯源信息落到表结构里。
业务链路：会话与 WS、回忆录/故事流水线、记忆写入与 enrichment 等跟着接上线索与快照；新增章节证据快照与评测侧 EvalTraceService 等模块，方便组评审用的证据包。
内部评测：自动化 run 与手工 memoir 评审共用可追溯证据；rubric/ judge 相关脚本与文档有配套调整。
app-eval-web：Memoir/实验详情里能展开看证据摘要与 evidence_trace（含对话轮次 id）；Vite 代理与 development.sh 注入的 API 端口与当前默认内部评测端口一致，避免改端口后页面连错服务。
工程杂项：GitHub Actions / 仓库说明有更新；各适配器与支付/配额/plan 等多处为小改动或跟随主改动的收尾；新增/扩充了?

											
										
										
											2026-04-08 15:37:09 +08:00
+								        structured_evidence: str = "",
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								        reference_memoir_markdown: str = "",
 								        evidence_notes: str = "",
 								    ) -> JudgeCallResult[MemoirJudgeOutput]:
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								        if not self._llm:
-												refactor(eval+memoir)：精简内部评测路由与服务，composite/对话摘要与 judge 能力补强

- 访谈：新增 interview_state_hints，联动 orchestrator 与提示词
- 回忆录：story_pipeline_sync/state/memory/post_commit 与 Celery 任务调整
- 基建：开发用 celery broker、compose/development 脚本、依赖注入
- eval-web：移除数据集/实验/版本等页面与流式轮询，突出 Playground
- 文档与单测同步

											
										
										
											2026-04-08 21:36:12 +08:00
+								            return JudgeCallResult(
 								                output=None,
 								                error="评审模型未配置（智谱或 DeepSeek 密钥）",
 								            )
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
+								        prompt = _build_memoir_judge_prompt(
 								            memoir_markdown=memoir_markdown,
 								            source_transcript=source_transcript,
-												feat: 回忆录证据血缘与内部评测可追溯，顺带对齐本地评测台与 CI

数据库与模型：新增多版迁移（章节证据快照、对话血缘、记忆事实/时间线 lineage 等），把「成稿 ↔ 对话/记忆」的溯源信息落到表结构里。
业务链路：会话与 WS、回忆录/故事流水线、记忆写入与 enrichment 等跟着接上线索与快照；新增章节证据快照与评测侧 EvalTraceService 等模块，方便组评审用的证据包。
内部评测：自动化 run 与手工 memoir 评审共用可追溯证据；rubric/ judge 相关脚本与文档有配套调整。
app-eval-web：Memoir/实验详情里能展开看证据摘要与 evidence_trace（含对话轮次 id）；Vite 代理与 development.sh 注入的 API 端口与当前默认内部评测端口一致，避免改端口后页面连错服务。
工程杂项：GitHub Actions / 仓库说明有更新；各适配器与支付/配额/plan 等多处为小改动或跟随主改动的收尾；新增/扩充了?

											
										
										
											2026-04-08 15:37:09 +08:00
+								            structured_evidence=structured_evidence,
-												feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分，去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线；无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据（会话/用户聚合、截断）
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD；移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试

											
										
										
											2026-04-07 10:34:59 +08:00
+								            reference_memoir_markdown=reference_memoir_markdown,
 								            evidence_notes=evidence_notes,
 								        )
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								        try:
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								            out = await allm_json_call(
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								                self._llm,
 								                prompt,
 								                MemoirJudgeOutput,
-												fix:
1. 修复登录界面文字被遮挡问题
2. 大字模式关闭后显示异常问题
3. 重新调整大字模式是否开启时的字体显示效果

											
										
										
											2026-04-10 20:35:57 +08:00
+								                max_tokens=max(
 , int(settings.eval_judge_memoir_completion_max_tokens)
 								                ),
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								                agent="EvalJudgeService.judge_memoir",
-												feat(api): DeepSeek V4 Flash 默认、HTTP 错讯与多供应商分层

- 主链路默认 deepseek-v4-flash，DEEPSEEK_THINKING_ENABLED 对齐旧非思考 chat
- 评测台评审装配迁入 adapters/llm（deepseek_eval_judge、zhipu_eval_judge）与 eval_judge_spec
- 拆分 llm_http_openai_chat_errors 与 llm_errors（DeepSeek/智谱品牌与文档链），llm_call 支持 http_error_vendor
- EvalJudgeService 按 spec.provider 传入 allm_json_call；评测台前端文案改为 V4 Flash
- 更新 .env 示例与 staging/production 的 DEEPSEEK_MODEL；补充 openai/供应商错讯测试

Made-with: Cursor

											
										
										
											2026-04-27 14:34:30 +08:00
+								                http_error_vendor=self._http_error_vendor,
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								            )
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								            return JudgeCallResult(output=out)
-												feat/ 导出开发容器内的数据用于评估

											
										
										
											2026-04-03 14:44:46 +08:00
+								        except LLMCallError as e:
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								            error = _judge_error_message(e)
-												feat(eval): memoir A/B chapter judging and eval-web parity with dialogue

- Judge baseline excerpt and library chapter separately; build_memoir_compare_summary for gate, nine-dim and leaf deltas.

- Memoir SSE chapter payload: baseline_judge, compare_summary, baseline_judge_error.

- MemoirJudgeOutput: loose score coercion and post-validate clamp; memoir judge prompt caps from settings.

- app-eval-web: two-column MemoirScoreCard layout, MemoirCompareSummary, chapter blocks and CSS.

- Add memoir_compare_summary, log_events, celery_log_context, memoir_pipeline_progress; tests and migration 0014.

- Misc: memory/evidence and enrichment paths, task/orchestrator updates, internal-eval docs, env examples.

											
										
										
											2026-04-10 10:23:43 +08:00
+								            # 回忆录评审在 INFO 也要可见（eval-web 排障）；非异常路径、不刷堆栈
 								            logger.info(
 								                "event=eval_memoir_judge_llm_call_failed agent=EvalJudgeService.judge_memoir msg={}",
 								                error,
 								            )
-												feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.

											
										
										
											2026-04-07 17:15:01 +08:00
+								            return JudgeCallResult(output=None, error=error)