diff --git a/api/alembic/env.py b/api/alembic/env.py index bfbdb76..b5772dd 100644 --- a/api/alembic/env.py +++ b/api/alembic/env.py @@ -6,22 +6,23 @@ import os import sys from logging.config import fileConfig -from alembic import context from sqlalchemy import create_engine, pool +from alembic import context + sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) -from app.core.config import settings from app.core.db import Base, _database_url # 聚合注册所有 feature 的 model 到 Base.metadata from app.features.asset import models as _asset_models # noqa: F401 from app.features.auth import models as _auth_models # noqa: F401 from app.features.conversation import models as _conv_models # noqa: F401 -from app.features.memory import models as _memory_models # noqa: F401 +from app.features.evaluation import models as _eval_models # noqa: F401 from app.features.memoir import models as _memoir_models # noqa: F401 -from app.features.story import models as _story_models # noqa: F401 +from app.features.memory import models as _memory_models # noqa: F401 from app.features.payment import models as _payment_models # noqa: F401 +from app.features.story import models as _story_models # noqa: F401 from app.features.user import models as _user_models # noqa: F401 config = context.config diff --git a/api/alembic/versions/0008_eval_regression_platform.py b/api/alembic/versions/0008_eval_regression_platform.py new file mode 100644 index 0000000..ff040aa --- /dev/null +++ b/api/alembic/versions/0008_eval_regression_platform.py @@ -0,0 +1,230 @@ +"""内部回归评测平台表(eval_*) + +Revision ID: 0008_eval_regression_platform +Revises: 0007_drop_chunk_content_tsv +""" + +from typing import Sequence, Union + +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +from alembic import op + +revision: str = "0008_eval_regression_platform" +down_revision: Union[str, None] = "0007_drop_chunk_content_tsv" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, 
def upgrade() -> None:
    """Create the ``eval_*`` tables of the internal regression-evaluation platform.

    Tables are created parents-first so that every foreign-key target already
    exists: regression sets and versions, then cases, experiments, runs,
    run turns and finally gate verdicts.
    """

    # Small column/constraint factories; each call builds a fresh object so
    # nothing is shared between tables.
    def string_col(name: str, *, nullable: bool = False) -> sa.Column:
        return sa.Column(name, sa.String(), nullable=nullable)

    def text_col(name: str, *, nullable: bool = True) -> sa.Column:
        return sa.Column(name, sa.Text(), nullable=nullable)

    def float_col(name: str) -> sa.Column:
        return sa.Column(name, sa.Float(), nullable=True)

    def jsonb_col(name: str, *, nullable: bool = True) -> sa.Column:
        return sa.Column(
            name, postgresql.JSONB(astext_type=sa.Text()), nullable=nullable
        )

    def timestamp_col(name: str) -> sa.Column:
        return sa.Column(name, sa.DateTime(timezone=True), nullable=True)

    def defaulted_timestamp_col(name: str) -> sa.Column:
        # Non-null timestamp filled in by the database (``now()``).
        return sa.Column(
            name,
            sa.DateTime(timezone=True),
            server_default=sa.text("now()"),
            nullable=False,
        )

    def foreign_key(
        column: str, target: str, *, cascade: bool = False
    ) -> sa.ForeignKeyConstraint:
        options = {"ondelete": "CASCADE"} if cascade else {}
        return sa.ForeignKeyConstraint([column], [target], **options)

    def primary_key() -> sa.PrimaryKeyConstraint:
        return sa.PrimaryKeyConstraint("id")

    op.create_table(
        "eval_regression_sets",
        string_col("id"),
        string_col("name"),
        text_col("description"),
        defaulted_timestamp_col("created_at"),
        primary_key(),
    )

    op.create_table(
        "eval_versions",
        string_col("id"),
        string_col("name"),
        string_col("runner_kind"),
        jsonb_col("config_json"),
        defaulted_timestamp_col("created_at"),
        primary_key(),
    )

    op.create_table(
        "eval_cases",
        string_col("id"),
        string_col("regression_set_id"),
        string_col("source_conversation_id", nullable=True),
        string_col("source_user_id", nullable=True),
        string_col("title", nullable=True),
        jsonb_col("user_utterances", nullable=False),
        text_col("reference_memoir_markdown"),
        sa.Column(
            "is_protected",
            sa.Boolean(),
            server_default=sa.text("false"),
            nullable=False,
        ),
        jsonb_col("meta"),
        defaulted_timestamp_col("created_at"),
        foreign_key("regression_set_id", "eval_regression_sets.id", cascade=True),
        primary_key(),
    )
    # Non-unique lookup indexes on the provenance columns of a case.
    for index_name, indexed_column in (
        ("ix_eval_cases_source_conversation_id", "source_conversation_id"),
        ("ix_eval_cases_source_user_id", "source_user_id"),
    ):
        op.create_index(index_name, "eval_cases", [indexed_column], unique=False)

    op.create_table(
        "eval_experiments",
        string_col("id"),
        string_col("name"),
        string_col("regression_set_id"),
        string_col("baseline_version_id"),
        string_col("candidate_version_id"),
        string_col("rubric_pack"),
        jsonb_col("composite_weights_json"),
        string_col("status"),
        text_col("error_message"),
        defaulted_timestamp_col("created_at"),
        timestamp_col("completed_at"),
        foreign_key("baseline_version_id", "eval_versions.id"),
        foreign_key("candidate_version_id", "eval_versions.id"),
        foreign_key("regression_set_id", "eval_regression_sets.id"),
        primary_key(),
    )

    op.create_table(
        "eval_runs",
        string_col("id"),
        string_col("experiment_id"),
        string_col("case_id"),
        string_col("side"),
        string_col("status"),
        text_col("error_message"),
        text_col("memoir_markdown"),
        float_col("conversation_score_total"),
        float_col("memoir_score_total"),
        float_col("composite_score"),
        jsonb_col("judge_bundle_json"),
        timestamp_col("started_at"),
        timestamp_col("completed_at"),
        foreign_key("case_id", "eval_cases.id"),
        foreign_key("experiment_id", "eval_experiments.id", cascade=True),
        primary_key(),
        # At most one run per (experiment, case, side).
        sa.UniqueConstraint(
            "experiment_id",
            "case_id",
            "side",
            name="uq_eval_run_experiment_case_side",
        ),
    )

    op.create_table(
        "eval_run_turns",
        string_col("id"),
        string_col("run_id"),
        sa.Column("turn_index", sa.Integer(), nullable=False),
        text_col("user_utterance", nullable=False),
        text_col("assistant_reply"),
        sa.Column("duration_ms", sa.Integer(), nullable=True),
        jsonb_col("judge_scores_json"),
        text_col("judge_rationale"),
        foreign_key("run_id", "eval_runs.id", cascade=True),
        primary_key(),
        sa.UniqueConstraint("run_id", "turn_index", name="uq_eval_run_turn_index"),
    )

    op.create_table(
        "eval_gate_verdicts",
        string_col("id"),
        string_col("experiment_id"),
        sa.Column("passed", sa.Boolean(), nullable=False),
        float_col("mean_composite_delta"),
        jsonb_col("protected_regressions_json"),
        jsonb_col("details_json"),
        defaulted_timestamp_col("computed_at"),
        foreign_key("experiment_id", "eval_experiments.id", cascade=True),
        primary_key(),
        # One verdict row per experiment.
        sa.UniqueConstraint("experiment_id"),
    )
def get_eval_judge_langchain_llm():
    """Build the LangChain chat model used as the evaluation judge.

    The judge talks to an OpenAI-compatible Chat Completions endpoint
    (Zhipu GLM by default) and is configured separately from the production
    interview model.

    Returns:
        A ``ChatOpenAI`` instance, or ``None`` when neither
        ``eval_judge_api_key`` nor ``zhipu_api_key`` is configured.
    """
    from langchain_openai import ChatOpenAI

    api_key = (settings.eval_judge_api_key or settings.zhipu_api_key or "").strip()
    if not api_key:
        return None

    base = (settings.eval_judge_base_url or "").strip().rstrip("/")
    # The client appends "/chat/completions" to ``base_url`` itself, so only a
    # pasted full endpoint URL needs trimming. The version prefix ("/v1",
    # "/v4", ...) is part of the API root and must be kept; stripping it sends
    # requests to a path that does not exist on OpenAI-compatible servers.
    endpoint_suffix = "/chat/completions"
    if base.endswith(endpoint_suffix):
        base = base[: -len(endpoint_suffix)].rstrip("/")

    return ChatOpenAI(
        api_key=api_key,
        base_url=base or "https://open.bigmodel.cn/api/paas/v4",
        model=settings.eval_judge_model or "glm-4-flash",
        temperature=settings.eval_judge_temperature,
    )
+class EvaluationAdminService: + def __init__(self, db: AsyncSession) -> None: + self._db = db + + async def list_regression_sets(self) -> list[EvalRegressionSet]: + return await eval_repo.list_regression_sets(self._db) + + async def create_regression_set( + self, body: RegressionSetCreate + ) -> EvalRegressionSet: + row = await eval_repo.create_regression_set( + self._db, name=body.name, description=body.description + ) + await self._db.commit() + await self._db.refresh(row) + return row + + async def list_cases(self, set_id: str) -> list[EvalCase]: + parent = await eval_repo.get_regression_set(self._db, set_id) + if not parent: + raise EvaluationNotFoundError("regression set not found") + return await eval_repo.list_cases(self._db, set_id) + + async def create_case(self, set_id: str, body: CaseCreate) -> EvalCase: + parent = await eval_repo.get_regression_set(self._db, set_id) + if not parent: + raise EvaluationNotFoundError("regression set not found") + row = await eval_repo.create_case( + self._db, + regression_set_id=set_id, + user_utterances=body.user_utterances, + title=body.title, + source_conversation_id=body.source_conversation_id, + source_user_id=body.source_user_id, + reference_memoir_markdown=body.reference_memoir_markdown, + is_protected=body.is_protected, + meta=body.meta, + ) + await self._db.commit() + await self._db.refresh(row) + return row + + async def snapshot_from_conversation( + self, + set_id: str, + conversation_id: str, + body: SnapshotFromConversationBody, + ) -> EvalCase: + parent = await eval_repo.get_regression_set(self._db, set_id) + if not parent: + raise EvaluationNotFoundError("regression set not found") + catalog = SessionCatalogService(self._db) + tr = await catalog.get_transcript(conversation_id) + if not tr: + raise EvaluationNotFoundError("conversation not found") + utterances = ( + tr.user_utterances_from_messages + if body.use_messages + else tr.user_utterances_from_segments + ) + if not utterances: + raise 
EvaluationBadRequestError("no user utterances in session") + row = await eval_repo.create_case( + self._db, + regression_set_id=set_id, + user_utterances=utterances, + title=body.title, + source_conversation_id=conversation_id, + source_user_id=tr.user_id, + is_protected=body.is_protected, + meta={"source": "conversation_snapshot", "use_messages": body.use_messages}, + ) + await self._db.commit() + await self._db.refresh(row) + return row + + async def import_markdown_case( + self, set_id: str, body: ImportMarkdownBody + ) -> EvalCase: + parent = await eval_repo.get_regression_set(self._db, set_id) + if not parent: + raise EvaluationNotFoundError("regression set not found") + utterances = extract_user_utterances_from_export_md(body.markdown) + if not utterances: + raise EvaluationBadRequestError("no user lines parsed from markdown") + row = await eval_repo.create_case( + self._db, + regression_set_id=set_id, + user_utterances=utterances, + title=body.title, + is_protected=body.is_protected, + meta={"source": "markdown_import"}, + ) + await self._db.commit() + await self._db.refresh(row) + return row + + async def import_json_case(self, body: ImportJsonCaseBody) -> EvalCase: + parent = await eval_repo.get_regression_set(self._db, body.regression_set_id) + if not parent: + raise EvaluationNotFoundError("regression set not found") + meta_extra: dict[str, Any] + if body.utterances: + utt = [str(u).strip() for u in body.utterances if str(u).strip()] + meta_extra = {} + elif body.raw_json is not None: + raw = body.raw_json + payload_str = json.dumps(raw, ensure_ascii=False) + utt, meta_extra = parse_script_json(payload_str) + else: + raise EvaluationBadRequestError("utterances or raw_json required") + if not utt: + raise EvaluationBadRequestError("empty utterances") + row = await eval_repo.create_case( + self._db, + regression_set_id=body.regression_set_id, + user_utterances=utt, + title=body.title, + is_protected=body.is_protected, + meta={"source": "json_import", 
**meta_extra}, + ) + await self._db.commit() + await self._db.refresh(row) + return row + + async def list_versions(self) -> list[EvalVersion]: + return await eval_repo.list_versions(self._db) + + async def create_version(self, body: VersionCreate) -> EvalVersion: + row = await eval_repo.create_version( + self._db, + name=body.name, + runner_kind=body.runner_kind, + config_json=body.config_json, + ) + await self._db.commit() + await self._db.refresh(row) + return row + + async def list_experiments(self, *, limit: int) -> list[EvalExperiment]: + return await eval_repo.list_experiments(self._db, limit=limit) + + async def create_experiment(self, body: ExperimentCreate) -> EvalExperiment: + rs = await eval_repo.get_regression_set(self._db, body.regression_set_id) + if not rs: + raise EvaluationNotFoundError("regression set not found") + bv = await eval_repo.get_version(self._db, body.baseline_version_id) + cv = await eval_repo.get_version(self._db, body.candidate_version_id) + if not bv or not cv: + raise EvaluationNotFoundError("version not found") + row = await eval_repo.create_experiment( + self._db, + name=body.name, + regression_set_id=body.regression_set_id, + baseline_version_id=body.baseline_version_id, + candidate_version_id=body.candidate_version_id, + rubric_pack=body.rubric_pack, + composite_weights_json=body.composite_weights_json, + ) + await self._db.commit() + await self._db.refresh(row) + return row + + async def get_experiment_detail(self, experiment_id: str) -> ExperimentDetailBundle: + exp = await eval_repo.get_experiment(self._db, experiment_id) + if not exp: + raise EvaluationNotFoundError("experiment not found") + runs = await eval_repo.list_runs_for_experiment(self._db, experiment_id) + run_rows: list[tuple[EvalRun, list[EvalRunTurn]]] = [] + for r in runs: + turns = await eval_repo.list_turns(self._db, r.id) + run_rows.append((r, turns)) + gv = await eval_repo.get_gate_verdict(self._db, experiment_id) + return 
ExperimentDetailBundle(experiment=exp, run_rows=run_rows, gate=gv) + + async def enqueue_experiment_run(self, experiment_id: str) -> EvalExperiment: + exp = await eval_repo.get_experiment(self._db, experiment_id) + if not exp: + raise EvaluationNotFoundError("experiment not found") + run_eval_experiment_task.delay(experiment_id) + await self._db.refresh(exp) + return exp + + async def experiment_stream_snapshot( + self, experiment_id: str + ) -> dict[str, Any] | None: + from app.features.evaluation.presenters import run_out + from app.features.evaluation.schemas import GateVerdictOut + + exp = await eval_repo.get_experiment(self._db, experiment_id) + if not exp: + return None + runs = await eval_repo.list_runs_for_experiment(self._db, experiment_id) + run_payload = [] + for r in runs: + turns = await eval_repo.list_turns(self._db, r.id) + run_payload.append(run_out(r, turns).model_dump()) + gv = await eval_repo.get_gate_verdict(self._db, experiment_id) + return { + "experiment_id": experiment_id, + "status": exp.status, + "runs": run_payload, + "gate": GateVerdictOut.model_validate(gv).model_dump() if gv else None, + } diff --git a/api/app/features/evaluation/candidate_runner.py b/api/app/features/evaluation/candidate_runner.py new file mode 100644 index 0000000..4cba4e4 --- /dev/null +++ b/api/app/features/evaluation/candidate_runner.py @@ -0,0 +1,83 @@ +"""独立候选回放:多轮 user 链式调用 LLM,不走路由 WS / ChatOrchestrator。""" + +from __future__ import annotations + +import time +from typing import Any + +from app.core.logging import get_logger + +logger = get_logger(__name__) + + +def _system_prompt_for_eval(version_config: dict | None) -> str: + cfg = version_config or {} + extra = (cfg.get("system_prompt_suffix") or "").strip() + base = """你是「岁月留书」老年友好访谈员。语气温暖、耐心,先承接情绪再 gently 追问事实与感受;回答简洁分段,避免术语。""" + if extra: + return f"{base}\n\n{extra}" + return base + + +def _model_override(version_config: dict | None) -> str | None: + if not version_config: + return None + m = 
def simple_memoir_from_transcript(utterances: list[str], replies: list[str]) -> str:
    """Render a placeholder memoir draft for the judge (not the production pipeline).

    Each user utterance becomes a numbered section; the interviewer reply at
    the same position is included when one exists. Both texts are stripped.
    """
    sections = ["# 访谈摘录整理(评测占位稿)", ""]
    reply_count = len(replies)
    for number, user_text in enumerate(utterances, start=1):
        sections += [f"## 片段 {number}", "", f"**用户:** {user_text.strip()}"]
        if number <= reply_count:
            sections += ["", f"**访谈者:** {replies[number - 1].strip()}"]
        sections.append("")
    return "\n".join(sections)
Annotated + +from fastapi import Depends +from sqlalchemy.ext.asyncio import AsyncSession + +from app.core.db import get_async_db +from app.features.evaluation.admin_service import EvaluationAdminService + + +def get_evaluation_admin_service( + db: Annotated[AsyncSession, Depends(get_async_db)], +) -> EvaluationAdminService: + return EvaluationAdminService(db) diff --git a/api/app/features/evaluation/errors.py b/api/app/features/evaluation/errors.py new file mode 100644 index 0000000..67e9e16 --- /dev/null +++ b/api/app/features/evaluation/errors.py @@ -0,0 +1,13 @@ +"""评测 API 领域异常(由 router 映射为 HTTP 状态码)。""" + + +class EvaluationNotFoundError(Exception): + def __init__(self, detail: str = "not found") -> None: + self.detail = detail + super().__init__(detail) + + +class EvaluationBadRequestError(Exception): + def __init__(self, detail: str) -> None: + self.detail = detail + super().__init__(detail) diff --git a/api/app/features/evaluation/execution_service.py b/api/app/features/evaluation/execution_service.py new file mode 100644 index 0000000..39943ce --- /dev/null +++ b/api/app/features/evaluation/execution_service.py @@ -0,0 +1,252 @@ +"""执行单次评测 run 与整实验(供 Celery / 内联调试)。""" + +from __future__ import annotations + +from datetime import datetime, timezone +from typing import Any + +from sqlalchemy.ext.asyncio import AsyncSession + +from app.core.config import settings +from app.core.db import AsyncSessionLocal +from app.core.dependencies import get_eval_judge_langchain_llm, get_llm_provider +from app.core.logging import get_logger +from app.features.evaluation import repo as eval_repo +from app.features.evaluation.candidate_runner import ( + EvalCandidateRunner, + simple_memoir_from_transcript, +) +from app.features.evaluation.gate_report_service import gate_result_to_details +from app.features.evaluation.gating_service import compute_gate +from app.features.evaluation.judge_service import EvalJudgeService +from app.features.evaluation.models import EvalCase, 
async def execute_eval_run(
    db: AsyncSession,
    *,
    run: EvalRun,
    case: EvalCase,
    version: EvalVersion,
) -> None:
    """Replay one case against one version, judge it and persist the outcome.

    Stages: replay the case's user utterances through the production LLM,
    judge every turn, judge the whole conversation and the placeholder memoir,
    then store the composite score on ``run``.

    Every failure path marks the run ``failed`` and commits, so a run is never
    left in ``running`` and the experiment can always be finalised.
    """

    async def _mark_failed(message: str) -> None:
        # Single place that records a terminal failure and persists it.
        await eval_repo.update_run(
            db,
            run,
            status="failed",
            error_message=message,
            completed_at=datetime.now(timezone.utc),
        )
        await db.commit()

    if not settings.eval_execution_enabled:
        await _mark_failed("EVAL_EXECUTION_ENABLED=false")
        return

    utterances = _utterances_for_case(case)
    if not utterances:
        await _mark_failed("empty user_utterances")
        return

    await eval_repo.update_run(
        db,
        run,
        status="running",
        started_at=datetime.now(timezone.utc),
        error_message=None,
    )
    await db.commit()

    provider_llm = getattr(get_llm_provider(), "langchain_llm", None)
    if provider_llm is None:
        await _mark_failed("生产 LLM 未配置")
        return

    judge = EvalJudgeService(get_eval_judge_langchain_llm())
    runner = EvalCandidateRunner(provider_llm)
    cfg = version.config_json if isinstance(version.config_json, dict) else None

    try:
        replies, latencies = await runner.replay_utterances(
            utterances,
            version_config=cfg,
            temperature=settings.eval_candidate_temperature,
        )
    except Exception as e:
        logger.exception("eval replay failed: {}", e)
        await _mark_failed(str(e)[:2000])
        return

    try:
        transcript_parts = [
            f"用户: {u}\nAI: {reply}" for u, reply in zip(utterances, replies)
        ]

        prior = ""
        for idx, (u, reply) in enumerate(zip(utterances, replies)):
            lat = latencies[idx] if idx < len(latencies) else None
            tj = await judge.judge_turn(
                prior_transcript=prior,
                user_utterance=u,
                assistant_reply=reply,
            )
            await eval_repo.add_turn(
                db,
                run_id=run.id,
                turn_index=idx,
                user_utterance=u,
                assistant_reply=reply,
                duration_ms=lat,
                judge_scores_json=tj.model_dump() if tj else None,
                judge_rationale=tj.rationale if tj else None,
            )
            await db.commit()
            # Keep only the tail of the running transcript as judge context.
            prior = (prior + f"\n用户: {u}\nAI: {reply}")[-8000:]

        full_transcript = "\n\n".join(transcript_parts)
        conv_out = await judge.judge_conversation(full_transcript=full_transcript)
        conv_total = conv_out.total_score if conv_out else None

        memoir_md = simple_memoir_from_transcript(utterances, replies)
        mem_out = await judge.judge_memoir(memoir_markdown=memoir_md)
        mem_total = mem_out.total_score if mem_out else None

        exp = await eval_repo.get_experiment(db, run.experiment_id)
        weights = exp.composite_weights_json if exp else None
        comp = _composite(conv_total, mem_total, weights)
    except Exception as e:
        # Same policy as the replay stage: a judging or persistence error must
        # terminate the run as "failed" rather than leave it "running".
        logger.exception("eval judging failed: {}", e)
        await _mark_failed(str(e)[:2000])
        return

    bundle: dict[str, Any] = {
        "conversation_judge": conv_out.model_dump() if conv_out else None,
        "memoir_judge": mem_out.model_dump() if mem_out else None,
    }
    await eval_repo.update_run(
        db,
        run,
        status="completed",
        memoir_markdown=memoir_md,
        conversation_score_total=conv_total,
        memoir_score_total=mem_total,
        composite_score=comp,
        judge_bundle_json=bundle,
        completed_at=datetime.now(timezone.utc),
    )
    await db.commit()
experiment_id) + exp = await eval_repo.get_experiment(db, experiment_id) + if not exp: + return + cases = await eval_repo.list_cases(db, exp.regression_set_id) + + incomplete = [r for r in runs if r.status not in ("completed", "failed")] + if incomplete: + return + + failed = [r for r in runs if r.status == "failed"] + if failed: + await eval_repo.update_experiment( + db, + exp, + status="failed", + error_message="部分 run 失败", + completed_at=datetime.now(timezone.utc), + ) + await db.commit() + return + + gr = compute_gate(cases=cases, runs=runs) + await eval_repo.upsert_gate_verdict( + db, + experiment_id=experiment_id, + passed=gr.passed, + mean_composite_delta=gr.mean_delta, + protected_regressions_json=gr.protected_regressions, + details_json=gate_result_to_details(gr), + ) + await eval_repo.update_experiment( + db, + exp, + status="completed", + completed_at=datetime.now(timezone.utc), + ) + await db.commit() + + +async def execute_experiment_full(experiment_id: str) -> None: + async with AsyncSessionLocal() as db: + exp = await eval_repo.get_experiment(db, experiment_id) + if not exp: + return + await eval_repo.update_experiment(db, exp, status="running") + await db.commit() + + cases = await eval_repo.list_cases(db, exp.regression_set_id) + base_v = await eval_repo.get_version(db, exp.baseline_version_id) + cand_v = await eval_repo.get_version(db, exp.candidate_version_id) + if not base_v or not cand_v: + await eval_repo.update_experiment( + db, + exp, + status="failed", + error_message="version 不存在", + completed_at=datetime.now(timezone.utc), + ) + await db.commit() + return + + for case in cases: + for side, ver in ("baseline", base_v), ("candidate", cand_v): + run = await eval_repo.get_run(db, experiment_id, case.id, side) + if not run: + run = await eval_repo.create_run( + db, + experiment_id=experiment_id, + case_id=case.id, + side=side, + ) + await db.commit() + await execute_eval_run(db, run=run, case=case, version=ver) + + await 
@dataclass
class GateResult:
    """Outcome of the release gate for one experiment."""

    passed: bool
    mean_baseline_composite: float
    mean_candidate_composite: float
    mean_delta: float
    protected_regressions: list[dict[str, Any]]
    per_case: list[dict[str, Any]]


def compute_gate(
    *,
    cases: list[EvalCase],
    runs: list[EvalRun],
    regression_threshold: float | None = None,
) -> GateResult:
    """Compare baseline and candidate composite scores and decide the gate.

    Only cases with a completed, scored run on both sides are counted. The
    gate passes when at least one such case exists, the candidate mean exceeds
    the baseline mean, and no protected case dropped by more than the
    threshold (taken from settings when ``regression_threshold`` is ``None``).
    """
    if regression_threshold is None:
        threshold = settings.eval_gate_protected_regression_threshold
    else:
        threshold = regression_threshold

    # case_id -> {side: run}, completed runs only.
    completed: dict[str, dict[str, EvalRun]] = {}
    for run in runs:
        if run.status == "completed":
            completed.setdefault(run.case_id, {})[run.side] = run

    cases_by_id = {case.id: case for case in cases}
    rows: list[dict[str, Any]] = []
    regressions: list[dict[str, Any]] = []
    baseline_scores: list[float] = []
    candidate_scores: list[float] = []

    for case_id, by_side in completed.items():
        baseline_run = by_side.get("baseline")
        candidate_run = by_side.get("candidate")
        if not baseline_run or not candidate_run:
            continue
        if baseline_run.composite_score is None:
            continue
        if candidate_run.composite_score is None:
            continue

        baseline_score = float(baseline_run.composite_score)
        candidate_score = float(candidate_run.composite_score)
        delta = candidate_score - baseline_score
        baseline_scores.append(baseline_score)
        candidate_scores.append(candidate_score)

        case = cases_by_id.get(case_id)
        title = case.title if case else None
        is_protected = bool(case and case.is_protected)
        rows.append(
            {
                "case_id": case_id,
                "title": title,
                "baseline_composite": baseline_score,
                "candidate_composite": candidate_score,
                "delta": delta,
                "protected": is_protected,
            }
        )
        if is_protected and delta < -threshold:
            regressions.append(
                {
                    "case_id": case_id,
                    "title": title,
                    "delta": delta,
                    "threshold": threshold,
                }
            )

    def _mean(values: list[float]) -> float:
        return sum(values) / len(values) if values else 0.0

    mean_baseline = _mean(baseline_scores)
    mean_candidate = _mean(candidate_scores)

    return GateResult(
        passed=(
            bool(baseline_scores)
            and not regressions
            and mean_candidate > mean_baseline + 1e-6
        ),
        mean_baseline_composite=mean_baseline,
        mean_candidate_composite=mean_candidate,
        mean_delta=mean_candidate - mean_baseline,
        protected_regressions=regressions,
        per_case=rows,
    )
data.get("utterances") or data.get("user_utterances") or [] + if not isinstance(u, list): + raise ValueError("utterances 必须是数组") + utterances = [str(x).strip() for x in u if str(x).strip()] + meta = { + k: v for k, v in data.items() if k not in ("utterances", "user_utterances") + } + return utterances, meta + raise ValueError("根须为数组或对象") diff --git a/api/app/features/evaluation/importers/user_export_markdown.py b/api/app/features/evaluation/importers/user_export_markdown.py new file mode 100644 index 0000000..cc03580 --- /dev/null +++ b/api/app/features/evaluation/importers/user_export_markdown.py @@ -0,0 +1,19 @@ +"""从 extract_sql_to_user_md 产出的 Markdown 中提取用户轮次。""" + +from __future__ import annotations + +import re + + +def extract_user_utterances_from_export_md(text: str) -> list[str]: + """匹配 ``**用户:**`` 块之间的正文。""" + out: list[str] = [] + for m in re.finditer( + r"\*\*用户:\*\*\s*\n+(.+?)(?=\n+\*\*AI:\*\*|\n+####|\Z)", + text, + flags=re.DOTALL | re.IGNORECASE, + ): + chunk = (m.group(1) or "").strip() + if chunk and chunk != "(空)": + out.append(chunk) + return out diff --git a/api/app/features/evaluation/internal_auth.py b/api/app/features/evaluation/internal_auth.py new file mode 100644 index 0000000..96c64b9 --- /dev/null +++ b/api/app/features/evaluation/internal_auth.py @@ -0,0 +1,61 @@ +"""Internal evaluation API:共享密钥鉴权,不依赖终端用户 JWT。""" + +from typing import Annotated + +from fastapi import Depends, Header, HTTPException, status + +from app.core.config import settings +from app.core.logging import get_logger + +logger = get_logger(__name__) + +INTERNAL_HEADER = "X-Internal-Eval-Key" + + +class InternalEvalPrincipal: + """已通过内部密钥校验的调用方(占位,便于后续扩展多密钥/审计)。""" + + def __init__(self, *, key_id: str = "default") -> None: + self.key_id = key_id + + +def require_internal_eval_enabled() -> None: + if not (settings.internal_eval_api_key or "").strip(): + logger.warning("internal_eval_api_key 未配置,内部评测 API 拒绝访问") + raise HTTPException( + 
status_code=status.HTTP_503_SERVICE_UNAVAILABLE, + detail="内部评测服务未启用(缺少 INTERNAL_EVAL_API_KEY)", + ) + + +def verify_internal_eval_key( + *, + header_value: str | None = None, + query_value: str | None = None, +) -> InternalEvalPrincipal: + """Header 或 query(供 EventSource 等无法带头场景)。""" + require_internal_eval_enabled() + expected = (settings.internal_eval_api_key or "").strip() + if not expected: + raise HTTPException( + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, + detail="内部评测服务未启用", + ) + provided = (header_value or query_value or "").strip() + if not provided or provided != expected: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="无效的内部评测密钥", + ) + return InternalEvalPrincipal() + + +async def get_internal_eval_principal( + x_internal_eval_key: Annotated[str | None, Header(alias=INTERNAL_HEADER)] = None, +) -> InternalEvalPrincipal: + return verify_internal_eval_key(header_value=x_internal_eval_key) + + +InternalEvalAuth = Annotated[ + InternalEvalPrincipal, Depends(get_internal_eval_principal) +] diff --git a/api/app/features/evaluation/judge_schemas.py b/api/app/features/evaluation/judge_schemas.py new file mode 100644 index 0000000..05ddac2 --- /dev/null +++ b/api/app/features/evaluation/judge_schemas.py @@ -0,0 +1,41 @@ +"""评审 LLM 结构化输出(json_object)。""" + +from __future__ import annotations + +from pydantic import BaseModel, Field + + +class TurnJudgeOutput(BaseModel): + """单轮对话质量(情绪强化 rubric 子集 + 总分)。""" + + total_score: float = Field(ge=0, le=100) + emotion_score: float = Field(default=0, ge=0, le=30) + information_score: float = Field(default=0, ge=0, le=25) + structure_score: float = Field(default=0, ge=0, le=15) + question_score: float = Field(default=0, ge=0, le=15) + persona_score: float = Field(default=0, ge=0, le=15) + rationale: str = "" + + +class ConversationJudgeOutput(BaseModel): + """整条对话 transcript 的综合分。""" + + total_score: float = Field(ge=0, le=100) + dimension_scores: dict[str, float] = 
Field(default_factory=dict) + rationale: str = "" + + +class MemoirJudgeOutput(BaseModel): + """成稿回忆录评分。""" + + total_score: float = Field(ge=0, le=100) + authenticity_score: float = Field(default=0, ge=0, le=25) + information_score: float = Field(default=0, ge=0, le=15) + narrative_score: float = Field(default=0, ge=0, le=15) + language_score: float = Field(default=0, ge=0, le=20) + emotion_score: float = Field(default=0, ge=0, le=10) + character_score: float = Field(default=0, ge=0, le=10) + coherence_score: float = Field(default=0, ge=0, le=5) + richness_score: float = Field(default=0, ge=0, le=5) + publish_ready_score: float = Field(default=0, ge=0, le=5) + rationale: str = "" diff --git a/api/app/features/evaluation/judge_service.py b/api/app/features/evaluation/judge_service.py new file mode 100644 index 0000000..51569ac --- /dev/null +++ b/api/app/features/evaluation/judge_service.py @@ -0,0 +1,103 @@ +"""GLM 评审调用(结构化 JSON)。""" + +from __future__ import annotations + +from typing import Any + +from app.core.llm_call import LLMCallError, allm_json_call +from app.core.logging import get_logger +from app.features.evaluation.judge_schemas import ( + ConversationJudgeOutput, + MemoirJudgeOutput, + TurnJudgeOutput, +) +from app.features.evaluation.rubrics.conversation_v1 import ( + CONV_JUDGE_INSTRUCTIONS, + TURN_JUDGE_INSTRUCTIONS, +) +from app.features.evaluation.rubrics.memoir_v1 import MEMOIR_JUDGE_INSTRUCTIONS + +logger = get_logger(__name__) + +_TURN_MAX = 768 +_CONV_MAX = 8192 +_MEMOIR_MAX = 12000 + + +class EvalJudgeService: + def __init__(self, judge_llm: Any | None) -> None: + self._llm = judge_llm + + async def judge_turn( + self, + *, + prior_transcript: str, + user_utterance: str, + assistant_reply: str, + ) -> TurnJudgeOutput | None: + if not self._llm: + return None + prompt = f"""{TURN_JUDGE_INSTRUCTIONS} + +【截至上一轮的对话摘要/节选】 +{prior_transcript[:_CONV_MAX]} + +【本轮用户】 +{user_utterance[:4000]} + +【本轮 AI】 +{assistant_reply[:4000]} +""" + try: + return 
await allm_json_call( + self._llm, + prompt, + TurnJudgeOutput, + max_tokens=_TURN_MAX, + agent="EvalJudgeService.judge_turn", + ) + except LLMCallError as e: + logger.warning("turn judge failed: {}", e) + return None + + async def judge_conversation( + self, *, full_transcript: str + ) -> ConversationJudgeOutput | None: + if not self._llm: + return None + prompt = f"""{CONV_JUDGE_INSTRUCTIONS} + +【完整对话】 +{full_transcript[:_CONV_MAX]} +""" + try: + return await allm_json_call( + self._llm, + prompt, + ConversationJudgeOutput, + max_tokens=_TURN_MAX, + agent="EvalJudgeService.judge_conversation", + ) + except LLMCallError as e: + logger.warning("conversation judge failed: {}", e) + return None + + async def judge_memoir(self, *, memoir_markdown: str) -> MemoirJudgeOutput | None: + if not self._llm: + return None + prompt = f"""{MEMOIR_JUDGE_INSTRUCTIONS} + +【回忆录正文】 +{memoir_markdown[:_MEMOIR_MAX]} +""" + try: + return await allm_json_call( + self._llm, + prompt, + MemoirJudgeOutput, + max_tokens=_TURN_MAX, + agent="EvalJudgeService.judge_memoir", + ) + except LLMCallError as e: + logger.warning("memoir judge failed: {}", e) + return None diff --git a/api/app/features/evaluation/models.py b/api/app/features/evaluation/models.py new file mode 100644 index 0000000..a90b04a --- /dev/null +++ b/api/app/features/evaluation/models.py @@ -0,0 +1,194 @@ +"""ORM:内部回归评测(与生产 conversation 表隔离)。""" + +from __future__ import annotations + +from sqlalchemy import ( + JSON, + Boolean, + Column, + DateTime, + Float, + ForeignKey, + Integer, + String, + Text, + UniqueConstraint, +) +from sqlalchemy.orm import relationship + +from app.core.db import Base, utc_now + + +class EvalRegressionSet(Base): + __tablename__ = "eval_regression_sets" + + id = Column(String, primary_key=True) + name = Column(String, nullable=False) + description = Column(Text, nullable=True) + created_at = Column(DateTime(timezone=True), default=utc_now, nullable=False) + + cases = relationship( + "EvalCase", + 
back_populates="regression_set", + cascade="all, delete-orphan", + ) + experiments = relationship("EvalExperiment", back_populates="regression_set") + + +class EvalCase(Base): + """从真实会话快照或导入脚本的不可变用户轮次列表。""" + + __tablename__ = "eval_cases" + + id = Column(String, primary_key=True) + regression_set_id = Column( + String, ForeignKey("eval_regression_sets.id"), nullable=False + ) + source_conversation_id = Column(String, nullable=True, index=True) + source_user_id = Column(String, nullable=True, index=True) + title = Column(String, nullable=True) + user_utterances = Column(JSON, nullable=False) + reference_memoir_markdown = Column(Text, nullable=True) + is_protected = Column( + Boolean, nullable=False, default=False, server_default="false" + ) + meta = Column(JSON, nullable=True) + created_at = Column(DateTime(timezone=True), default=utc_now, nullable=False) + + regression_set = relationship("EvalRegressionSet", back_populates="cases") + runs = relationship("EvalRun", back_populates="case") + + +class EvalVersion(Base): + """基线或候选:回放配置(模型、system 补充等)。""" + + __tablename__ = "eval_versions" + + id = Column(String, primary_key=True) + name = Column(String, nullable=False) + runner_kind = Column(String, nullable=False, default="llm_chat_v1") + config_json = Column(JSON, nullable=True) + created_at = Column(DateTime(timezone=True), default=utc_now, nullable=False) + + experiments_as_baseline = relationship( + "EvalExperiment", + foreign_keys="EvalExperiment.baseline_version_id", + back_populates="baseline_version", + ) + experiments_as_candidate = relationship( + "EvalExperiment", + foreign_keys="EvalExperiment.candidate_version_id", + back_populates="candidate_version", + ) + + +class EvalExperiment(Base): + __tablename__ = "eval_experiments" + + id = Column(String, primary_key=True) + name = Column(String, nullable=False) + regression_set_id = Column( + String, ForeignKey("eval_regression_sets.id"), nullable=False + ) + baseline_version_id = Column(String, 
ForeignKey("eval_versions.id"), nullable=False) + candidate_version_id = Column( + String, ForeignKey("eval_versions.id"), nullable=False + ) + rubric_pack = Column(String, nullable=False, default="conversation_v1+memoir_v1") + composite_weights_json = Column(JSON, nullable=True) + status = Column(String, nullable=False, default="pending") + error_message = Column(Text, nullable=True) + created_at = Column(DateTime(timezone=True), default=utc_now, nullable=False) + completed_at = Column(DateTime(timezone=True), nullable=True) + + regression_set = relationship("EvalRegressionSet", back_populates="experiments") + baseline_version = relationship( + "EvalVersion", + foreign_keys=[baseline_version_id], + ) + candidate_version = relationship( + "EvalVersion", + foreign_keys=[candidate_version_id], + ) + runs = relationship( + "EvalRun", back_populates="experiment", cascade="all, delete-orphan" + ) + gate_verdict = relationship( + "EvalGateVerdict", + back_populates="experiment", + uselist=False, + cascade="all, delete-orphan", + ) + + +class EvalRun(Base): + """单次:某 experiment × 某 case × baseline 或 candidate。""" + + __tablename__ = "eval_runs" + + id = Column(String, primary_key=True) + experiment_id = Column(String, ForeignKey("eval_experiments.id"), nullable=False) + case_id = Column(String, ForeignKey("eval_cases.id"), nullable=False) + side = Column(String, nullable=False) + status = Column(String, nullable=False, default="pending") + error_message = Column(Text, nullable=True) + memoir_markdown = Column(Text, nullable=True) + conversation_score_total = Column(Float, nullable=True) + memoir_score_total = Column(Float, nullable=True) + composite_score = Column(Float, nullable=True) + judge_bundle_json = Column(JSON, nullable=True) + started_at = Column(DateTime(timezone=True), nullable=True) + completed_at = Column(DateTime(timezone=True), nullable=True) + + experiment = relationship("EvalExperiment", back_populates="runs") + case = relationship("EvalCase", 
back_populates="runs") + turns = relationship( + "EvalRunTurn", + back_populates="run", + cascade="all, delete-orphan", + order_by="EvalRunTurn.turn_index", + ) + + __table_args__ = ( + UniqueConstraint( + "experiment_id", + "case_id", + "side", + name="uq_eval_run_experiment_case_side", + ), + ) + + +class EvalRunTurn(Base): + __tablename__ = "eval_run_turns" + + id = Column(String, primary_key=True) + run_id = Column(String, ForeignKey("eval_runs.id"), nullable=False) + turn_index = Column(Integer, nullable=False) + user_utterance = Column(Text, nullable=False) + assistant_reply = Column(Text, nullable=True) + duration_ms = Column(Integer, nullable=True) + judge_scores_json = Column(JSON, nullable=True) + judge_rationale = Column(Text, nullable=True) + + run = relationship("EvalRun", back_populates="turns") + + __table_args__ = ( + UniqueConstraint("run_id", "turn_index", name="uq_eval_run_turn_index"), + ) + + +class EvalGateVerdict(Base): + __tablename__ = "eval_gate_verdicts" + + id = Column(String, primary_key=True) + experiment_id = Column( + String, ForeignKey("eval_experiments.id"), nullable=False, unique=True + ) + passed = Column(Boolean, nullable=False) + mean_composite_delta = Column(Float, nullable=True) + protected_regressions_json = Column(JSON, nullable=True) + details_json = Column(JSON, nullable=True) + computed_at = Column(DateTime(timezone=True), default=utc_now, nullable=False) + + experiment = relationship("EvalExperiment", back_populates="gate_verdict") diff --git a/api/app/features/evaluation/presenters.py b/api/app/features/evaluation/presenters.py new file mode 100644 index 0000000..7223cea --- /dev/null +++ b/api/app/features/evaluation/presenters.py @@ -0,0 +1,25 @@ +"""ORM → API schema 映射(供 REST / SSE 共用)。""" + +from __future__ import annotations + +from app.features.evaluation.schemas import CaseOut, EvalRunOut, RunTurnOut + + +def case_out(row) -> CaseOut: + return CaseOut.model_validate(row) + + +def run_out(row, turns: list) -> 
EvalRunOut: + return EvalRunOut( + id=row.id, + experiment_id=row.experiment_id, + case_id=row.case_id, + side=row.side, + status=row.status, + error_message=row.error_message, + memoir_markdown=row.memoir_markdown, + conversation_score_total=row.conversation_score_total, + memoir_score_total=row.memoir_score_total, + composite_score=row.composite_score, + turns=[RunTurnOut.model_validate(t) for t in turns], + ) diff --git a/api/app/features/evaluation/repo.py b/api/app/features/evaluation/repo.py new file mode 100644 index 0000000..70c4f0e --- /dev/null +++ b/api/app/features/evaluation/repo.py @@ -0,0 +1,315 @@ +"""评测域数据访问。""" + +from __future__ import annotations + +import uuid +from typing import Any + +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.features.evaluation.models import ( + EvalCase, + EvalExperiment, + EvalGateVerdict, + EvalRegressionSet, + EvalRun, + EvalRunTurn, + EvalVersion, +) + + +def _id() -> str: + return str(uuid.uuid4()).replace("-", "")[:32] + + +async def list_regression_sets(db: AsyncSession) -> list[EvalRegressionSet]: + res = await db.execute( + select(EvalRegressionSet).order_by(EvalRegressionSet.created_at.desc()) + ) + return list(res.scalars().unique().all()) + + +async def get_regression_set(db: AsyncSession, sid: str) -> EvalRegressionSet | None: + return await db.get(EvalRegressionSet, sid) + + +async def create_regression_set( + db: AsyncSession, *, name: str, description: str | None = None +) -> EvalRegressionSet: + row = EvalRegressionSet(id=_id(), name=name, description=description) + db.add(row) + await db.flush() + return row + + +async def list_cases(db: AsyncSession, regression_set_id: str) -> list[EvalCase]: + res = await db.execute( + select(EvalCase) + .where(EvalCase.regression_set_id == regression_set_id) + .order_by(EvalCase.created_at.asc()) + ) + return list(res.scalars().all()) + + +async def create_case( + db: AsyncSession, + *, + regression_set_id: str, + 
user_utterances: list[str], + title: str | None = None, + source_conversation_id: str | None = None, + source_user_id: str | None = None, + reference_memoir_markdown: str | None = None, + is_protected: bool = False, + meta: dict[str, Any] | None = None, +) -> EvalCase: + row = EvalCase( + id=_id(), + regression_set_id=regression_set_id, + source_conversation_id=source_conversation_id, + source_user_id=source_user_id, + title=title, + user_utterances=list(user_utterances), + reference_memoir_markdown=reference_memoir_markdown, + is_protected=is_protected, + meta=meta, + ) + db.add(row) + await db.flush() + return row + + +async def get_case(db: AsyncSession, case_id: str) -> EvalCase | None: + return await db.get(EvalCase, case_id) + + +async def list_versions(db: AsyncSession) -> list[EvalVersion]: + res = await db.execute(select(EvalVersion).order_by(EvalVersion.created_at.desc())) + return list(res.scalars().all()) + + +async def create_version( + db: AsyncSession, + *, + name: str, + runner_kind: str = "llm_chat_v1", + config_json: dict[str, Any] | None = None, +) -> EvalVersion: + row = EvalVersion( + id=_id(), + name=name, + runner_kind=runner_kind, + config_json=config_json, + ) + db.add(row) + await db.flush() + return row + + +async def get_version(db: AsyncSession, vid: str) -> EvalVersion | None: + return await db.get(EvalVersion, vid) + + +async def create_experiment( + db: AsyncSession, + *, + name: str, + regression_set_id: str, + baseline_version_id: str, + candidate_version_id: str, + rubric_pack: str = "conversation_v1+memoir_v1", + composite_weights_json: dict[str, Any] | None = None, +) -> EvalExperiment: + row = EvalExperiment( + id=_id(), + name=name, + regression_set_id=regression_set_id, + baseline_version_id=baseline_version_id, + candidate_version_id=candidate_version_id, + rubric_pack=rubric_pack, + composite_weights_json=composite_weights_json, + status="pending", + ) + db.add(row) + await db.flush() + return row + + +async def 
get_experiment(db: AsyncSession, eid: str) -> EvalExperiment | None: + return await db.get(EvalExperiment, eid) + + +async def list_experiments(db: AsyncSession, limit: int = 50) -> list[EvalExperiment]: + res = await db.execute( + select(EvalExperiment).order_by(EvalExperiment.created_at.desc()).limit(limit) + ) + return list(res.scalars().all()) + + +async def update_experiment( + db: AsyncSession, + exp: EvalExperiment, + *, + status: str | None = None, + error_message: str | None = None, + completed_at: Any | None = ..., +) -> None: + if status is not None: + exp.status = status + if error_message is not None: + exp.error_message = error_message + if completed_at is not ...: + exp.completed_at = completed_at + + +async def get_run( + db: AsyncSession, experiment_id: str, case_id: str, side: str +) -> EvalRun | None: + res = await db.execute( + select(EvalRun).where( + EvalRun.experiment_id == experiment_id, + EvalRun.case_id == case_id, + EvalRun.side == side, + ) + ) + return res.scalars().first() + + +async def create_run( + db: AsyncSession, + *, + experiment_id: str, + case_id: str, + side: str, +) -> EvalRun: + row = EvalRun( + id=_id(), + experiment_id=experiment_id, + case_id=case_id, + side=side, + status="pending", + ) + db.add(row) + await db.flush() + return row + + +async def list_runs_for_experiment( + db: AsyncSession, experiment_id: str +) -> list[EvalRun]: + res = await db.execute( + select(EvalRun).where(EvalRun.experiment_id == experiment_id) + ) + return list(res.scalars().all()) + + +async def update_run( + db: AsyncSession, + run: EvalRun, + *, + status: str | None = None, + error_message: str | None = None, + memoir_markdown: str | None = None, + conversation_score_total: float | None = None, + memoir_score_total: float | None = None, + composite_score: float | None = None, + judge_bundle_json: dict[str, Any] | None = None, + started_at: Any | None = ..., + completed_at: Any | None = ..., +) -> None: + if status is not None: + run.status = 
status + if error_message is not None: + run.error_message = error_message + if memoir_markdown is not None: + run.memoir_markdown = memoir_markdown + if conversation_score_total is not None: + run.conversation_score_total = conversation_score_total + if memoir_score_total is not None: + run.memoir_score_total = memoir_score_total + if composite_score is not None: + run.composite_score = composite_score + if judge_bundle_json is not None: + run.judge_bundle_json = judge_bundle_json + if started_at is not ...: + run.started_at = started_at + if completed_at is not ...: + run.completed_at = completed_at + + +async def add_turn( + db: AsyncSession, + *, + run_id: str, + turn_index: int, + user_utterance: str, + assistant_reply: str | None, + duration_ms: int | None, + judge_scores_json: dict[str, Any] | None, + judge_rationale: str | None, +) -> EvalRunTurn: + row = EvalRunTurn( + id=_id(), + run_id=run_id, + turn_index=turn_index, + user_utterance=user_utterance, + assistant_reply=assistant_reply, + duration_ms=duration_ms, + judge_scores_json=judge_scores_json, + judge_rationale=judge_rationale, + ) + db.add(row) + await db.flush() + return row + + +async def list_turns(db: AsyncSession, run_id: str) -> list[EvalRunTurn]: + res = await db.execute( + select(EvalRunTurn) + .where(EvalRunTurn.run_id == run_id) + .order_by(EvalRunTurn.turn_index.asc()) + ) + return list(res.scalars().all()) + + +async def upsert_gate_verdict( + db: AsyncSession, + *, + experiment_id: str, + passed: bool, + mean_composite_delta: float | None, + protected_regressions_json: list[dict[str, Any]] | None, + details_json: dict[str, Any] | None, +) -> EvalGateVerdict: + res = await db.execute( + select(EvalGateVerdict).where(EvalGateVerdict.experiment_id == experiment_id) + ) + row = res.scalars().first() + if row: + row.passed = passed + row.mean_composite_delta = mean_composite_delta + row.protected_regressions_json = protected_regressions_json + row.details_json = details_json + await 
db.flush() + return row + row = EvalGateVerdict( + id=_id(), + experiment_id=experiment_id, + passed=passed, + mean_composite_delta=mean_composite_delta, + protected_regressions_json=protected_regressions_json, + details_json=details_json, + ) + db.add(row) + await db.flush() + return row + + +async def get_gate_verdict( + db: AsyncSession, experiment_id: str +) -> EvalGateVerdict | None: + res = await db.execute( + select(EvalGateVerdict).where(EvalGateVerdict.experiment_id == experiment_id) + ) + return res.scalars().first() diff --git a/api/app/features/evaluation/router.py b/api/app/features/evaluation/router.py new file mode 100644 index 0000000..848461b --- /dev/null +++ b/api/app/features/evaluation/router.py @@ -0,0 +1,268 @@ +"""内部评测 REST API。""" + +from __future__ import annotations + +from typing import Annotated + +from fastapi import APIRouter, Depends, HTTPException, Query +from sqlalchemy.ext.asyncio import AsyncSession + +from app.core.db import get_async_db +from app.features.evaluation.admin_service import EvaluationAdminService +from app.features.evaluation.deps import get_evaluation_admin_service +from app.features.evaluation.errors import ( + EvaluationBadRequestError, + EvaluationNotFoundError, +) +from app.features.evaluation.internal_auth import InternalEvalAuth +from app.features.evaluation.presenters import case_out, run_out +from app.features.evaluation.schemas import ( + CaseCreate, + CaseOut, + EvalRunOut, + ExperimentCreate, + ExperimentDetailOut, + ExperimentOut, + GateVerdictOut, + ImportJsonCaseBody, + ImportMarkdownBody, + RegressionSetCreate, + RegressionSetOut, + SessionListItem, + SessionListResponse, + SessionTranscriptOut, + SnapshotFromConversationBody, + VersionCreate, + VersionOut, +) +from app.features.evaluation.session_catalog_service import SessionCatalogService + +router = APIRouter(tags=["internal-evaluation"]) + + +def _eval_http_exc( + e: EvaluationNotFoundError | EvaluationBadRequestError, +) -> HTTPException: + if 
isinstance(e, EvaluationNotFoundError): + return HTTPException(status_code=404, detail=e.detail) + return HTTPException(status_code=400, detail=e.detail) + + +@router.get("/regression-sets", response_model=list[RegressionSetOut]) +async def list_regression_sets( + _auth: InternalEvalAuth, + svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)], +): + rows = await svc.list_regression_sets() + return [RegressionSetOut.model_validate(r) for r in rows] + + +@router.post("/regression-sets", response_model=RegressionSetOut) +async def create_regression_set( + body: RegressionSetCreate, + _auth: InternalEvalAuth, + svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)], +): + try: + row = await svc.create_regression_set(body) + except (EvaluationNotFoundError, EvaluationBadRequestError) as e: + raise _eval_http_exc(e) from e + return RegressionSetOut.model_validate(row) + + +@router.get("/regression-sets/{set_id}/cases", response_model=list[CaseOut]) +async def list_cases( + set_id: str, + _auth: InternalEvalAuth, + svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)], +): + try: + rows = await svc.list_cases(set_id) + except EvaluationNotFoundError as e: + raise _eval_http_exc(e) from e + return [case_out(r) for r in rows] + + +@router.post("/regression-sets/{set_id}/cases", response_model=CaseOut) +async def create_case( + set_id: str, + body: CaseCreate, + _auth: InternalEvalAuth, + svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)], +): + try: + row = await svc.create_case(set_id, body) + except (EvaluationNotFoundError, EvaluationBadRequestError) as e: + raise _eval_http_exc(e) from e + return case_out(row) + + +@router.post( + "/regression-sets/{set_id}/snapshot-from-conversation/{conversation_id}", + response_model=CaseOut, +) +async def snapshot_from_conversation( + set_id: str, + conversation_id: str, + body: SnapshotFromConversationBody, + _auth: InternalEvalAuth, 
+ svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)], +): + try: + row = await svc.snapshot_from_conversation(set_id, conversation_id, body) + except (EvaluationNotFoundError, EvaluationBadRequestError) as e: + raise _eval_http_exc(e) from e + return case_out(row) + + +@router.get("/sessions", response_model=SessionListResponse) +async def list_sessions( + _auth: InternalEvalAuth, + db: Annotated[AsyncSession, Depends(get_async_db)], + offset: int = Query(0, ge=0), + limit: int = Query(50, ge=1, le=200), + user_id: str | None = Query(None), + q: str | None = Query(None), +): + catalog = SessionCatalogService(db) + rows, total = await catalog.list_sessions( + offset=offset, limit=limit, user_id=user_id, q=q + ) + return SessionListResponse( + items=[ + SessionListItem( + id=r.id, + user_id=r.user_id, + started_at=r.started_at, + conversation_stage=r.conversation_stage, + current_topic=r.current_topic, + status=r.status, + ) + for r in rows + ], + total=total, + ) + + +@router.get( + "/sessions/{conversation_id}/transcript", response_model=SessionTranscriptOut +) +async def get_session_transcript( + conversation_id: str, + _auth: InternalEvalAuth, + db: Annotated[AsyncSession, Depends(get_async_db)], +): + catalog = SessionCatalogService(db) + tr = await catalog.get_transcript(conversation_id) + if not tr: + raise HTTPException(status_code=404, detail="conversation not found") + return SessionTranscriptOut( + conversation_id=tr.conversation_id, + user_id=tr.user_id, + user_utterances_from_segments=tr.user_utterances_from_segments, + user_utterances_from_messages=tr.user_utterances_from_messages, + ) + + +@router.post("/regression-sets/{set_id}/import-markdown", response_model=CaseOut) +async def import_markdown_case( + set_id: str, + body: ImportMarkdownBody, + _auth: InternalEvalAuth, + svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)], +): + try: + row = await svc.import_markdown_case(set_id, body) + except 
(EvaluationNotFoundError, EvaluationBadRequestError) as e: + raise _eval_http_exc(e) from e + return case_out(row) + + +@router.post("/import/json-case", response_model=CaseOut) +async def import_json_case( + body: ImportJsonCaseBody, + _auth: InternalEvalAuth, + svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)], +): + try: + row = await svc.import_json_case(body) + except (EvaluationNotFoundError, EvaluationBadRequestError) as e: + raise _eval_http_exc(e) from e + return case_out(row) + + +@router.get("/versions", response_model=list[VersionOut]) +async def list_versions( + _auth: InternalEvalAuth, + svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)], +): + rows = await svc.list_versions() + return [VersionOut.model_validate(r) for r in rows] + + +@router.post("/versions", response_model=VersionOut) +async def create_version( + body: VersionCreate, + _auth: InternalEvalAuth, + svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)], +): + try: + row = await svc.create_version(body) + except (EvaluationNotFoundError, EvaluationBadRequestError) as e: + raise _eval_http_exc(e) from e + return VersionOut.model_validate(row) + + +@router.get("/experiments", response_model=list[ExperimentOut]) +async def list_experiments( + _auth: InternalEvalAuth, + svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)], + limit: int = Query(50, ge=1, le=200), +): + rows = await svc.list_experiments(limit=limit) + return [ExperimentOut.model_validate(r) for r in rows] + + +@router.post("/experiments", response_model=ExperimentOut) +async def create_experiment( + body: ExperimentCreate, + _auth: InternalEvalAuth, + svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)], +): + try: + row = await svc.create_experiment(body) + except (EvaluationNotFoundError, EvaluationBadRequestError) as e: + raise _eval_http_exc(e) from e + return ExperimentOut.model_validate(row) + 
+ +@router.get("/experiments/{experiment_id}", response_model=ExperimentDetailOut) +async def get_experiment_detail( + experiment_id: str, + _auth: InternalEvalAuth, + svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)], +): + try: + bundle = await svc.get_experiment_detail(experiment_id) + except EvaluationNotFoundError as e: + raise _eval_http_exc(e) from e + run_outs: list[EvalRunOut] = [run_out(r, turns) for r, turns in bundle.run_rows] + gate = GateVerdictOut.model_validate(bundle.gate) if bundle.gate else None + return ExperimentDetailOut( + experiment=ExperimentOut.model_validate(bundle.experiment), + runs=run_outs, + gate=gate, + ) + + +@router.post("/experiments/{experiment_id}/run", response_model=ExperimentOut) +async def enqueue_experiment_run( + experiment_id: str, + _auth: InternalEvalAuth, + svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)], +): + try: + exp = await svc.enqueue_experiment_run(experiment_id) + except EvaluationNotFoundError as e: + raise _eval_http_exc(e) from e + return ExperimentOut.model_validate(exp) diff --git a/api/app/features/evaluation/rubrics/conversation_v1.py b/api/app/features/evaluation/rubrics/conversation_v1.py new file mode 100644 index 0000000..3adbaf8 --- /dev/null +++ b/api/app/features/evaluation/rubrics/conversation_v1.py @@ -0,0 +1,20 @@ +"""对话评审 rubric 文本(v1)。""" + +TURN_JUDGE_INSTRUCTIONS = """你是「岁月留书」访谈对话质量评审。根据下面维度给本轮 AI 回复打分(0-100 为 total_score,各子分上限已注明,总和应合理)。 + +维度(参考): +- 情绪承接与共情(emotion_score,最高 30) +- 信息获取与追问(information_score,最高 25) +- 结构化访谈推进(structure_score,最高 15) +- 提问质量(question_score,最高 15) +- 人物理解与一致性(persona_score,最高 15) + +输出 JSON:**json** 字段名如下: +total_score, emotion_score, information_score, structure_score, question_score, persona_score, rationale + +只输出 JSON。""" + + +CONV_JUDGE_INSTRUCTIONS = """你是访谈整段对话评审。给定完整 transcript(用户与 AI 多轮),打一个综合 total_score(0-100),并给出 dimension_scores 对象(可为空对象),以及 rationale。 + +只输出 JSON:total_score, 
dimension_scores, rationale。""" diff --git a/api/app/features/evaluation/rubrics/memoir_v1.py b/api/app/features/evaluation/rubrics/memoir_v1.py new file mode 100644 index 0000000..5bfd1ea --- /dev/null +++ b/api/app/features/evaluation/rubrics/memoir_v1.py @@ -0,0 +1,11 @@ +"""回忆录成稿评审 rubric 文本(v1)。""" + +MEMOIR_JUDGE_INSTRUCTIONS = """你是「岁月留书」回忆录成稿评审。根据真实性与覆盖、信息质量、叙事结构、语言文笔、情感、人物、连贯性、表达丰富度、出版就绪等,给出分项分(上限与 total_score 满分 100 一致)。 + +输出 JSON 字段: +total_score, +authenticity_score, information_score, narrative_score, language_score, +emotion_score, character_score, coherence_score, richness_score, publish_ready_score, +rationale + +只输出 JSON。""" diff --git a/api/app/features/evaluation/schemas.py b/api/app/features/evaluation/schemas.py new file mode 100644 index 0000000..86135e3 --- /dev/null +++ b/api/app/features/evaluation/schemas.py @@ -0,0 +1,176 @@ +"""HTTP / OpenAPI 模型。""" + +from __future__ import annotations + +from datetime import datetime +from typing import Any + +from pydantic import BaseModel, ConfigDict, Field + + +class RegressionSetCreate(BaseModel): + name: str + description: str | None = None + + +class RegressionSetOut(BaseModel): + model_config = ConfigDict(from_attributes=True) + + id: str + name: str + description: str | None + created_at: datetime + + +class CaseCreate(BaseModel): + title: str | None = None + user_utterances: list[str] + source_conversation_id: str | None = None + source_user_id: str | None = None + reference_memoir_markdown: str | None = None + is_protected: bool = False + meta: dict[str, Any] | None = None + + +class CaseOut(BaseModel): + model_config = ConfigDict(from_attributes=True) + + id: str + regression_set_id: str + source_conversation_id: str | None + source_user_id: str | None + title: str | None + user_utterances: list[Any] + is_protected: bool + created_at: datetime + + +class VersionCreate(BaseModel): + name: str + runner_kind: str = "llm_chat_v1" + config_json: dict[str, Any] | None = None + + +class 
VersionOut(BaseModel): + model_config = ConfigDict(from_attributes=True) + + id: str + name: str + runner_kind: str + config_json: dict[str, Any] | None + created_at: datetime + + +class ExperimentCreate(BaseModel): + name: str + regression_set_id: str + baseline_version_id: str + candidate_version_id: str + rubric_pack: str = "conversation_v1+memoir_v1" + composite_weights_json: dict[str, Any] | None = Field( + default=None, + description='默认 {"conversation":0.5,"memoir":0.5}', + ) + + +class ExperimentOut(BaseModel): + model_config = ConfigDict(from_attributes=True) + + id: str + name: str + regression_set_id: str + baseline_version_id: str + candidate_version_id: str + rubric_pack: str + status: str + error_message: str | None + created_at: datetime + completed_at: datetime | None + + +class SessionListItem(BaseModel): + id: str + user_id: str + started_at: datetime | None + conversation_stage: str | None + current_topic: str | None + status: str | None + + +class SessionListResponse(BaseModel): + items: list[SessionListItem] + total: int + + +class SessionTranscriptOut(BaseModel): + conversation_id: str + user_id: str + user_utterances_from_segments: list[str] + user_utterances_from_messages: list[str] + + +class SnapshotFromConversationBody(BaseModel): + title: str | None = None + use_messages: bool = False + is_protected: bool = False + + +class ImportMarkdownBody(BaseModel): + markdown: str + title: str | None = None + is_protected: bool = False + + +class ImportJsonCaseBody(BaseModel): + regression_set_id: str + utterances: list[str] | None = None + raw_json: dict[str, Any] | list[Any] | None = Field( + default=None, + description="与 utterances 二选一:对象含 utterances 键或根数组", + ) + title: str | None = None + is_protected: bool = False + + +class RunTurnOut(BaseModel): + model_config = ConfigDict(from_attributes=True) + + id: str + turn_index: int + user_utterance: str + assistant_reply: str | None + duration_ms: int | None + judge_scores_json: dict[str, Any] | 
None + judge_rationale: str | None + + +class EvalRunOut(BaseModel): + model_config = ConfigDict(from_attributes=True) + + id: str + experiment_id: str + case_id: str + side: str + status: str + error_message: str | None + memoir_markdown: str | None + conversation_score_total: float | None + memoir_score_total: float | None + composite_score: float | None + turns: list[RunTurnOut] = [] + + +class GateVerdictOut(BaseModel): + model_config = ConfigDict(from_attributes=True) + + passed: bool + mean_composite_delta: float | None + protected_regressions_json: list[dict[str, Any]] | None + details_json: dict[str, Any] | None + computed_at: datetime + + +class ExperimentDetailOut(BaseModel): + experiment: ExperimentOut + runs: list[EvalRunOut] + gate: GateVerdictOut | None diff --git a/api/app/features/evaluation/session_catalog_repo.py b/api/app/features/evaluation/session_catalog_repo.py new file mode 100644 index 0000000..9af2eb1 --- /dev/null +++ b/api/app/features/evaluation/session_catalog_repo.py @@ -0,0 +1,70 @@ +"""只读访问生产对话表,供内部浏览与快照。""" + +from __future__ import annotations + +from sqlalchemy import func, select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.features.conversation.models import Conversation, ConversationMessage, Segment + + +class SessionCatalogRepo: + def __init__(self, db: AsyncSession) -> None: + self._db = db + + async def count_conversations(self) -> int: + q = ( + select(func.count()) + .select_from(Conversation) + .where(Conversation.deleted_at.is_(None)) + ) + r = await self._db.execute(q) + return int(r.scalar() or 0) + + async def list_conversations( + self, + *, + offset: int = 0, + limit: int = 50, + user_id: str | None = None, + q_text: str | None = None, + ) -> list[Conversation]: + stmt = select(Conversation).where(Conversation.deleted_at.is_(None)) + if user_id: + stmt = stmt.where(Conversation.user_id == user_id) + stmt = stmt.order_by(Conversation.started_at.desc().nullslast()) + stmt = 
stmt.offset(offset).limit(limit) + # q_text: 简单按 topic 搜索(后续可扩展全文) + if q_text: + like = f"%{q_text.strip()}%" + stmt = stmt.where( + (Conversation.current_topic.isnot(None)) + & (Conversation.current_topic.ilike(like)) + ) + res = await self._db.execute(stmt) + return list(res.scalars().unique().all()) + + async def get_conversation(self, conversation_id: str) -> Conversation | None: + return await self._db.get(Conversation, conversation_id) + + async def list_segments_for_conversation( + self, conversation_id: str + ) -> list[Segment]: + stmt = ( + select(Segment) + .where(Segment.conversation_id == conversation_id) + .order_by(Segment.created_at.asc()) + ) + res = await self._db.execute(stmt) + return list(res.scalars().all()) + + async def list_messages_for_conversation( + self, conversation_id: str + ) -> list[ConversationMessage]: + stmt = ( + select(ConversationMessage) + .where(ConversationMessage.conversation_id == conversation_id) + .order_by(ConversationMessage.created_at.asc()) + ) + res = await self._db.execute(stmt) + return list(res.scalars().all()) diff --git a/api/app/features/evaluation/session_catalog_service.py b/api/app/features/evaluation/session_catalog_service.py new file mode 100644 index 0000000..f3d3da2 --- /dev/null +++ b/api/app/features/evaluation/session_catalog_service.py @@ -0,0 +1,80 @@ +"""历史 session 目录:列表 + 导出为用户轮次列表(用于评测快照)。""" + +from __future__ import annotations + +from dataclasses import dataclass + +from sqlalchemy.ext.asyncio import AsyncSession + +from app.features.evaluation.session_catalog_repo import SessionCatalogRepo + + +@dataclass +class SessionSummary: + id: str + user_id: str + started_at: object | None + conversation_stage: str | None + current_topic: str | None + status: str | None + + +@dataclass +class SessionTranscript: + conversation_id: str + user_id: str + user_utterances_from_segments: list[str] + user_utterances_from_messages: list[str] + + +class SessionCatalogService: + def __init__(self, db: 
AsyncSession) -> None: + self._repo = SessionCatalogRepo(db) + + async def list_sessions( + self, + *, + offset: int = 0, + limit: int = 50, + user_id: str | None = None, + q: str | None = None, + ) -> tuple[list[SessionSummary], int]: + total = await self._repo.count_conversations() + rows = await self._repo.list_conversations( + offset=offset, limit=limit, user_id=user_id, q_text=q + ) + out = [ + SessionSummary( + id=c.id, + user_id=c.user_id, + started_at=c.started_at, + conversation_stage=c.conversation_stage, + current_topic=c.current_topic, + status=c.status, + ) + for c in rows + ] + return out, total + + async def get_transcript(self, conversation_id: str) -> SessionTranscript | None: + c = await self._repo.get_conversation(conversation_id) + if not c or c.deleted_at: + return None + segs = await self._repo.list_segments_for_conversation(conversation_id) + msgs = await self._repo.list_messages_for_conversation(conversation_id) + from_segments = [ + (s.user_input_text or "").strip() + for s in segs + if (s.user_input_text or "").strip() + ] + from_messages = [ + m.content.strip() + for m in msgs + if m.role == "human" and (m.content or "").strip() + ] + return SessionTranscript( + conversation_id=c.id, + user_id=c.user_id, + user_utterances_from_segments=from_segments, + user_utterances_from_messages=from_messages, + ) diff --git a/api/app/features/evaluation/stream_router.py b/api/app/features/evaluation/stream_router.py new file mode 100644 index 0000000..621e67a --- /dev/null +++ b/api/app/features/evaluation/stream_router.py @@ -0,0 +1,45 @@ +"""实验进度 SSE(轮询 DB,轻量实现)。""" + +from __future__ import annotations + +import asyncio +import json + +from fastapi import APIRouter, Header, Query +from fastapi.responses import StreamingResponse + +from app.core.db import AsyncSessionLocal +from app.features.evaluation.admin_service import EvaluationAdminService +from app.features.evaluation.internal_auth import verify_internal_eval_key + +router = 
APIRouter(tags=["internal-evaluation-stream"]) + + +@router.get("/experiments/{experiment_id}/stream") +async def experiment_event_stream( + experiment_id: str, + key: str | None = Query( + default=None, + description="等同 X-Internal-Eval-Key,供 EventSource 使用", + ), + x_internal_eval_key: str | None = Header(default=None, alias="X-Internal-Eval-Key"), +): + verify_internal_eval_key( + header_value=x_internal_eval_key, + query_value=key, + ) + + async def event_gen(): + while True: + async with AsyncSessionLocal() as session: + svc = EvaluationAdminService(session) + payload = await svc.experiment_stream_snapshot(experiment_id) + if payload is None: + yield f"data: {json.dumps({'error': 'not_found'})}\n\n" + break + yield f"data: {json.dumps(payload, default=str)}\n\n" + if payload.get("status") in ("completed", "failed"): + break + await asyncio.sleep(1.0) + + return StreamingResponse(event_gen(), media_type="text/event-stream") diff --git a/api/app/internal_main.py b/api/app/internal_main.py new file mode 100644 index 0000000..666822c --- /dev/null +++ b/api/app/internal_main.py @@ -0,0 +1,96 @@ +""" +内部回归评测 API 入口:与 app.main 进程隔离部署。 + +启动示例(在 api/ 目录):: + + uv run uvicorn app.internal_main:internal_app --host 0.0.0.0 --port 8001 +""" + +from __future__ import annotations + +from pathlib import Path + +from app.core.logging import get_logger, setup_logging + +setup_logging() + +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware +from fastapi.staticfiles import StaticFiles + +from app.core.config import settings +from app.core.errors import register_exception_handlers +from app.core.middleware import RequestIdMiddleware +from app.features.evaluation import models as _eval_models # noqa: F401 +from app.features.evaluation.router import router as eval_router +from app.features.evaluation.stream_router import router as eval_stream_router + +logger = get_logger(__name__) + +internal_app = FastAPI( + title="Life Echo Internal Evaluation API", 
+ version="0.1.0", + docs_url="/docs" if settings.internal_eval_enable_docs else None, + redoc_url="/redoc" if settings.internal_eval_enable_docs else None, + openapi_url="/openapi.json" if settings.internal_eval_enable_docs else None, +) + +internal_app.add_middleware(RequestIdMiddleware) +_origins = [ + o.strip() + for o in (settings.internal_eval_cors_origins or "").split(",") + if o.strip() +] +# 浏览器不允许 Origin=* 与 credentials 同时出现;未配置显式白名单时关闭 credentials。 +_allow_creds = bool(_origins) +internal_app.add_middleware( + CORSMiddleware, + allow_origins=_origins if _origins else ["*"], + allow_credentials=_allow_creds, + allow_methods=["*"], + allow_headers=["*"], +) +register_exception_handlers(internal_app) + + +@internal_app.on_event("startup") +async def _startup(): + import asyncio + + from app.core.alembic_startup import run_alembic_upgrade_at_startup + + logger.info("内部评测 API 启动中…") + await asyncio.to_thread(run_alembic_upgrade_at_startup) + try: + from app.core.redis import redis_service + + await redis_service.get_client() + logger.info("Redis 已连接(评测任务可用)") + except Exception as e: + logger.warning("Redis 连接失败: {}", e) + + +@internal_app.on_event("shutdown") +async def _shutdown(): + logger.info("内部评测 API 关闭中…") + try: + from app.core.redis import redis_service + + await redis_service.close() + except Exception as e: + logger.warning("关闭 Redis 失败: {}", e) + + +internal_app.include_router(eval_router, prefix="/internal/api/evaluation") +internal_app.include_router(eval_stream_router, prefix="/internal/api/evaluation") + +_static_dir = Path(__file__).resolve().parent.parent / "static" +if _static_dir.is_dir(): + internal_app.mount( + "/static", StaticFiles(directory=str(_static_dir)), name="static" + ) + + +@internal_app.get("/health", include_in_schema=False) +async def health(): + return {"status": "ok", "service": "internal-eval"} diff --git a/api/app/tasks/celery_app.py b/api/app/tasks/celery_app.py index a4d1aa4..0a61827 100644 --- 
a/api/app/tasks/celery_app.py +++ b/api/app/tasks/celery_app.py @@ -16,6 +16,7 @@ from app.core.config import settings from app.features.asset import models as _asset_models # noqa: F401 - register Asset from app.features.auth import models as _auth_models # noqa: F401 from app.features.conversation import models as _conv_models # noqa: F401 +from app.features.evaluation import models as _eval_models # noqa: F401 from app.features.memoir import models as _memoir_models # noqa: F401 from app.features.memory import models as _memory_models # noqa: F401 from app.features.payment import models as _payment_models # noqa: F401 @@ -35,6 +36,7 @@ celery_app = Celery( "app.tasks.chapter_cover_tasks", "app.tasks.chapter_compose_tasks", "app.tasks.memory_compaction_tasks", + "app.tasks.evaluation_tasks", ], ) diff --git a/api/app/tasks/evaluation_tasks.py b/api/app/tasks/evaluation_tasks.py new file mode 100644 index 0000000..29e88d3 --- /dev/null +++ b/api/app/tasks/evaluation_tasks.py @@ -0,0 +1,20 @@ +"""评测实验 Celery 任务。""" + +from __future__ import annotations + +import asyncio + +from celery import shared_task + +from app.core.logging import get_logger + +logger = get_logger(__name__) + + +@shared_task(name="evaluation.run_experiment") +def run_eval_experiment_task(experiment_id: str) -> None: + from app.features.evaluation.execution_service import execute_experiment_full + + logger.info("evaluation task start experiment_id={}", experiment_id) + asyncio.run(execute_experiment_full(experiment_id)) + logger.info("evaluation task done experiment_id={}", experiment_id) diff --git a/api/docs/internal-eval.md b/api/docs/internal-eval.md new file mode 100644 index 0000000..c34e967 --- /dev/null +++ b/api/docs/internal-eval.md @@ -0,0 +1,45 @@ +# 内部回归评测平台 + +与主 API(`app/main.py`)隔离进程部署,避免评测候选链路透出给消费者 App。 + +## 启动 + +数据库与主服务共用;需配置环境变量后启动专用进程: + +```bash +cd api +export INTERNAL_EVAL_API_KEY='your-long-random-secret' +export INTERNAL_EVAL_ENABLE_DOCS=1 # 可选,开 /docs +# GLM 
评审(默认复用智谱 key,也可单独配置) +export EVAL_JUDGE_API_KEY='...' # 可选,默认 ZHIPU_API_KEY +export EVAL_JUDGE_MODEL='glm-4-flash' + +uv run uvicorn app.internal_main:internal_app --host 0.0.0.0 --port 8001 +``` + +Celery worker 需已包含 `app.tasks.evaluation_tasks`(仓库 `celery_app.include` 已注册)。跑实验前: + +```bash +uv run celery -A app.tasks.celery_app worker -l info +``` + +## 前端(`app-eval-web`) + +```bash +cd app-eval-web +npm install +VITE_EVAL_API_BASE=http://127.0.0.1:8001 VITE_EVAL_API_KEY=与上同 npm run dev +``` + +或使用仓库根目录 `npm run eval-web`(需本地已 `npm install` 在 `app-eval-web`)。 + +## SSE / EventSource + +浏览器 `EventSource` 无法带自定义 Header,流式端点支持 **query** `?key=`,与 `X-Internal-Eval-Key` 等效。 + +## 门禁规则(v1) + +- 所有 case 的合成均分:候选须 **严格高于** 基线。 +- `is_protected=true` 的 case:合成分跌幅不得超过 `EVAL_GATE_PROTECTED_REGRESSION_THRESHOLD`(默认 2 分)。 + +结果写入 `eval_gate_verdicts`,不影响 `git`;后续可接 pre-commit / CI。 diff --git a/api/pyproject.toml b/api/pyproject.toml index 0784ce8..cf056f0 100644 --- a/api/pyproject.toml +++ b/api/pyproject.toml @@ -55,6 +55,7 @@ ignore = ["E501", "B008", "E712"] [tool.ruff.lint.per-file-ignores] "__init__.py" = ["F401"] "main.py" = ["E402", "I001"] +"internal_main.py" = ["E402", "I001"] "app/tasks/celery_app.py" = ["E402"] "tests/conftest.py" = ["E402", "I001"] diff --git a/api/scripts/extract_sql_to_user_md.py b/api/scripts/extract_sql_to_user_md.py index f687a4b..437bbf2 100644 --- a/api/scripts/extract_sql_to_user_md.py +++ b/api/scripts/extract_sql_to_user_md.py @@ -1,17 +1,19 @@ #!/usr/bin/env python3 """ One-off: read life-echo pg_dump COPY data, emit one markdown per user -(conversations + segments + memoir chapters). +(conversations + segments + stories / story_versions + memoir chapters). 
Run from api/ (recommended, matches uv venv): uv run python scripts/extract_sql_to_user_md.py uv run python scripts/extract_sql_to_user_md.py ./tests/data/dump.sql uv run python scripts/extract_sql_to_user_md.py --all ./tests/data/ + uv run python scripts/extract_sql_to_user_md.py --only 15366015180 ./backups/dev.sql Only users listed in EXPORT_USER_KEYS (below) are exported by default. +`--only` may be repeated; it overrides EXPORT_USER_KEYS for that run. Use --all to export everyone. Use () for EXPORT_USER_KEYS to mean “all”. -Users with no conversation rows in the dump are skipped (no file written). +Users are skipped only when they have no conversations, stories, or chapters in the dump. Sessions that have no segments rows are omitted (no empty “### 会话” stubs). If you pass a directory, the newest *.sql inside is used. @@ -141,17 +143,25 @@ def parse_conversations(lines: list[str]) -> list[dict]: def parse_segments(lines: list[str]) -> list[dict]: + """Legacy dumps: 8 cols ending in agent_response. 
Current schema: 12 cols (see Segment model).""" out: list[dict] = [] for ln in lines: parts = ln.split("\t") if len(parts) < 8: continue sid, cid, audio = parts[0], parts[1], parts[2] - agent_response = unescape_pg_text(parts[-1]) if parts[-1] != r"\N" else "" - topic_category = parts[-2] if parts[-2] != r"\N" else "" - processed = parts[-3] - created_at = parts[-4] - transcript = unescape_pg_text("\t".join(parts[3:-4])) + if len(parts) >= 12: + transcript = unescape_pg_text("\t".join(parts[3:-8])) + created_at = parts[-7] + processed = parts[-6] + topic_category = parts[-5] if parts[-5] != r"\N" else "" + agent_response = unescape_pg_text(parts[-4]) if parts[-4] != r"\N" else "" + else: + agent_response = unescape_pg_text(parts[-1]) if parts[-1] != r"\N" else "" + topic_category = parts[-2] if parts[-2] != r"\N" else "" + processed = parts[-3] + created_at = parts[-4] + transcript = unescape_pg_text("\t".join(parts[3:-4])) out.append( { "id": sid, @@ -167,6 +177,98 @@ def parse_segments(lines: list[str]) -> list[dict]: return out +def parse_stories(lines: list[str]) -> list[dict]: + """COPY columns: id, user_id, title, stage, story_type, summary, canonical_markdown, + time_start..updated_at — 17 fields; canonical_markdown may contain tabs. 
+ """ + out: list[dict] = [] + tail = 10 # time_start through updated_at + for ln in lines: + parts = ln.split("\t") + if len(parts) < 17: + continue + sid, uid, title, stage, story_type = ( + parts[0], + parts[1], + parts[2], + parts[3] if parts[3] != r"\N" else "", + parts[4] if parts[4] != r"\N" else "", + ) + summary = unescape_pg_text(parts[5]) if parts[5] != r"\N" else "" + canonical = unescape_pg_text("\t".join(parts[6:-tail])) + time_start = parts[-10] if parts[-10] != r"\N" else "" + time_end = parts[-9] if parts[-9] != r"\N" else "" + people_refs = parts[-8] if parts[-8] != r"\N" else "" + place_refs = parts[-7] if parts[-7] != r"\N" else "" + tag_refs = parts[-6] if parts[-6] != r"\N" else "" + status = parts[-5] if parts[-5] != r"\N" else "" + confidence = parts[-4] if parts[-4] != r"\N" else "" + current_version_id = parts[-3] if parts[-3] != r"\N" else "" + created_at, updated_at = parts[-2], parts[-1] + out.append( + { + "id": sid, + "user_id": uid, + "title": title, + "stage": stage, + "story_type": story_type, + "summary": summary, + "canonical_markdown": canonical, + "time_start": time_start, + "time_end": time_end, + "people_refs": people_refs, + "place_refs": place_refs, + "tag_refs": tag_refs, + "status": status, + "confidence": confidence, + "current_version_id": current_version_id, + "created_at": created_at, + "updated_at": updated_at, + } + ) + return out + + +def parse_story_versions(lines: list[str]) -> list[dict]: + """COPY: id, story_id, version_no, markdown_snapshot, change_summary, actor_type, + source_type, parent_version_id, prompt_meta, created_at — markdown_snapshot may contain tabs. + """ + out: list[dict] = [] + tail = 6 # change_summary .. 
created_at + for ln in lines: + parts = ln.split("\t") + if len(parts) < 10: + continue + vid, story_id = parts[0], parts[1] + version_no_raw = parts[2] + try: + version_no = int(version_no_raw) + except ValueError: + version_no = 0 + markdown_snapshot = unescape_pg_text("\t".join(parts[3:-tail])) + change_summary = unescape_pg_text(parts[-6]) if parts[-6] != r"\N" else "" + actor_type = parts[-5] if parts[-5] != r"\N" else "" + source_type = parts[-4] if parts[-4] != r"\N" else "" + parent_version_id = parts[-3] if parts[-3] != r"\N" else "" + prompt_meta = parts[-2] if parts[-2] != r"\N" else "" + created_at = parts[-1] + out.append( + { + "id": vid, + "story_id": story_id, + "version_no": version_no, + "markdown_snapshot": markdown_snapshot, + "change_summary": change_summary, + "actor_type": actor_type, + "source_type": source_type, + "parent_version_id": parent_version_id, + "prompt_meta": prompt_meta, + "created_at": created_at, + } + ) + return out + + def parse_chapters(lines: list[str]) -> list[dict]: """12 columns; content may contain tabs — unpack from the right.""" out: list[dict] = [] @@ -226,21 +328,33 @@ def resolve_sql_arg(raw: str | None) -> Path: return p -def _parse_cli_args(argv: list[str]) -> tuple[Path | None, bool]: - """Returns (sql_path_arg or None, export_all).""" +def _parse_cli_args(argv: list[str]) -> tuple[Path | None, bool, tuple[str, ...]]: + """Returns (sql_path_arg or None, export_all, only_keys or ()).""" export_all = False - rest: list[str] = [] - for a in argv: + only_keys: list[str] = [] + positional: list[str] = [] + i = 0 + while i < len(argv): + a = argv[i] if a == "--all": export_all = True + i += 1 + elif a == "--only": + if i + 1 >= len(argv): + sys.exit("--only requires a value (user id or phone)") + only_keys.append(argv[i + 1]) + i += 2 else: - rest.append(a) - sql_arg = rest[0] if rest else None - return sql_arg, export_all + positional.append(a) + i += 1 + if len(positional) > 1: + sys.exit("Extra arguments: pass at 
most one SQL path or directory.") + sql_arg = positional[0] if positional else None + return sql_arg, export_all, tuple(only_keys) def main() -> None: - sql_arg, export_all_flag = _parse_cli_args(sys.argv[1:]) + sql_arg, export_all_flag, only_keys_cli = _parse_cli_args(sys.argv[1:]) sql_path = resolve_sql_arg(sql_arg) if not sql_path.is_file(): print(f"Missing SQL file: {sql_path}") @@ -251,6 +365,8 @@ def main() -> None: users = parse_users(extract_copy_block(text, "users")) conversations = parse_conversations(extract_copy_block(text, "conversations")) segments = parse_segments(extract_copy_block(text, "segments")) + stories = parse_stories(extract_copy_block(text, "stories")) + story_versions = parse_story_versions(extract_copy_block(text, "story_versions")) chapters = parse_chapters(extract_copy_block(text, "chapters")) conv_by_user: dict[str, list[dict]] = defaultdict(list) @@ -261,13 +377,26 @@ def main() -> None: for s in segments: seg_by_conv[s["conversation_id"]].append(s) + stories_by_user: dict[str, list[dict]] = defaultdict(list) + for st in stories: + stories_by_user[st["user_id"]].append(st) + + versions_by_story: dict[str, list[dict]] = defaultdict(list) + for v in story_versions: + versions_by_story[v["story_id"]].append(v) + chap_by_user: dict[str, list[dict]] = defaultdict(list) for ch in chapters: chap_by_user[ch["user_id"]].append(ch) - active_keys: tuple[str, ...] = ( - () if (export_all_flag or not EXPORT_USER_KEYS) else EXPORT_USER_KEYS - ) + if export_all_flag and only_keys_cli: + sys.exit("Cannot combine --all with --only") + if only_keys_cli: + active_keys = only_keys_cli + else: + active_keys = ( + () if (export_all_flag or not EXPORT_USER_KEYS) else EXPORT_USER_KEYS + ) users_to_write = { uid: u for uid, u in users.items() if user_matches_export_keys(u, active_keys) } @@ -280,24 +409,31 @@ def main() -> None: f"Keys={active_keys!r}. 
Sample dump users: {sample or '(none)'}" ) - users_with_convs = { - uid: u for uid, u in users_to_write.items() if conv_by_user[uid] + def _user_has_exportable_content(uid: str) -> bool: + return bool(conv_by_user[uid] or stories_by_user[uid] or chap_by_user[uid]) + + users_with_content = { + uid: u for uid, u in users_to_write.items() if _user_has_exportable_content(uid) } - skipped_no_conv = len(users_to_write) - len(users_with_convs) - if skipped_no_conv: - print(f"Skipped {skipped_no_conv} user(s) with no conversations") + skipped_empty = len(users_to_write) - len(users_with_content) + if skipped_empty: + print( + f"Skipped {skipped_empty} user(s) with no conversations, stories, or chapters" + ) OUT_DIR.mkdir(parents=True, exist_ok=True) if active_keys: print( f"Filter: {len(active_keys)} key(s) -> {len(users_to_write)} user(s) " - f"-> {len(users_with_convs)} with conversations" + f"-> {len(users_with_content)} with exportable content" ) else: - print(f"Export all: {len(users_with_convs)} user(s) with conversations") + print(f"Export all: {len(users_with_content)} user(s) with exportable content") - for uid, u in sorted(users_with_convs.items(), key=lambda x: x[1].get("phone", "")): + for uid, u in sorted( + users_with_content.items(), key=lambda x: x[1].get("phone", "") + ): label = u["nickname"] or u["phone"] or uid[:8] fname = f"{safe_filename(label)}_{uid}.md" path = OUT_DIR / fname @@ -316,36 +452,107 @@ def main() -> None: lines_out.append("") user_convs = sorted(conv_by_user[uid], key=lambda c: c["started_at"]) - convs_with_segs = [cv for cv in user_convs if seg_by_conv[cv["id"]]] - if not convs_with_segs: - lines_out.append( - "(无对话轮次:dump 里这些会话下没有 segments,或尚未落库)" - ) + if not user_convs: + lines_out.append("(dump 中该用户无 conversations)") lines_out.append("") else: - for cv in convs_with_segs: - segs = sorted(seg_by_conv[cv["id"]], key=lambda s: s["created_at"]) - lines_out.append(f"### 会话 `{cv['id']}`") + convs_with_segs = [cv for cv in user_convs if 
seg_by_conv[cv["id"]]] + if not convs_with_segs: + lines_out.append("(有会话但无 segments 轮次,或尚未落库)") lines_out.append("") - lines_out.append(f"- 开始: {cv['started_at']}") - if cv.get("conversation_stage"): - lines_out.append(f"- 阶段: {cv['conversation_stage']}") + else: + for cv in convs_with_segs: + segs = sorted(seg_by_conv[cv["id"]], key=lambda s: s["created_at"]) + lines_out.append(f"### 会话 `{cv['id']}`") + lines_out.append("") + lines_out.append(f"- 开始: {cv['started_at']}") + if cv.get("conversation_stage"): + lines_out.append(f"- 阶段: {cv['conversation_stage']}") + lines_out.append("") + + for i, seg in enumerate(segs, 1): + lines_out.append(f"#### 轮次 {i} — {seg['created_at']}") + lines_out.append("") + if seg.get("audio_url"): + lines_out.append(f"- **音频:** `{seg['audio_url']}`") + lines_out.append("") + lines_out.append("**用户:**") + lines_out.append("") + lines_out.append(seg["transcript_text"] or "(空)") + lines_out.append("") + lines_out.append("**AI:**") + lines_out.append("") + lines_out.append(seg["agent_response"] or "(无回复)") + lines_out.append("") + lines_out.append("---") + lines_out.append("") + + lines_out.append("## 人生故事(stories)") + lines_out.append("") + + user_stories = sorted( + stories_by_user[uid], + key=lambda s: (s["updated_at"], s["created_at"], s["title"]), + ) + if not user_stories: + lines_out.append("(无 story 行;表未进 dump 或用户暂无数据)") + lines_out.append("") + else: + for st in user_stories: + lines_out.append(f"### {st['title']}") + lines_out.append("") + meta_bits = [ + f"`{st['id']}`", + f"status={st['status'] or '—'}", + ] + if st.get("stage"): + meta_bits.append(f"stage={st['stage']}") + if st.get("story_type"): + meta_bits.append(f"type={st['story_type']}") + lines_out.append("- " + " | ".join(meta_bits)) + if st.get("time_start") or st.get("time_end"): + lines_out.append( + f"- **时间:** {st['time_start'] or '—'} → {st['time_end'] or '—'}" + ) + if st.get("summary"): + lines_out.append("") + lines_out.append("**摘要:**") + lines_out.append("") 
+ lines_out.append(st["summary"]) + if st.get("people_refs") or st.get("place_refs") or st.get("tag_refs"): + lines_out.append("") + if st.get("people_refs"): + lines_out.append(f"- people_refs: `{st['people_refs'][:200]}`") + if st.get("place_refs"): + lines_out.append(f"- place_refs: `{st['place_refs'][:200]}`") + if st.get("tag_refs"): + lines_out.append(f"- tag_refs: `{st['tag_refs'][:200]}`") + lines_out.append("") + lines_out.append("**当前正文(canonical_markdown):**") + lines_out.append("") + lines_out.append(st["canonical_markdown"] or "(空)") lines_out.append("") - for i, seg in enumerate(segs, 1): - lines_out.append(f"#### 轮次 {i} — {seg['created_at']}") - lines_out.append("") - if seg.get("audio_url"): - lines_out.append(f"- **音频:** `{seg['audio_url']}`") - lines_out.append("") - lines_out.append("**用户:**") - lines_out.append("") - lines_out.append(seg["transcript_text"] or "(空)") - lines_out.append("") - lines_out.append("**AI:**") - lines_out.append("") - lines_out.append(seg["agent_response"] or "(无回复)") + vers = sorted( + versions_by_story[st["id"]], key=lambda v: v["version_no"] + ) + if vers: + lines_out.append("#### 版本历史(story_versions)") lines_out.append("") + for v in vers: + lines_out.append( + f"- **v{v['version_no']}** `{v['id']}` @ {v['created_at']}" + + ( + f" — {v['actor_type']}/{v['source_type']}" + if v["actor_type"] or v["source_type"] + else "" + ) + ) + if v.get("change_summary"): + lines_out.append(f" - 变更说明: {v['change_summary']}") + lines_out.append("") + lines_out.append(v["markdown_snapshot"] or "(空快照)") + lines_out.append("") lines_out.append("---") lines_out.append("") @@ -357,6 +564,7 @@ def main() -> None: ) if not user_chapters: lines_out.append("(无章节)") + lines_out.append("") else: for ch in user_chapters: lines_out.append(f"### [{ch['order_index']}] {ch['title']}") @@ -373,7 +581,7 @@ def main() -> None: path.write_text("\n".join(lines_out), encoding="utf-8") print(f"Wrote {path}") - print(f"Done. 
{len(users_with_convs)} users -> {OUT_DIR} (source: {sql_path})") + print(f"Done. {len(users_with_content)} users -> {OUT_DIR} (source: {sql_path})") if __name__ == "__main__": diff --git a/api/tests/evaluation/test_gating_service.py b/api/tests/evaluation/test_gating_service.py new file mode 100644 index 0000000..21a0fc7 --- /dev/null +++ b/api/tests/evaluation/test_gating_service.py @@ -0,0 +1,54 @@ +from unittest.mock import MagicMock + +from app.features.evaluation.gating_service import compute_gate + + +def _case(cid: str, protected: bool = False): + c = MagicMock() + c.id = cid + c.title = None + c.is_protected = protected + return c + + +def _run(case_id: str, side: str, composite: float, status: str = "completed"): + r = MagicMock() + r.case_id = case_id + r.side = side + r.status = status + r.composite_score = composite + return r + + +def test_gate_passes_when_mean_up_and_no_protected_regression() -> None: + cases = [_case("1"), _case("2")] + runs = [ + _run("1", "baseline", 50), + _run("1", "candidate", 60), + _run("2", "baseline", 40), + _run("2", "candidate", 55), + ] + g = compute_gate(cases=cases, runs=runs, regression_threshold=2.0) + assert g.passed + assert g.mean_delta > 0 + + +def test_gate_fails_on_protected_regression() -> None: + cases = [_case("1", protected=True)] + runs = [ + _run("1", "baseline", 80.0), + _run("1", "candidate", 75.0), + ] + g = compute_gate(cases=cases, runs=runs, regression_threshold=2.0) + assert not g.passed + assert len(g.protected_regressions) == 1 + + +def test_gate_fails_when_mean_not_higher() -> None: + cases = [_case("1")] + runs = [ + _run("1", "baseline", 70.0), + _run("1", "candidate", 69.0), + ] + g = compute_gate(cases=cases, runs=runs, regression_threshold=2.0) + assert not g.passed diff --git a/api/tests/evaluation/test_importers.py b/api/tests/evaluation/test_importers.py new file mode 100644 index 0000000..2c0bb13 --- /dev/null +++ b/api/tests/evaluation/test_importers.py @@ -0,0 +1,29 @@ +from 
app.features.evaluation.importers.script_json import parse_script_json +from app.features.evaluation.importers.user_export_markdown import ( + extract_user_utterances_from_export_md, +) + + +def test_parse_script_json_list() -> None: + u, meta = parse_script_json('["a", "b"]') + assert u == ["a", "b"] + assert meta == {} + + +def test_parse_script_json_object() -> None: + u, meta = parse_script_json('{"utterances":["x"],"foo":1}') + assert u == ["x"] + assert meta == {"foo": 1} + + +def test_extract_user_lines_from_export_md() -> None: + md = """ +**用户:** + +hello + +**AI:** + +hi +""" + assert extract_user_utterances_from_export_md(md) == ["hello"] diff --git a/api/tests/evaluation/test_internal_router_auth.py b/api/tests/evaluation/test_internal_router_auth.py new file mode 100644 index 0000000..1066afb --- /dev/null +++ b/api/tests/evaluation/test_internal_router_auth.py @@ -0,0 +1,65 @@ +"""内部路由在未配密钥时应 503。""" + +import pytest +from httpx import ASGITransport, AsyncClient + +from app.features.evaluation.internal_auth import get_internal_eval_principal +from app.features.evaluation.router import router + + +@pytest.mark.asyncio +async def test_internal_eval_list_sets_requires_config(monkeypatch: pytest.MonkeyPatch): + from fastapi import FastAPI + + monkeypatch.setattr( + "app.core.config.settings.internal_eval_api_key", + "", + raising=False, + ) + app = FastAPI() + app.include_router(router, prefix="/internal/api/evaluation") + transport = ASGITransport(app=app) + async with AsyncClient(transport=transport, base_url="http://t") as client: + r = await client.get("/internal/api/evaluation/regression-sets") + assert r.status_code == 503 + + +@pytest.mark.asyncio +async def test_internal_eval_with_override_lists_empty(monkeypatch: pytest.MonkeyPatch): + from fastapi import FastAPI + + monkeypatch.setattr( + "app.core.config.settings.internal_eval_api_key", + "secret", + raising=False, + ) + app = FastAPI() + app.include_router(router, 
prefix="/internal/api/evaluation") + + async def _override_auth(): + from app.features.evaluation.internal_auth import InternalEvalPrincipal + + return InternalEvalPrincipal() + + app.dependency_overrides[get_internal_eval_principal] = _override_auth + from app.core.db import get_async_db + from unittest.mock import AsyncMock, MagicMock + + mock_session = AsyncMock() + mock_result = MagicMock() + mock_result.scalars.return_value.unique.return_value.all.return_value = [] + mock_session.execute = AsyncMock(return_value=mock_result) + + async def _db(): + yield mock_session + + app.dependency_overrides[get_async_db] = _db + + transport = ASGITransport(app=app) + async with AsyncClient(transport=transport, base_url="http://t") as client: + r = await client.get( + "/internal/api/evaluation/regression-sets", + headers={"X-Internal-Eval-Key": "secret"}, + ) + assert r.status_code == 200 + assert r.json() == [] diff --git a/api/tests/user_exports/13701020203_e27fcd97-fefa-43b8-a7a3-3ecd49ebf5f0.md b/api/tests/user_exports/13701020203_e27fcd97-fefa-43b8-a7a3-3ecd49ebf5f0.md index 56df3d6..5b53146 100644 --- a/api/tests/user_exports/13701020203_e27fcd97-fefa-43b8-a7a3-3ecd49ebf5f0.md +++ b/api/tests/user_exports/13701020203_e27fcd97-fefa-43b8-a7a3-3ecd49ebf5f0.md @@ -322,6 +322,10 @@ --- +## 人生故事(stories) + +(无 story 行;表未进 dump 或用户暂无数据) + ## 回忆录章节(生成正文) ### [0] 在平凡小镇的晨光中醒来 diff --git a/api/tests/user_exports/15366015180_1e0872ce-76e3-4d87-bd96-6cb5e5db7a86.md b/api/tests/user_exports/15366015180_1e0872ce-76e3-4d87-bd96-6cb5e5db7a86.md index ca5464c..fb9430b 100644 --- a/api/tests/user_exports/15366015180_1e0872ce-76e3-4d87-bd96-6cb5e5db7a86.md +++ b/api/tests/user_exports/15366015180_1e0872ce-76e3-4d87-bd96-6cb5e5db7a86.md @@ -7,8 +7,626 @@ ## 对话记录(用户 + AI) -(无对话轮次:dump 里这些会话下没有 segments,或尚未落库) +### 会话 `385bdc9c-81dd-41a5-8b53-93f1d07fda35` + +- 开始: 2026-04-02 03:37:29.056589+00 + +#### 轮次 1 — 2026-04-02 03:37:46.173058+00 + + +**用户:** + +1970年出生的 + +**AI:** + 
+哇,1970年啊,那会儿我还没来到这个世界呢!感觉你的人生经历一定很丰富。[SPLIT]你是在哪个城市出生的呀? + +#### 轮次 2 — 2026-04-02 03:38:00.02074+00 + + +**用户:** + +在上海出生 + +**AI:** + +上海啊,那可是个充满魅力的城市!我特别喜欢外滩的夜景。[SPLIT]那你是在上海长大的吗? + +#### 轮次 3 — 2026-04-02 03:38:11.697934+00 + + +**用户:** + +是的 一直在上海长大 + +**AI:** + +真好,能在一个城市从出生到长大,一定有很多特别的回忆吧。[SPLIT]你现在是做什么工作的呢? + +#### 轮次 4 — 2026-04-02 03:38:49.410149+00 + + +**用户:** + +现在已经退休了 之前是部队里的师长 驻守南京军区 + +**AI:** + +了解了!那我们现在开始聊聊你的人生故事吧。[SPLIT]作为一位师长,您还记得第一次穿上军装时的感受吗? + +#### 轮次 5 — 2026-04-02 06:14:28.464043+00 + + +**用户:** + +觉得无比的光荣 + +**AI:** + +那份光荣感,一定支撑着您走过了很多年吧。[SPLIT]在您看来,军旅生涯给您的人生带来了哪些最核心的信念? + +#### 轮次 6 — 2026-04-02 08:14:35.685025+00 + + +**用户:** + +团结 守时 记录 + +**AI:** + +团结、守时、记录,这三个词确实很有分量,也很有军人的风格。[SPLIT]在您退休后的生活里,这些信念还以怎样的方式影响着您呢? + +#### 轮次 7 — 2026-04-02 08:18:05.833116+00 + + +**用户:** + +我每天按时6点起床7点睡觉 + +**AI:** + +这份自律真让人敬佩,退休后还能保持军人的作息。[SPLIT]除了作息,您觉得“记录”这个习惯,对您现在的生活有什么特别的意义吗? + +#### 轮次 8 — 2026-04-02 08:26:55.545773+00 + + +**用户:** + +有啊 我要回忆我的一生 给孩子们看 + +**AI:** + +这个想法真好,把一生的故事记录下来,是留给孩子们最珍贵的礼物。[SPLIT]在整理回忆的时候,有没有哪段时光让您觉得特别值得记录,或者特别想先写下来的? + +#### 轮次 9 — 2026-04-02 08:28:18.219196+00 + + +**用户:** + +有的 那是当团长的时候 30岁 我们还在海南守边疆 王伟是我的战友 + +**AI:** + +30岁就当上团长,还在海南守边疆,那段经历一定很深刻。[SPLIT]您和王伟战友之间,有没有什么让您印象特别深的故事? + +#### 轮次 10 — 2026-04-02 08:29:57.422663+00 + + +**用户:** + +他驱赶美国战机, 殉国了。我们大伙前一天晚上还在聚餐 + +**AI:** + +听到这里,心里很沉重。前一天还在把酒言欢,第二天就天人永隔,这种失去战友的痛,一定刻骨铭心。[SPLIT]那次聚餐,你们聊了些什么,或者有什么特别的场景,让您至今还记得? + +#### 轮次 11 — 2026-04-02 08:30:25.959308+00 + + +**用户:** + +我们聊了我要去南京了 成家 结婚生子 + +**AI:** + +原来那次聚餐,聊的是人生新阶段的开始啊。一边是您即将开启的家庭生活,一边是他第二天就要执行的使命,这种对比太让人感慨了。[SPLIT]后来您去南京成家,组建家庭的过程顺利吗? + +#### 轮次 12 — 2026-04-02 08:31:21.876818+00 + + +**用户:** + +顺利 我晋升旅长 部队把我调到南京军区 三个月后就结婚了 2年后有了女儿 + +**AI:** + +恭喜您!那段时间真是双喜临门,事业晋升,家庭也圆满了。[SPLIT]从海南到南京,从守边疆到组建家庭,这个转变过程中,您自己心态上最大的变化是什么? + +#### 轮次 13 — 2026-04-02 08:31:44.382747+00 + + +**用户:** + +觉得 小家好才能大家好 国家好 + +**AI:** + +“小家好才能大家好,国家好”,这句话说得真好,朴实又深刻。[SPLIT]在您看来,为了这个“好”,一个家庭里最重要的东西是什么? 
+ +#### 轮次 14 — 2026-04-02 08:31:55.189561+00 + + +**用户:** + +和睦 + +**AI:** + +和睦确实是家庭的基石。家和万事兴,老话总是有道理的。[SPLIT]在您家里,有没有什么特别的小传统或者小习惯,是您觉得最能体现“和睦”这两个字的? + +#### 轮次 15 — 2026-04-02 08:32:40.178961+00 + + +**用户:** + +我和妻子孩子从不为小事生气,原则性问题当天说清楚 绝不带着脾气过夜 + +**AI:** + +这个习惯真好,当天事当天了,不让矛盾过夜,家里自然就和睦了。[SPLIT]您觉得,在部队里养成的哪些习惯,对您处理家庭关系特别有帮助? + +#### 轮次 16 — 2026-04-02 08:33:03.265941+00 + + +**用户:** + +沟通 及时沟通 充分沟通 + +**AI:** + +沟通确实是解决一切问题的桥梁,无论是在部队还是在家里。[SPLIT]退休后,和家人的沟通方式,和以前在部队时相比,有什么不一样的感觉吗? + +#### 轮次 17 — 2026-04-02 08:33:47.564114+00 + + +**用户:** + +有 和家人不能生气 要有耐心 不像军队里和部下沟通 孩子小的时候尤其重要 + +**AI:** + +说得对,对家人确实需要更多的耐心和包容,这和带兵是两种不同的责任。[SPLIT]在女儿小的时候,您是怎么把“耐心”这一点落实到具体事情上的?有没有印象特别深的一件事? + +#### 轮次 18 — 2026-04-02 09:23:35.868764+00 + + +**用户:** + +她数学题做不出来 我们都非常耐心的教她 + +**AI:** + +教孩子功课确实很考验耐心,尤其是数学题。您和妻子一起教,这份陪伴本身就是最好的教育了。[SPLIT]现在回想起来,女儿成长过程中,您觉得作为父亲,自己做得最好的一点是什么? + +#### 轮次 19 — 2026-04-02 09:53:23.603863+00 + + +**用户:** + +她需要我的时候 我一直都在 每次都接住了她的情绪 + +**AI:** + +“每次都接住了她的情绪”,这句话真温暖。对孩子来说,父亲稳稳的在场和接纳,比什么都重要。[SPLIT]现在女儿也长大了吧?您觉得她身上有哪些品质,是您最欣赏的? + +#### 轮次 20 — 2026-04-02 09:53:43.037142+00 + + +**用户:** + +勇敢无畏独立自强 + +**AI:** + +勇敢、独立、自强,这些品质真棒,想必也有您言传身教的影响在里头。[SPLIT]看到女儿成长为这样的人,作为父亲,您心里最深的感触是什么? + +#### 轮次 21 — 2026-04-02 09:54:02.076116+00 + + +**用户:** + +我最完美的作品 + +**AI:** + +“最完美的作品”,这是父亲能给出的最高评价了,充满了骄傲和爱。[SPLIT]聊了这么多,从军旅生涯到家庭生活,您觉得这一生走过来,最让您感到踏实和满足的是什么? + +#### 轮次 22 — 2026-04-02 09:54:19.543391+00 + + +**用户:** + +我对得起每一个我认识的人 + +**AI:** + +“对得起每一个认识的人”,这句话的分量很重,能做到这一点,人生就没什么遗憾了。[SPLIT]如果让您用一个词来总结自己这一生,您会选哪个词? + +#### 轮次 23 — 2026-04-02 09:55:08.580883+00 + + +**用户:** + +安稳 + +**AI:** + +安稳,一个很踏实、很有分量的词。能走过风雨,最终归于安稳,是福气。[SPLIT]聊了这么多,您觉得在整理这些回忆的时候,自己最大的收获是什么? + +#### 轮次 24 — 2026-04-02 09:55:29.270914+00 + + +**用户:** + +我回忆去了许多快忘的细节 + +**AI:** + +是啊,回忆就像打开一个老箱子,拂去灰尘,里面都是闪着光的宝贝。[SPLIT]这些被重新拾起的细节里,有没有哪一件小事,是让您想起来就忍不住微笑的? 
+ +#### 轮次 25 — 2026-04-03 02:23:21.396195+00 + + +**用户:** + +是的 小时候经常去梧桐树下散步 + +**AI:** + +上海的梧桐树,夏天遮阳,秋天落叶,走在下面特别有味道。那是属于一个时代的安稳和宁静。[SPLIT]除了散步,小时候在梧桐树下,还发生过什么让您印象深刻的事吗? + +--- + +### 会话 `5f2f061f-2af7-428f-8664-7a44e566de82` + +- 开始: 2026-04-03 05:36:47.25577+00 + +#### 轮次 1 — 2026-04-03 05:37:12.543033+00 + + +**用户:** + +王大伟 我们小时候经常去初中门口的梧桐树下假装在法国战斗 + +**AI:** + +那时候的梧桐树荫,就是我们的战场。你之前提过小时候常去散步,没想到还演过这样的戏。 + +#### 轮次 2 — 2026-04-03 05:37:49.886204+00 + + +**用户:** + +是呀 小时候喜欢的事 长大变成职业了 + +**AI:** + +从梧桐树下的游戏,到真正的战场,这条路走得不容易。 + +#### 轮次 3 — 2026-04-03 05:38:26.617923+00 + + +**用户:** + +我们小时候中午午休时经常全班一起捉迷藏 + +**AI:** + +午休时的集体游戏,现在想来都是珍贵的默契训练。 + +#### 轮次 4 — 2026-04-03 05:43:15.848813+00 + + +**用户:** + +是的 一玩就是1个小时 下午上课都犯困 + +**AI:** + +那时候的快乐简单又纯粹。后来在部队里,午休时间是不是就珍贵多了? + +#### 轮次 5 — 2026-04-03 05:43:35.125618+00 + + +**用户:** + +是啊 部队里训练很幸苦 美好时间午休 + +**AI:** + +那段时光确实不容易。从上海到海南,环境和节奏变化很大吧? + +#### 轮次 6 — 2026-04-03 05:48:31.570829+00 + + +**用户:** + +变化很大 最难接受的就是海南的台风暴雨 + +**AI:** + +台风暴雨确实考验人。在那种环境下,你和战友们是怎么互相照应的? 
+ +#### 轮次 7 — 2026-04-03 05:50:37.06702+00 + + +**用户:** + +改造排水 搭雨棚 补天花板 + +**AI:** + +这些事听着就辛苦,但也是难得的战友情。 + +#### 轮次 8 — 2026-04-03 06:06:32.744349+00 + + +**用户:** + +是啊 我们过年还经常一起包饺子 + +**AI:** + +一起包饺子,是部队里难得的温馨时刻。 + +--- + +## 人生故事(stories) + +### 退休后的师长回忆 + +- `ca460de1-6279-4fe6-9221-5a77f5267b5c` | status=active | stage=career_achievement + +**当前正文(canonical_markdown):** + +如今我已退休,回想起来,此前在南京军区担任部队师长的岁月,是我人生中一段重要的经历。 + +#### 版本历史(story_versions) + +- **v1** `b5cccbf6-e901-4934-a990-e9ddd4cec818` @ 2026-04-02 03:40:27.32774+00 — ai/generate + +如今我已退休,回想起来,此前在南京军区担任部队师长的岁月,是我人生中一段重要的经历。 + +--- + +### 人生总结 + +- `5ba19c07-3c74-4081-848f-64c02967d23f` | status=active | stage=summary + +**当前正文(canonical_markdown):** + +回想起来,心中依然觉得无比光荣。 + +#### 版本历史(story_versions) + +- **v1** `8ce3422a-1aea-486d-8387-9c4da9361acd` @ 2026-04-02 06:15:49.677107+00 — ai/generate + +回想起来,心中依然觉得无比光荣。 + +--- + +### 年过半百 · 军旅生涯中的团结守时与家庭和睦 + +- `b94a3e77-1976-4783-bee3-b6bd23a5bd77` | status=active | stage=summary + +**当前正文(canonical_markdown):** + +团结、守时、记录,这三点是我多年来始终秉持的原则。回想起来,每日的生活也遵循着严格的规律,总是清晨六点准时起身,夜晚七点便安然就寝。这样的作息,让日子过得安稳而和睦。 + +#### 版本历史(story_versions) + +- **v1** `5f8cbc37-23ab-41bc-b34b-12457fb49f97` @ 2026-04-02 08:33:37.066232+00 — ai/generate + +团结、守时、记录,这三点是我多年来始终秉持的原则。回想起来,每日的生活也遵循着严格的规律,总是清晨六点准时起身,夜晚七点便安然就寝。这样的作息,让日子过得安稳而和睦。 + +--- + +### 家庭与情感 + +- `738f54ba-ea27-4dbd-a910-9e90c68ed594` | status=active | stage=family + +**当前正文(canonical_markdown):** + +他驱赶美国战机,为国捐躯了。我们大伙前一天晚上还在聚餐。 + +#### 版本历史(story_versions) + +- **v1** `b848de23-eaaf-4abd-baca-a17de8c10389` @ 2026-04-02 08:33:56.326706+00 — ai/generate + +他驱赶美国战机,为国捐躯了。我们大伙前一天晚上还在聚餐。 + +--- + +### 家庭与情感 + +- `e77a50f8-810a-4872-b97a-15a98be68ea4` | status=active | stage=family + +**当前正文(canonical_markdown):** + +回想起来,那晚聚餐时,我们聊到了我将要前往南京,以及成家、结婚生子的话题。 + +#### 版本历史(story_versions) + +- **v1** `2ae128ac-36a2-4ef0-a32e-4f1a7848dcfb` @ 2026-04-02 08:34:03.539838+00 — ai/generate + +回想起来,那晚聚餐时,我们聊到了我将要前往南京,以及成家、结婚生子的话题。 + +--- 
+ +### 晋升旅长后 · 家庭中的坦诚与原则 + +- `ff2e4524-e292-489a-a8ed-1354a0255590` | status=active | stage=family + +**当前正文(canonical_markdown):** + +回想起来,我与妻子和孩子之间,从不为琐碎小事置气。遇到原则性问题,我们总是当天就坦诚沟通,绝不让隔夜的脾气影响彼此的感情。 + +#### 版本历史(story_versions) + +- **v1** `1cb0b092-b244-4ba2-a755-761e8c823b45` @ 2026-04-02 08:34:14.271499+00 — ai/generate + +回想起来,我与妻子和孩子之间,从不为琐碎小事置气。遇到原则性问题,我们总是当天就坦诚沟通,绝不让隔夜的脾气影响彼此的感情。 + +--- + +### 中年时期 · 与家人相处的耐心之道 + +- `9ebed78c-dd07-405d-b55f-7dcd8e1af5d0` | status=active | stage=family + +**当前正文(canonical_markdown):** + +回想起来,与家人相处时,我始终提醒自己不能生气,要有耐心,这与在军队里和部下沟通的方式截然不同。孩子小的时候,这一点尤其重要。记得她有时做不出数学题,我们都非常耐心地教导她。在她需要我的时候,我一直都在,每次都稳稳地接住了她的情绪。 + +#### 版本历史(story_versions) + +- **v1** `62e8ae27-5567-4c75-a7aa-b6a2957c9d23` @ 2026-04-02 09:55:17.806049+00 — ai/generate + +回想起来,与家人相处时,我始终提醒自己不能生气,要有耐心,这与在军队里和部下沟通的方式截然不同。孩子小的时候,这一点尤其重要。记得她有时做不出数学题,我们都非常耐心地教导她。在她需要我的时候,我一直都在,每次都稳稳地接住了她的情绪。 + +--- + +### 信念与价值观 + +- `43901ce1-2649-4606-b23b-9571e8d3353a` | status=active | stage=beliefs + +**当前正文(canonical_markdown):** + +回想起来,我始终坚信,只有每个小家庭和睦美满,大家才能团结一心,国家也才能繁荣昌盛。 + +#### 版本历史(story_versions) + +- **v1** `758ac6ba-898d-42b6-87cc-fc5f3deb0fa7` @ 2026-04-02 09:55:36.191105+00 — ai/generate + +回想起来,我始终坚信,只有每个小家庭和睦美满,大家才能团结一心,国家也才能繁荣昌盛。 + +--- + +### 信念与价值观 + +- `aade1e47-8123-4008-91ea-4872059c538c` | status=active | stage=beliefs + +**当前正文(canonical_markdown):** + +回想起来,勇敢无畏与独立自强,始终是我秉持的信念。 + +#### 版本历史(story_versions) + +- **v1** `667535a7-1f6b-4b34-ad5f-2709cc1190d3` @ 2026-04-02 09:55:42.584562+00 — ai/generate + +回想起来,勇敢无畏与独立自强,始终是我秉持的信念。 + +--- + +### 信念与价值观 + +- `2dbdb42d-0870-4158-a648-4fdc445e40fd` | status=active | stage=beliefs + +**当前正文(canonical_markdown):** + +回想起来,我始终秉持着一个信念:要对得起每一个认识的人。 + +#### 版本历史(story_versions) + +- **v1** `38a35b4f-9740-49c9-b367-da64ba36b077` @ 2026-04-02 09:55:50.190706+00 — ai/generate + +回想起来,我始终秉持着一个信念:要对得起每一个认识的人。 + +--- + +### 上海出生与成长 + +- `7df145a7-888c-44aa-8e7e-a1da654044f2` | status=active | stage=childhood +- **时间:** 1970 → — + 
+**当前正文(canonical_markdown):** + +我于1970年出生于上海,并一直在上海长大。 + +回想童年时光,我经常去初中门口的梧桐树下散步。那段日子里,我和王大伟常在梧桐树下,玩着假装在法国战斗的游戏。 + +如今想来,小时候喜欢的这些事,后来竟与我的职业产生了某种关联。 + +#### 版本历史(story_versions) + +- **v1** `671840f4-49f4-42a2-91bc-c39d140e65cd` @ 2026-04-02 03:39:31.652715+00 — ai/generate + +是的,一直在上海长大,在上海出生。 + +- **v2** `3df1ab74-1d17-4fed-ac48-78bc8ab3212c` @ 2026-04-03 05:38:53.311154+00 — ai/generate + +我于1970年出生于上海,并一直在上海长大。 + +回想童年时光,我经常去初中门口的梧桐树下散步。那段日子里,我和王大伟常在梧桐树下,玩着假装在法国战斗的游戏。 + +如今想来,小时候喜欢的这些事,后来竟与我的职业产生了某种关联。 + +--- + +### 童年与成长背景 + +- `b8da3d0d-641d-4b52-9ea0-a35b20eb6cde` | status=active | stage=childhood + +**当前正文(canonical_markdown):** + +是呀,小时候喜欢的事,长大变成职业了。 + +#### 版本历史(story_versions) + +- **v1** `03093b70-0044-43a6-951d-43dffdaa3017` @ 2026-04-03 05:38:57.087838+00 — ai/generate + +是呀,小时候喜欢的事,长大变成职业了。 + +--- ## 回忆录章节(生成正文) -(无章节) \ No newline at end of file +### [0] \N + +- **ID:** `f3c68909-d793-4fd1-8539-a5681d7eb219` | **状态:** 2026-04-02 08:34:14.404814+00 | **分类:** ["48e5b093-30b7-40d4-9aa9-73d068bed10d", "4555a167-f964-4461-b107-6b890ef1afdc", "7cc46d54-cd24-497c-a260-64fc55a36274", "4aad164d-ec32-46ba-b01d-347fe4bc5bcc"] | **updated:** t + +人生总结 summary 7 \N completed \N 2a7594b1-1f6c-4ada-89ed-38ee695503c1 + +--- + +### [0] \N + +- **ID:** `dc866784-c562-4d1e-b01e-85e2ede5bc60` | **状态:** 2026-04-02 03:40:35.387935+00 | **分类:** ["13a5217f-f0b1-4eee-a7f3-c9cb2929854c"] | **updated:** t + +军旅生涯 · 师长之路 career_achievement 3 \N completed \N a33e6d94-cf23-4eab-bede-3f71d4c0c568 + +--- + +### [0] \N + +- **ID:** `0bb0451b-b5c0-4d98-a532-c98a403f2dfc` | **状态:** 2026-04-02 09:55:50.326728+00 | **分类:** ["f1ba25f1-bb28-477b-8c5f-ae2de9d532e2", "9d5690d4-185b-49e7-9345-434b091f0fc3", "b4f977b5-4f2e-4fab-87c5-f1a84bb81bf6", "3b00142d-3a70-434b-85cd-a4ce02734c48", "d23e22eb-7255-4461-a9b8-8f7af229a2f2", "a4abd031-8062-4385-9d05-a93ef0394bed"] | **updated:** t + +家庭与情感 family 5 \N completed \N e37cba9c-3559-4dab-b460-b28483b2ca97 + +--- + +### [0] \N + +- **ID:** 
`aceb186f-06f7-465d-8d01-2231a0c6fe5b` | **状态:** 2026-04-02 09:55:58.269606+00 | **分类:** ["dd9e81c7-ff4c-4014-9471-9b67a0cb03c5", "0cb3b708-9554-48c1-a3a8-91c0a69db30f", "5eebf1c3-6807-4c22-b4f1-0330649b39e1"] | **updated:** t + +信念与价值观 beliefs 6 \N completed \N 49662197-9ba5-46ba-b104-be6338dc92d2 + +--- + +### [0] \N + +- **ID:** `de8f956a-9681-4076-bd9e-c543bce4e500` | **状态:** 2026-04-03 05:39:05.188888+00 | **分类:** ["29ab6b25-cdc9-4ec1-9771-31a3d6b9964c", "f35b94a2-1144-48a4-bfac-ec6727594952", "dde5cdfd-eb77-46a8-8bee-a18c101745e8", "f1b6328b-bd18-4462-836d-5eef939c1c5a", "2ef2a51e-f866-4d28-bfe9-968ed6a4d4b2"] | **updated:** t + +童年时光 · 上海弄堂里的寻常日子 childhood 0 \N completed \N ef2e94f6-252a-41d1-a204-47a720d0a474 + +--- diff --git a/api/tests/user_exports/nije_5e51151a-cb46-4c5c-ad5d-dae9c58ca243.md b/api/tests/user_exports/nije_5e51151a-cb46-4c5c-ad5d-dae9c58ca243.md index 176c77a..a3e40af 100644 --- a/api/tests/user_exports/nije_5e51151a-cb46-4c5c-ad5d-dae9c58ca243.md +++ b/api/tests/user_exports/nije_5e51151a-cb46-4c5c-ad5d-dae9c58ca243.md @@ -772,6 +772,10 @@ AI领域现在发展得真快,能在这个领域工作一定很有挑战也很 --- +## 人生故事(stories) + +(无 story 行;表未进 dump 或用户暂无数据) + ## 回忆录章节(生成正文) ### [0] 童年时光里那些安静的清晨 diff --git a/api/tests/user_exports/zuckxu_1ade609c-567a-450b-b8fb-776aaba3c2b3.md b/api/tests/user_exports/zuckxu_1ade609c-567a-450b-b8fb-776aaba3c2b3.md index 9ae13dc..dbe7102 100644 --- a/api/tests/user_exports/zuckxu_1ade609c-567a-450b-b8fb-776aaba3c2b3.md +++ b/api/tests/user_exports/zuckxu_1ade609c-567a-450b-b8fb-776aaba3c2b3.md @@ -1366,6 +1366,10 @@ weifang --- +## 人生故事(stories) + +(无 story 行;表未进 dump 或用户暂无数据) + ## 回忆录章节(生成正文) ### [0] 童年时光的静默拼图 diff --git a/app-eval-web/README.md b/app-eval-web/README.md new file mode 100644 index 0000000..8642dcc --- /dev/null +++ b/app-eval-web/README.md @@ -0,0 +1,25 @@ +# 内部评测 Web(Life Echo) + +独立 Vite + React 控制台,对接 `app.internal_main:internal_app`。 + +## 环境变量 + +- `VITE_EVAL_API_BASE` — 内部 API 根,如 `http://127.0.0.1:8001` +- 
`VITE_EVAL_API_KEY` — 与后端 `INTERNAL_EVAL_API_KEY` 相同 + +## 开发 + +```bash +npm install +VITE_EVAL_API_BASE=http://127.0.0.1:8001 VITE_EVAL_API_KEY=your-secret npm run dev +``` + +浏览器打开提示的端口(默认 5174)。 + +## 构建 + +```bash +npm run build +``` + +产物在 `dist/`,可挂任意静态服务器。SSE 使用 `?key=` 传评测密钥(见后端文档)。 diff --git a/app-eval-web/index.html b/app-eval-web/index.html new file mode 100644 index 0000000..d8a8666 --- /dev/null +++ b/app-eval-web/index.html @@ -0,0 +1,12 @@ + + +
+ + +
+ API: {apiBase} · 配置{" "}
+ VITE_EVAL_API_BASE / VITE_EVAL_API_KEY
+
{msg}
: null} + + + + {tab === "sessions" ? ( +| 会话 ID | +用户 | +阶段 | ++ |
|---|---|---|---|
+ {s.id.slice(0, 8)}…
+ |
+ {s.user_id.slice(0, 8)}… | +{s.conversation_stage ?? "—"} | ++ + | +
+ 回归集 ID: + +
+{s.id} — {s.name}{" "}
+
+
+ {JSON.stringify(expDetail, null, 2)}
+
+ {streamLog.join("\n")}
+ ) : null}
+