数据库与模型:新增多版迁移(章节证据快照、对话血缘、记忆事实/时间线 lineage 等),把「成稿 ↔ 对话/记忆」的溯源信息落到表结构里。 业务链路:会话与 WS、回忆录/故事流水线、记忆写入与 enrichment 等跟着接上线索与快照;新增章节证据快照与评测侧 EvalTraceService 等模块,方便组评审用的证据包。 内部评测:自动化 run 与手工 memoir 评审共用可追溯证据;rubric/judge 相关脚本与文档有配套调整。 app-eval-web:Memoir/实验详情里能展开看证据摘要与 evidence_trace(含对话轮次 id);Vite 代理与 development.sh 注入的 API 端口与当前默认内部评测端口一致,避免改端口后页面连错服务。 工程杂项:GitHub Actions / 仓库说明有更新;各适配器与支付/配额/plan 等多处为小改动或跟随主改动的收尾;新增/扩充了……
359 lines
9.9 KiB
Python
359 lines
9.9 KiB
Python
"""评测域数据访问。"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import uuid
|
|
from typing import Any
|
|
|
|
from sqlalchemy import select
|
|
from sqlalchemy.dialects.postgresql import insert as pg_insert
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from app.features.evaluation.models import (
|
|
EvalCase,
|
|
EvalExperiment,
|
|
EvalGateVerdict,
|
|
EvalRegressionSet,
|
|
EvalRun,
|
|
EvalRunTurn,
|
|
EvalVersion,
|
|
)
|
|
|
|
|
|
def _id() -> str:
    """Return a new random identifier: 32 lowercase hex characters (UUID4)."""
    # UUID.hex is already the dash-free 32-character form, so no string
    # replacement or slicing is required.
    return uuid.uuid4().hex
|
|
|
|
|
|
async def list_regression_sets(db: AsyncSession) -> list[EvalRegressionSet]:
    """Return every regression set, most recently created first."""
    newest_first = EvalRegressionSet.created_at.desc()
    query = select(EvalRegressionSet).order_by(newest_first)
    result = await db.execute(query)
    # unique() de-duplicates the scalar results before they are collected.
    return list(result.scalars().unique().all())
|
|
|
|
|
|
async def get_regression_set(db: AsyncSession, sid: str) -> EvalRegressionSet | None:
    """Look up one regression set by its identifier; ``None`` if absent."""
    regression_set = await db.get(EvalRegressionSet, sid)
    return regression_set
|
|
|
|
|
|
async def create_regression_set(
    db: AsyncSession, *, name: str, description: str | None = None
) -> EvalRegressionSet:
    """Create a regression set under a freshly generated id.

    The new row is added to the session and flushed; no commit is issued
    here.
    """
    regression_set = EvalRegressionSet(
        id=_id(),
        name=name,
        description=description,
    )
    db.add(regression_set)
    await db.flush()
    return regression_set
|
|
|
|
|
|
async def list_cases(db: AsyncSession, regression_set_id: str) -> list[EvalCase]:
    """Return the cases of one regression set, oldest first."""
    belongs_to_set = EvalCase.regression_set_id == regression_set_id
    query = (
        select(EvalCase)
        .where(belongs_to_set)
        .order_by(EvalCase.created_at.asc())
    )
    result = await db.execute(query)
    return list(result.scalars().all())
|
|
|
|
|
|
async def create_case(
    db: AsyncSession,
    *,
    regression_set_id: str,
    user_utterances: list[str],
    title: str | None = None,
    source_conversation_id: str | None = None,
    source_user_id: str | None = None,
    reference_memoir_markdown: str | None = None,
    is_protected: bool = False,
    meta: dict[str, Any] | None = None,
) -> EvalCase:
    """Create an evaluation case inside a regression set.

    The case gets a freshly generated id, is added to the session and
    flushed; no commit is issued here.
    """
    column_values = {
        "id": _id(),
        "regression_set_id": regression_set_id,
        "source_conversation_id": source_conversation_id,
        "source_user_id": source_user_id,
        "title": title,
        # Store a copy so the row does not share the caller's list object.
        "user_utterances": list(user_utterances),
        "reference_memoir_markdown": reference_memoir_markdown,
        "is_protected": is_protected,
        "meta": meta,
    }
    case = EvalCase(**column_values)
    db.add(case)
    await db.flush()
    return case
|
|
|
|
|
|
async def get_case(db: AsyncSession, case_id: str) -> EvalCase | None:
    """Look up one evaluation case by its identifier; ``None`` if absent."""
    case = await db.get(EvalCase, case_id)
    return case
|
|
|
|
|
|
async def list_versions(db: AsyncSession) -> list[EvalVersion]:
    """Return every evaluation version, most recently created first."""
    query = select(EvalVersion).order_by(EvalVersion.created_at.desc())
    result = await db.execute(query)
    versions = result.scalars().all()
    return list(versions)
|
|
|
|
|
|
async def create_version(
    db: AsyncSession,
    *,
    name: str,
    runner_kind: str = "llm_chat_v1",
    config_json: dict[str, Any] | None = None,
) -> EvalVersion:
    """Create an evaluation version under a freshly generated id.

    The new row is added to the session and flushed; no commit is issued
    here.
    """
    column_values = {
        "id": _id(),
        "name": name,
        "runner_kind": runner_kind,
        "config_json": config_json,
    }
    version = EvalVersion(**column_values)
    db.add(version)
    await db.flush()
    return version
|
|
|
|
|
|
async def get_version(db: AsyncSession, vid: str) -> EvalVersion | None:
    """Look up one evaluation version by its identifier; ``None`` if absent."""
    version = await db.get(EvalVersion, vid)
    return version
|
|
|
|
|
|
async def create_experiment(
    db: AsyncSession,
    *,
    name: str,
    regression_set_id: str,
    baseline_version_id: str,
    candidate_version_id: str,
    rubric_pack: str = "conversation_v1+memoir_v1",
    composite_weights_json: dict[str, Any] | None = None,
) -> EvalExperiment:
    """Create an experiment comparing a baseline and a candidate version.

    The experiment starts with status ``"pending"`` under a freshly
    generated id. It is added to the session and flushed; no commit is
    issued here.
    """
    column_values = {
        "id": _id(),
        "name": name,
        "regression_set_id": regression_set_id,
        "baseline_version_id": baseline_version_id,
        "candidate_version_id": candidate_version_id,
        "rubric_pack": rubric_pack,
        "composite_weights_json": composite_weights_json,
        "status": "pending",
    }
    experiment = EvalExperiment(**column_values)
    db.add(experiment)
    await db.flush()
    return experiment
|
|
|
|
|
|
async def get_experiment(db: AsyncSession, eid: str) -> EvalExperiment | None:
    """Look up one experiment by its identifier; ``None`` if absent."""
    experiment = await db.get(EvalExperiment, eid)
    return experiment
|
|
|
|
|
|
async def list_experiments(db: AsyncSession, limit: int = 50) -> list[EvalExperiment]:
    """Return at most ``limit`` experiments, most recently created first."""
    query = select(EvalExperiment)
    query = query.order_by(EvalExperiment.created_at.desc())
    query = query.limit(limit)
    result = await db.execute(query)
    return list(result.scalars().all())
|
|
|
|
|
|
async def update_experiment(
    db: AsyncSession,
    exp: EvalExperiment,
    *,
    status: str | None = None,
    error_message: str | None = None,
    completed_at: Any | None = ...,
) -> None:
    """Apply the supplied field changes to ``exp`` in place.

    ``status`` and ``error_message`` are written only when they are not
    ``None``. ``completed_at`` uses ``...`` (Ellipsis) as its "not supplied"
    marker, so passing ``None`` explicitly resets the attribute to ``None``.
    Nothing is flushed or committed here; ``db`` is not used.
    """
    skip_when_none = (
        ("status", status),
        ("error_message", error_message),
    )
    for attr_name, new_value in skip_when_none:
        if new_value is not None:
            setattr(exp, attr_name, new_value)

    if completed_at is Ellipsis:
        return
    exp.completed_at = completed_at
|
|
|
|
|
|
async def get_run(
    db: AsyncSession, experiment_id: str, case_id: str, side: str
) -> EvalRun | None:
    """Return the first run matching experiment, case and side, else ``None``."""
    query = (
        select(EvalRun)
        .where(EvalRun.experiment_id == experiment_id)
        .where(EvalRun.case_id == case_id)
        .where(EvalRun.side == side)
    )
    result = await db.execute(query)
    return result.scalars().first()
|
|
|
|
|
|
async def create_run(
    db: AsyncSession,
    *,
    experiment_id: str,
    case_id: str,
    side: str,
) -> EvalRun:
    """Create a run for one (experiment, case, side) combination.

    The run starts with status ``"pending"`` under a freshly generated id.
    It is added to the session and flushed; no commit is issued here.
    """
    column_values = {
        "id": _id(),
        "experiment_id": experiment_id,
        "case_id": case_id,
        "side": side,
        "status": "pending",
    }
    run = EvalRun(**column_values)
    db.add(run)
    await db.flush()
    return run
|
|
|
|
|
|
async def list_runs_for_experiment(
    db: AsyncSession, experiment_id: str
) -> list[EvalRun]:
    """Return all runs of one experiment; no ordering is applied."""
    in_experiment = EvalRun.experiment_id == experiment_id
    result = await db.execute(select(EvalRun).where(in_experiment))
    runs = result.scalars().all()
    return list(runs)
|
|
|
|
|
|
async def list_runs_for_source_conversation(
    db: AsyncSession,
    *,
    source_conversation_id: str,
    limit: int = 80,
) -> list[tuple[EvalRun, EvalCase, EvalExperiment]]:
    """Return runs whose case was sourced from the given conversation.

    Each result row carries the run together with its joined case and
    experiment. Rows are ordered by ``completed_at`` descending, then
    ``started_at`` descending (NULLs last in both), capped at ``limit``.
    """
    recency = (
        EvalRun.completed_at.desc().nulls_last(),
        EvalRun.started_at.desc().nulls_last(),
    )
    query = select(EvalRun, EvalCase, EvalExperiment)
    query = query.join(EvalCase, EvalRun.case_id == EvalCase.id)
    query = query.join(EvalExperiment, EvalRun.experiment_id == EvalExperiment.id)
    query = query.where(EvalCase.source_conversation_id == source_conversation_id)
    query = query.order_by(*recency).limit(limit)
    rows = await db.execute(query)
    return list(rows.all())
|
|
|
|
|
|
async def update_run(
    db: AsyncSession,
    run: EvalRun,
    *,
    status: str | None = None,
    error_message: str | None = None,
    memoir_markdown: str | None = None,
    conversation_score_total: float | None = None,
    memoir_score_total: float | None = None,
    composite_score: float | None = None,
    judge_bundle_json: dict[str, Any] | None = None,
    started_at: Any | None = ...,
    completed_at: Any | None = ...,
) -> None:
    """Apply the supplied field changes to ``run`` in place.

    Most fields are written only when the argument is not ``None``; falsy
    values such as ``0.0`` or ``{}`` are still written. ``started_at`` and
    ``completed_at`` use ``...`` (Ellipsis) as their "not supplied" marker,
    so passing ``None`` explicitly resets them to ``None``. Nothing is
    flushed or committed here; ``db`` is not used.
    """
    skip_when_none = {
        "status": status,
        "error_message": error_message,
        "memoir_markdown": memoir_markdown,
        "conversation_score_total": conversation_score_total,
        "memoir_score_total": memoir_score_total,
        "composite_score": composite_score,
        "judge_bundle_json": judge_bundle_json,
    }
    for attr_name, new_value in skip_when_none.items():
        if new_value is not None:
            setattr(run, attr_name, new_value)

    skip_when_ellipsis = (
        ("started_at", started_at),
        ("completed_at", completed_at),
    )
    for attr_name, new_value in skip_when_ellipsis:
        if new_value is not Ellipsis:
            setattr(run, attr_name, new_value)
|
|
|
|
|
|
async def add_turn(
    db: AsyncSession,
    *,
    run_id: str,
    turn_index: int,
    user_utterance: str,
    assistant_reply: str | None,
    duration_ms: int | None,
    judge_scores_json: dict[str, Any] | None,
    judge_rationale: str | None,
) -> EvalRunTurn:
    """Insert the turn for ``(run_id, turn_index)``, or overwrite it if present.

    Implemented as a PostgreSQL ``INSERT ... ON CONFLICT DO UPDATE`` on the
    ``uq_eval_run_turn_index`` constraint, so a Celery retry that replays the
    same turn does not raise a UniqueViolation. On conflict the existing
    row keeps its id and only the content columns are replaced.

    Returns:
        The ``EvalRunTurn`` for ``(run_id, turn_index)``, re-read after the
        upsert.
    """
    ins = pg_insert(EvalRunTurn).values(
        id=_id(),  # only used when the row does not exist yet
        run_id=run_id,
        turn_index=turn_index,
        user_utterance=user_utterance,
        assistant_reply=assistant_reply,
        duration_ms=duration_ms,
        judge_scores_json=judge_scores_json,
        judge_rationale=judge_rationale,
    )
    stmt = ins.on_conflict_do_update(
        constraint="uq_eval_run_turn_index",
        set_={
            "user_utterance": ins.excluded.user_utterance,
            "assistant_reply": ins.excluded.assistant_reply,
            "duration_ms": ins.excluded.duration_ms,
            "judge_scores_json": ins.excluded.judge_scores_json,
            "judge_rationale": ins.excluded.judge_rationale,
        },
    )
    await db.execute(stmt)
    await db.flush()

    # The upsert above does not synchronise objects already loaded in this
    # session. populate_existing forces the SELECT to refresh such an
    # instance instead of returning it with its pre-upsert attribute values.
    res = await db.execute(
        select(EvalRunTurn)
        .where(
            EvalRunTurn.run_id == run_id,
            EvalRunTurn.turn_index == turn_index,
        )
        .limit(1)
        .execution_options(populate_existing=True)
    )
    return res.scalar_one()
|
|
|
|
|
|
async def list_turns(db: AsyncSession, run_id: str) -> list[EvalRunTurn]:
    """Return the turns of one run in ascending ``turn_index`` order."""
    query = select(EvalRunTurn).where(EvalRunTurn.run_id == run_id)
    query = query.order_by(EvalRunTurn.turn_index.asc())
    result = await db.execute(query)
    turns = result.scalars().all()
    return list(turns)
|
|
|
|
|
|
async def upsert_gate_verdict(
    db: AsyncSession,
    *,
    experiment_id: str,
    passed: bool,
    mean_composite_delta: float | None,
    protected_regressions_json: list[dict[str, Any]] | None,
    details_json: dict[str, Any] | None,
) -> EvalGateVerdict:
    """Create or overwrite the gate verdict of an experiment.

    The first verdict found for ``experiment_id`` has its result fields
    replaced; when none exists, a new one is created under a freshly
    generated id. Either way the session is flushed (not committed) and
    the verdict row is returned.
    """
    lookup = select(EvalGateVerdict).where(
        EvalGateVerdict.experiment_id == experiment_id
    )
    verdict = (await db.execute(lookup)).scalars().first()

    if not verdict:
        verdict = EvalGateVerdict(
            id=_id(),
            experiment_id=experiment_id,
            passed=passed,
            mean_composite_delta=mean_composite_delta,
            protected_regressions_json=protected_regressions_json,
            details_json=details_json,
        )
        db.add(verdict)
    else:
        verdict.passed = passed
        verdict.mean_composite_delta = mean_composite_delta
        verdict.protected_regressions_json = protected_regressions_json
        verdict.details_json = details_json

    await db.flush()
    return verdict
|
|
|
|
|
|
async def get_gate_verdict(
    db: AsyncSession, experiment_id: str
) -> EvalGateVerdict | None:
    """Return the first gate verdict stored for an experiment, else ``None``."""
    for_experiment = EvalGateVerdict.experiment_id == experiment_id
    result = await db.execute(select(EvalGateVerdict).where(for_experiment))
    return result.scalars().first()
|