195 lines
6.5 KiB
Python
195 lines
6.5 KiB
Python
|
|
"""ORM:内部回归评测(与生产 conversation 表隔离)。"""
|
|||
|
|
|
|||
|
|
from __future__ import annotations
|
|||
|
|
|
|||
|
|
from sqlalchemy import (
|
|||
|
|
JSON,
|
|||
|
|
Boolean,
|
|||
|
|
Column,
|
|||
|
|
DateTime,
|
|||
|
|
Float,
|
|||
|
|
ForeignKey,
|
|||
|
|
Integer,
|
|||
|
|
String,
|
|||
|
|
Text,
|
|||
|
|
UniqueConstraint,
|
|||
|
|
)
|
|||
|
|
from sqlalchemy.orm import relationship
|
|||
|
|
|
|||
|
|
from app.core.db import Base, utc_now
|
|||
|
|
|
|||
|
|
|
|||
|
|
class EvalRegressionSet(Base):
    """Named regression set, persisted in ``eval_regression_sets``.

    The set owns its ``EvalCase`` rows: the ``cases`` relationship is
    configured with ``cascade="all, delete-orphan"``.  ``experiments`` links
    the ``EvalExperiment`` rows that reference this set and declares no
    cascade options.
    """

    __tablename__ = "eval_regression_sets"

    # ---- columns -----------------------------------------------------
    id = Column(String, primary_key=True)
    name = Column(String, nullable=False)
    description = Column(Text, nullable=True)
    # Python-side default: ``utc_now`` is called when no value is supplied.
    created_at = Column(DateTime(timezone=True), nullable=False, default=utc_now)

    # ---- relationships -----------------------------------------------
    cases = relationship(
        "EvalCase", back_populates="regression_set", cascade="all, delete-orphan"
    )
    experiments = relationship(
        "EvalExperiment",
        back_populates="regression_set",
    )
|
|||
|
|
|
|||
|
|
|
|||
|
|
class EvalCase(Base):
    """Immutable list of user turns for one regression case.

    Per the original note, the turns come from a snapshot of a real
    conversation or from an import script.  Rows live in ``eval_cases`` and
    each one references a single ``EvalRegressionSet``.
    """

    __tablename__ = "eval_cases"

    # ---- columns -----------------------------------------------------
    id = Column(String, primary_key=True)
    regression_set_id = Column(
        String,
        ForeignKey("eval_regression_sets.id"),
        nullable=False,
    )
    # Provenance fields: indexed plain strings; no ForeignKey is declared
    # on them in this model.
    source_conversation_id = Column(String, index=True, nullable=True)
    source_user_id = Column(String, index=True, nullable=True)
    title = Column(String, nullable=True)
    # JSON payload with the user turns (exact schema is not visible here).
    user_utterances = Column(JSON, nullable=False)
    reference_memoir_markdown = Column(Text, nullable=True)
    # False by default, both client-side (``default``) and in the DDL
    # (``server_default``).
    is_protected = Column(
        Boolean,
        default=False,
        server_default="false",
        nullable=False,
    )
    meta = Column(JSON, nullable=True)
    # Python-side default: ``utc_now`` is called when no value is supplied.
    created_at = Column(DateTime(timezone=True), nullable=False, default=utc_now)

    # ---- relationships -----------------------------------------------
    regression_set = relationship(
        "EvalRegressionSet",
        back_populates="cases",
    )
    runs = relationship(
        "EvalRun",
        back_populates="case",
    )
|
|||
|
|
|
|||
|
|
|
|||
|
|
class EvalVersion(Base):
    """Baseline or candidate: a replay configuration.

    Per the original note, the configuration covers things such as the model
    and system-prompt additions.  Rows live in ``eval_versions``.

    ``EvalExperiment`` holds two foreign keys to this table, so each reverse
    relationship below names the foreign key it follows via ``foreign_keys``.
    """

    __tablename__ = "eval_versions"

    # ---- columns -----------------------------------------------------
    id = Column(String, primary_key=True)
    name = Column(String, nullable=False)
    # Runner identifier; "llm_chat_v1" is used when none is given.
    runner_kind = Column(String, default="llm_chat_v1", nullable=False)
    config_json = Column(JSON, nullable=True)
    # Python-side default: ``utc_now`` is called when no value is supplied.
    created_at = Column(DateTime(timezone=True), nullable=False, default=utc_now)

    # ---- relationships -----------------------------------------------
    # Experiments in which this version is the baseline side.
    experiments_as_baseline = relationship(
        "EvalExperiment",
        back_populates="baseline_version",
        foreign_keys="EvalExperiment.baseline_version_id",
    )
    # Experiments in which this version is the candidate side.
    experiments_as_candidate = relationship(
        "EvalExperiment",
        back_populates="candidate_version",
        foreign_keys="EvalExperiment.candidate_version_id",
    )
|
|||
|
|
|
|||
|
|
|
|||
|
|
class EvalExperiment(Base):
    """Experiment pairing a baseline and a candidate ``EvalVersion``.

    Rows live in ``eval_experiments``.  Each experiment references one
    ``EvalRegressionSet`` and two ``EvalVersion`` rows (baseline and
    candidate).  It owns its ``EvalRun`` rows and at most one
    ``EvalGateVerdict``; both are configured with
    ``cascade="all, delete-orphan"``.
    """

    __tablename__ = "eval_experiments"

    # ---- columns -----------------------------------------------------
    id = Column(String, primary_key=True)
    name = Column(String, nullable=False)
    regression_set_id = Column(
        String, ForeignKey("eval_regression_sets.id"), nullable=False
    )
    baseline_version_id = Column(String, ForeignKey("eval_versions.id"), nullable=False)
    candidate_version_id = Column(
        String, ForeignKey("eval_versions.id"), nullable=False
    )
    rubric_pack = Column(String, nullable=False, default="conversation_v1+memoir_v1")
    composite_weights_json = Column(JSON, nullable=True)
    # Lifecycle marker; new rows start as "pending".
    status = Column(String, nullable=False, default="pending")
    error_message = Column(Text, nullable=True)
    # Python-side default: ``utc_now`` is called when no value is supplied.
    created_at = Column(DateTime(timezone=True), default=utc_now, nullable=False)
    completed_at = Column(DateTime(timezone=True), nullable=True)

    # ---- relationships -----------------------------------------------
    regression_set = relationship("EvalRegressionSet", back_populates="experiments")
    # Two foreign keys target ``eval_versions``, so each relationship names
    # its own column.  ``back_populates`` mirrors the declaration on
    # ``EvalVersion`` so that assigning a version here is also reflected in
    # the version's in-memory experiment collection.
    baseline_version = relationship(
        "EvalVersion",
        foreign_keys=[baseline_version_id],
        back_populates="experiments_as_baseline",
    )
    candidate_version = relationship(
        "EvalVersion",
        foreign_keys=[candidate_version_id],
        back_populates="experiments_as_candidate",
    )
    runs = relationship(
        "EvalRun", back_populates="experiment", cascade="all, delete-orphan"
    )
    # One-to-one: ``uselist=False`` exposes a scalar instead of a list.
    gate_verdict = relationship(
        "EvalGateVerdict",
        back_populates="experiment",
        uselist=False,
        cascade="all, delete-orphan",
    )
|
|||
|
|
|
|||
|
|
|
|||
|
|
class EvalRun(Base):
    """A single run: one experiment x one case x baseline or candidate.

    Rows live in ``eval_runs``.  The ``(experiment_id, case_id, side)``
    triple is unique.  The run's ``EvalRunTurn`` rows are loaded in
    ``turn_index`` order and are configured with
    ``cascade="all, delete-orphan"``.
    """

    __tablename__ = "eval_runs"
    __table_args__ = (
        UniqueConstraint(
            "experiment_id", "case_id", "side", name="uq_eval_run_experiment_case_side"
        ),
    )

    # ---- columns -----------------------------------------------------
    id = Column(String, primary_key=True)
    experiment_id = Column(
        String,
        ForeignKey("eval_experiments.id"),
        nullable=False,
    )
    case_id = Column(
        String,
        ForeignKey("eval_cases.id"),
        nullable=False,
    )
    # Which side of the comparison this run belongs to; presumably
    # "baseline" or "candidate" -- the exact values are not visible here.
    side = Column(String, nullable=False)
    # Lifecycle marker; new rows start as "pending".
    status = Column(String, default="pending", nullable=False)
    error_message = Column(Text, nullable=True)
    memoir_markdown = Column(Text, nullable=True)
    # Scores are nullable and carry no default.
    conversation_score_total = Column(Float, nullable=True)
    memoir_score_total = Column(Float, nullable=True)
    composite_score = Column(Float, nullable=True)
    judge_bundle_json = Column(JSON, nullable=True)
    started_at = Column(DateTime(timezone=True), nullable=True)
    completed_at = Column(DateTime(timezone=True), nullable=True)

    # ---- relationships -----------------------------------------------
    experiment = relationship(
        "EvalExperiment",
        back_populates="runs",
    )
    case = relationship(
        "EvalCase",
        back_populates="runs",
    )
    turns = relationship(
        "EvalRunTurn",
        order_by="EvalRunTurn.turn_index",
        cascade="all, delete-orphan",
        back_populates="run",
    )
|
|||
|
|
|
|||
|
|
|
|||
|
|
class EvalRunTurn(Base):
    """One turn of an ``EvalRun``, persisted in ``eval_run_turns``.

    Each row stores the user utterance, the optional assistant reply and
    optional judge output.  ``(run_id, turn_index)`` is unique.
    """

    __tablename__ = "eval_run_turns"
    __table_args__ = (
        UniqueConstraint(
            "run_id",
            "turn_index",
            name="uq_eval_run_turn_index",
        ),
    )

    # ---- columns -----------------------------------------------------
    id = Column(String, primary_key=True)
    run_id = Column(
        String,
        ForeignKey("eval_runs.id"),
        nullable=False,
    )
    # Position of the turn inside its run; ``EvalRun.turns`` orders by it.
    turn_index = Column(Integer, nullable=False)
    user_utterance = Column(Text, nullable=False)
    assistant_reply = Column(Text, nullable=True)
    duration_ms = Column(Integer, nullable=True)
    judge_scores_json = Column(JSON, nullable=True)
    judge_rationale = Column(Text, nullable=True)

    # ---- relationships -----------------------------------------------
    run = relationship(
        "EvalRun",
        back_populates="turns",
    )
|
|||
|
|
|
|||
|
|
|
|||
|
|
class EvalGateVerdict(Base):
    """Gate verdict for an experiment, persisted in ``eval_gate_verdicts``.

    ``experiment_id`` is unique, so an experiment has at most one verdict
    row.
    """

    __tablename__ = "eval_gate_verdicts"

    # ---- columns -----------------------------------------------------
    id = Column(String, primary_key=True)
    experiment_id = Column(
        String,
        ForeignKey("eval_experiments.id"),
        unique=True,
        nullable=False,
    )
    passed = Column(Boolean, nullable=False)
    mean_composite_delta = Column(Float, nullable=True)
    protected_regressions_json = Column(JSON, nullable=True)
    details_json = Column(JSON, nullable=True)
    # Python-side default: ``utc_now`` is called when no value is supplied.
    computed_at = Column(DateTime(timezone=True), nullable=False, default=utc_now)

    # ---- relationships -----------------------------------------------
    experiment = relationship(
        "EvalExperiment",
        back_populates="gate_verdict",
    )
|