feat/ eval
This commit is contained in:
@@ -2,32 +2,55 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from typing import Annotated
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||
from fastapi.responses import StreamingResponse
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.core.db import get_async_db
|
||||
from app.features.evaluation.admin_service import EvaluationAdminService
|
||||
from app.features.evaluation.deps import get_evaluation_admin_service
|
||||
from app.features.evaluation.deps import (
|
||||
get_eval_judge_manual_service,
|
||||
get_evaluation_admin_service,
|
||||
get_replay_conversation_service,
|
||||
)
|
||||
from app.features.evaluation.errors import (
|
||||
EvaluationBadRequestError,
|
||||
EvaluationNotFoundError,
|
||||
)
|
||||
from app.features.evaluation.importers.user_export_markdown import (
|
||||
extract_memoir_chapter_sections_from_export_md,
|
||||
extract_source_user_id_from_export_md,
|
||||
)
|
||||
from app.features.evaluation.internal_auth import InternalEvalAuth
|
||||
from app.features.evaluation.judge_manual_service import EvalJudgeManualService
|
||||
from app.features.evaluation.presenters import case_out, run_out
|
||||
from app.features.evaluation.replay_service import ReplayConversationService
|
||||
from app.features.evaluation.schemas import (
|
||||
CaseCreate,
|
||||
CaseOut,
|
||||
EvalRunOut,
|
||||
EvalSandboxOut,
|
||||
ExperimentCreate,
|
||||
ExperimentDetailOut,
|
||||
ExperimentOut,
|
||||
GateVerdictOut,
|
||||
ImportJsonCaseBody,
|
||||
ImportMarkdownBody,
|
||||
ManualJudgeConversationBody,
|
||||
ManualJudgeConversationOut,
|
||||
ManualJudgeConversationStreamBody,
|
||||
ManualJudgeMemoirBody,
|
||||
ManualJudgeMemoirOut,
|
||||
MemoirSectionBaselineOut,
|
||||
RegressionSetCreate,
|
||||
RegressionSetOut,
|
||||
ReplayBootstrapBody,
|
||||
ReplayBootstrapOut,
|
||||
ReplayConversationBody,
|
||||
ReplayConversationOut,
|
||||
SessionDialogueOut,
|
||||
SessionEvalRunsOut,
|
||||
SessionListItem,
|
||||
@@ -37,10 +60,12 @@ from app.features.evaluation.schemas import (
|
||||
UserExportFixtureDetailOut,
|
||||
UserExportFixtureListOut,
|
||||
UserExportFixtureTurnOut,
|
||||
UserMemoirSnapshotOut,
|
||||
VersionCreate,
|
||||
VersionOut,
|
||||
)
|
||||
from app.features.evaluation.session_catalog_service import SessionCatalogService
|
||||
from app.features.evaluation.user_export_fixtures import read_user_export_fixture
|
||||
|
||||
router = APIRouter(tags=["internal-evaluation"])
|
||||
|
||||
@@ -209,6 +234,175 @@ async def list_session_evaluation_runs(
|
||||
return await svc.list_session_evaluation_runs(conversation_id)
|
||||
|
||||
|
||||
@router.post("/sessions/replay-bootstrap", response_model=ReplayBootstrapOut)
async def replay_bootstrap(
    body: ReplayBootstrapBody,
    _auth: InternalEvalAuth,
    replay: Annotated[
        ReplayConversationService, Depends(get_replay_conversation_service)
    ],
):
    """Bootstrap a conversation for ``body.user_id`` via the replay service.

    Delegates to ``replay.bootstrap_conversation`` and wraps the value it
    returns in a ``ReplayBootstrapOut`` as ``conversation_id``.

    Raises:
        Whatever ``_eval_http_exc`` builds when the service raises
        ``EvaluationBadRequestError`` (chained to the original error).
    """
    try:
        conversation_id = await replay.bootstrap_conversation(body.user_id)
    except EvaluationBadRequestError as exc:
        raise _eval_http_exc(exc) from exc
    else:
        return ReplayBootstrapOut(conversation_id=conversation_id)
|
||||
|
||||
|
||||
@router.post("/sessions/eval-sandbox", response_model=EvalSandboxOut)
async def create_eval_sandbox(
    _auth: InternalEvalAuth,
    replay: Annotated[
        ReplayConversationService, Depends(get_replay_conversation_service)
    ],
):
    """Create an evaluation sandbox and describe it in the response.

    ``replay.create_eval_sandbox`` is expected to return a 4-item sequence
    that is unpacked, in order, into user id, conversation id, phone and
    nickname; those values populate ``EvalSandboxOut``.

    Raises:
        Whatever ``_eval_http_exc`` builds when the service raises
        ``EvaluationBadRequestError`` (chained to the original error).
    """
    try:
        sandbox = await replay.create_eval_sandbox()
    except EvaluationBadRequestError as exc:
        raise _eval_http_exc(exc) from exc

    user_id, conversation_id, phone, nickname = sandbox
    return EvalSandboxOut(
        user_id=user_id,
        conversation_id=conversation_id,
        phone=phone,
        nickname=nickname,
    )
|
||||
|
||||
|
||||
@router.post("/replay/conversation", response_model=ReplayConversationOut)
async def replay_conversation(
    body: ReplayConversationBody,
    _auth: InternalEvalAuth,
    replay: Annotated[
        ReplayConversationService, Depends(get_replay_conversation_service)
    ],
):
    """Replay user turns into ``body.conversation_id``.

    Exactly one input source must be supplied:

    * ``body.fixture_filename`` -- replayed through ``replay.replay_fixture``,
      which yields both the turn count and the utterances to echo back.
    * ``body.user_utterances`` -- each item is converted with ``str``;
      items that are blank after stripping are dropped, and the remainder
      is replayed through ``replay.replay_utterances`` and echoed back.

    Raises:
        HTTPException: 400 when both sources are supplied (both truthy).
        Whatever ``_eval_http_exc`` builds for ``EvaluationNotFoundError``
        or ``EvaluationBadRequestError``, including the bad-request errors
        raised here for a missing source or an all-blank utterance list.
    """
    if body.fixture_filename and body.user_utterances:
        raise HTTPException(
            status_code=400,
            detail="provide only one of fixture_filename or user_utterances",
        )

    # Options that are forwarded unchanged to either replay mode.
    shared_kwargs = {
        "conversation_id": body.conversation_id,
        "flush_memoir_after": body.flush_memoir_after,
        "skip_tts": body.skip_tts,
    }

    try:
        if body.fixture_filename:
            turns_replayed, utterances_echo = await replay.replay_fixture(
                fixture_filename=body.fixture_filename.strip(),
                **shared_kwargs,
            )
        elif body.user_utterances is None:
            raise EvaluationBadRequestError(
                "fixture_filename or user_utterances required"
            )
        else:
            utterances_echo = [
                text for text in map(str, body.user_utterances) if text.strip()
            ]
            if not utterances_echo:
                raise EvaluationBadRequestError("user_utterances is empty")
            turns_replayed = await replay.replay_utterances(
                utterances=utterances_echo,
                **shared_kwargs,
            )
    except (EvaluationNotFoundError, EvaluationBadRequestError) as exc:
        raise _eval_http_exc(exc) from exc

    return ReplayConversationOut(
        conversation_id=body.conversation_id,
        turns_replayed=turns_replayed,
        utterances_echo=utterances_echo,
    )
|
||||
|
||||
|
||||
@router.post("/judge/conversation", response_model=ManualJudgeConversationOut)
async def judge_conversation_manual(
    body: ManualJudgeConversationBody,
    _auth: InternalEvalAuth,
    judge_svc: Annotated[
        EvalJudgeManualService, Depends(get_eval_judge_manual_service)
    ],
):
    """Run the manual judge on one conversation and return its verdict.

    Passes ``body.conversation_id`` and ``body.fixture_filename`` to
    ``judge_svc.judge_conversation`` and validates the returned payload
    into a ``ManualJudgeConversationOut``.

    Raises:
        Whatever ``_eval_http_exc`` builds when the service raises
        ``EvaluationNotFoundError`` or ``EvaluationBadRequestError``.
    """
    try:
        verdict = await judge_svc.judge_conversation(
            body.conversation_id,
            body.fixture_filename,
        )
    except (EvaluationNotFoundError, EvaluationBadRequestError) as exc:
        raise _eval_http_exc(exc) from exc
    else:
        return ManualJudgeConversationOut.model_validate(verdict)
|
||||
|
||||
|
||||
@router.post("/judge/conversation-stream")
async def judge_conversation_manual_stream(
    body: ManualJudgeConversationStreamBody,
    _auth: InternalEvalAuth,
    judge_svc: Annotated[
        EvalJudgeManualService, Depends(get_eval_judge_manual_service)
    ],
):
    """Stream manual conversation-judge events as ``text/event-stream``.

    Every event produced by ``judge_svc.iter_conversation_judge_sse`` is
    JSON-encoded (non-ASCII preserved) and sent as one ``data:`` frame.
    If iteration or encoding raises an ``Exception``, the failure is logged
    with its traceback and a single ``error`` frame carrying ``str(exc)``
    is sent in place of the remaining events. A ``done`` frame closes the
    stream after either outcome.

    Returns:
        A ``StreamingResponse`` with caching disabled and the
        ``X-Accel-Buffering: no`` header set.
    """
    # Local import keeps this block self-contained; no module-level logger
    # is visible in this part of the file.
    import logging

    log = logging.getLogger(__name__)

    def _sse_frame(payload) -> str:
        # One SSE message: a "data:" line terminated by a blank line.
        return f"data: {json.dumps(payload, ensure_ascii=False)}\n\n"

    async def event_iter():
        try:
            async for evt in judge_svc.iter_conversation_judge_sse(
                body.conversation_id,
                body.fixture_filename,
            ):
                yield _sse_frame(evt)
        except Exception as exc:
            # Headers are already sent once streaming starts, so the error
            # can only be reported in-band. Log it as well, otherwise the
            # traceback is lost on the server side.
            log.exception(
                "conversation judge stream failed (conversation_id=%s)",
                body.conversation_id,
            )
            yield _sse_frame(
                {"event": "error", "phase": "server", "message": str(exc)}
            )
        yield _sse_frame({"event": "done"})

    return StreamingResponse(
        event_iter(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "X-Accel-Buffering": "no",
        },
    )
|
||||
|
||||
|
||||
@router.post("/judge/memoir-chapters", response_model=ManualJudgeMemoirOut)
async def judge_memoir_chapters_manual(
    body: ManualJudgeMemoirBody,
    _auth: InternalEvalAuth,
    judge_svc: Annotated[
        EvalJudgeManualService, Depends(get_eval_judge_manual_service)
    ],
):
    """Run the manual memoir judge for ``body.user_id``.

    Passes ``body.user_id`` and ``body.baseline_sections`` to
    ``judge_svc.judge_memoir_for_user`` and validates the returned payload
    into a ``ManualJudgeMemoirOut``.

    Raises:
        Whatever ``_eval_http_exc`` builds when the service raises
        ``EvaluationBadRequestError`` (chained to the original error).
    """
    try:
        verdict = await judge_svc.judge_memoir_for_user(
            body.user_id,
            body.baseline_sections,
        )
    except EvaluationBadRequestError as exc:
        raise _eval_http_exc(exc) from exc
    else:
        return ManualJudgeMemoirOut.model_validate(verdict)
|
||||
|
||||
|
||||
@router.get(
    "/users/{user_id}/memoir-snapshot",
    response_model=UserMemoirSnapshotOut,
)
async def get_user_memoir_snapshot(
    user_id: str,
    _auth: InternalEvalAuth,
    judge_svc: Annotated[
        EvalJudgeManualService, Depends(get_eval_judge_manual_service)
    ],
):
    """Return the memoir snapshot for the user named in the path.

    Calls ``judge_svc.memoir_snapshot(user_id)`` and validates the returned
    payload into a ``UserMemoirSnapshotOut``.

    Raises:
        Whatever ``_eval_http_exc`` builds when the service raises
        ``EvaluationBadRequestError`` (chained to the original error).
    """
    try:
        snapshot = await judge_svc.memoir_snapshot(user_id)
    except EvaluationBadRequestError as exc:
        raise _eval_http_exc(exc) from exc
    else:
        return UserMemoirSnapshotOut.model_validate(snapshot)
|
||||
|
||||
|
||||
@router.get(
|
||||
"/fixtures/user-exports",
|
||||
response_model=UserExportFixtureListOut,
|
||||
@@ -227,19 +421,23 @@ async def list_user_export_fixtures(
|
||||
async def get_user_export_fixture(
|
||||
filename: str,
|
||||
_auth: InternalEvalAuth,
|
||||
svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)],
|
||||
):
|
||||
try:
|
||||
turns = svc.load_user_export_fixture_turns(filename)
|
||||
turns, raw_md = read_user_export_fixture(filename)
|
||||
except ValueError:
|
||||
raise HTTPException(
|
||||
status_code=400, detail="invalid fixture filename"
|
||||
) from None
|
||||
except FileNotFoundError:
|
||||
raise HTTPException(status_code=404, detail="fixture not found") from None
|
||||
memoir_tuples = extract_memoir_chapter_sections_from_export_md(raw_md)
|
||||
return UserExportFixtureDetailOut(
|
||||
filename=filename,
|
||||
turns=[UserExportFixtureTurnOut(user=u, ai=a) for u, a in turns],
|
||||
source_user_id=extract_source_user_id_from_export_md(raw_md),
|
||||
memoir_sections=[
|
||||
MemoirSectionBaselineOut(title=t, body=b) for t, b in memoir_tuples
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user