diff --git a/api/app/features/evaluation/admin_service.py b/api/app/features/evaluation/admin_service.py index a269de7..a58130c 100644 --- a/api/app/features/evaluation/admin_service.py +++ b/api/app/features/evaluation/admin_service.py @@ -26,16 +26,25 @@ from app.features.evaluation.models import ( EvalRunTurn, EvalVersion, ) +from app.features.evaluation.presenters import run_out from app.features.evaluation.schemas import ( CaseCreate, ExperimentCreate, ImportJsonCaseBody, ImportMarkdownBody, RegressionSetCreate, + SessionEvalRunItem, + SessionEvalRunsOut, SnapshotFromConversationBody, VersionCreate, ) from app.features.evaluation.session_catalog_service import SessionCatalogService +from app.features.evaluation.user_export_fixtures import ( + list_user_export_fixture_names as list_user_export_md_filenames, +) +from app.features.evaluation.user_export_fixtures import ( + read_user_export_fixture, +) from app.tasks.evaluation_tasks import run_eval_experiment_task @@ -188,6 +197,23 @@ class EvaluationAdminService: async def list_experiments(self, *, limit: int) -> list[EvalExperiment]: return await eval_repo.list_experiments(self._db, limit=limit) + async def list_session_evaluation_runs( + self, conversation_id: str + ) -> SessionEvalRunsOut: + rows = await eval_repo.list_runs_for_source_conversation( + self._db, source_conversation_id=conversation_id + ) + items: list[SessionEvalRunItem] = [] + for run, _case, exp in rows: + turns = await eval_repo.list_turns(self._db, run.id) + items.append( + SessionEvalRunItem( + experiment_name=exp.name, + run=run_out(run, turns), + ) + ) + return SessionEvalRunsOut(conversation_id=conversation_id, items=items) + async def create_experiment(self, body: ExperimentCreate) -> EvalExperiment: rs = await eval_repo.get_regression_set(self._db, body.regression_set_id) if not rs: @@ -232,7 +258,6 @@ class EvaluationAdminService: async def experiment_stream_snapshot( self, experiment_id: str ) -> dict[str, Any] | None: - from app.features.evaluation.presenters import run_out from app.features.evaluation.schemas import GateVerdictOut exp = await eval_repo.get_experiment(self._db, experiment_id) @@ -250,3 +275,10 @@ class EvaluationAdminService: "runs": run_payload, "gate": GateVerdictOut.model_validate(gv).model_dump() if gv else None, } + + def list_user_export_fixture_names(self) -> list[str]: + return list_user_export_md_filenames() + + def load_user_export_fixture_turns(self, filename: str) -> list[tuple[str, str]]: + turns, _ = read_user_export_fixture(filename) + return turns diff --git a/api/app/features/evaluation/execution_service.py b/api/app/features/evaluation/execution_service.py index 39943ce..204012b 100644 --- a/api/app/features/evaluation/execution_service.py +++ b/api/app/features/evaluation/execution_service.py @@ -23,6 +23,17 @@ from app.features.evaluation.models import EvalCase, EvalRun, EvalVersion logger = get_logger(__name__) +_MAX_JUDGE_MARKDOWN_CHARS = 20_000 +_MAX_EVAL_CHAPTERS = 30 +_MAX_EVAL_STORIES = 40 + + +def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) -> str: + s = (text or "").strip() + if len(s) <= max_chars: + return s + return f"{s[:max_chars]}\n\n…(已截断供评审)" + def _composite( conv: float | None, mem: float | None, weights: dict[str, Any] | None @@ -149,7 +160,66 @@ async def execute_eval_run( memoir_md = simple_memoir_from_transcript(utterances, replies) mem_out = await judge.judge_memoir(memoir_markdown=memoir_md) - mem_total = mem_out.total_score if mem_out else None + + chapter_entries: list[dict[str, Any]] = [] + story_entries: list[dict[str, Any]] = [] + uid = (case.source_user_id or "").strip() + if uid: + from app.features.memoir.repo import get_chapters_for_memoir_list + from app.features.story.repo import get_stories_for_user + + try: + chapters = await get_chapters_for_memoir_list( + uid, db, active_only=True, is_new_only=None + ) + for ch in chapters[:_MAX_EVAL_CHAPTERS]: + body = (ch.canonical_markdown or "").strip() + if not body: + continue + md = f"# 章节:{ch.title}\n\n{_clip_md_for_judge(body)}" + cj = await judge.judge_memoir(memoir_markdown=md) + chapter_entries.append( + { + "id": ch.id, + "title": ch.title, + "order_index": ch.order_index, + "judge": cj.model_dump() if cj else None, + } + ) + except Exception as e: + logger.warning("eval chapter judges skipped: {}", e) + + try: + stories = await get_stories_for_user(db, uid, status="active") + for st in stories[:_MAX_EVAL_STORIES]: + body = (st.canonical_markdown or "").strip() + if not body: + continue + md = f"# 故事:{st.title}\n\n{_clip_md_for_judge(body)}" + sj = await judge.judge_memoir(memoir_markdown=md) + story_entries.append( + { + "id": st.id, + "title": st.title, + "stage": st.stage, + "judge": sj.model_dump() if sj else None, + } + ) + except Exception as e: + logger.warning("eval story judges skipped: {}", e) + + mem_parts: list[float] = [] + if mem_out is not None: + mem_parts.append(float(mem_out.total_score)) + for row in chapter_entries: + j = row.get("judge") + if isinstance(j, dict) and j.get("total_score") is not None: + mem_parts.append(float(j["total_score"])) + for row in story_entries: + j = row.get("judge") + if isinstance(j, dict) and j.get("total_score") is not None: + mem_parts.append(float(j["total_score"])) + mem_total = sum(mem_parts) / len(mem_parts) if mem_parts else None exp = await eval_repo.get_experiment(db, run.experiment_id) weights = exp.composite_weights_json if exp else None @@ -158,6 +228,8 @@ async def execute_eval_run( bundle: dict[str, Any] = { "conversation_judge": conv_out.model_dump() if conv_out else None, "memoir_judge": mem_out.model_dump() if mem_out else None, + "chapters": chapter_entries, + "stories": story_entries, } await eval_repo.update_run( db, diff --git a/api/app/features/evaluation/importers/user_export_markdown.py b/api/app/features/evaluation/importers/user_export_markdown.py index cc03580..92722cb 100644 --- a/api/app/features/evaluation/importers/user_export_markdown.py +++ b/api/app/features/evaluation/importers/user_export_markdown.py @@ -17,3 +17,30 @@ def extract_user_utterances_from_export_md(text: str) -> list[str]: if chunk and chunk != "(空)": out.append(chunk) return out + + +def extract_dialogue_turns_from_export_md(text: str) -> list[tuple[str, str]]: + """ + 从 extract_sql_to_user_md 导出的 Markdown 中按「轮次」提取 (用户, AI) 对,供评测台对照。 + """ + chunks = re.split(r"\n####\s*轮次\s*\d+[^\n]*", text) + out: list[tuple[str, str]] = [] + for chunk in chunks[1:]: + user_m = re.search( + r"\*\*用户:\*\*\s*\n+(.+?)(?=\n\*\*AI:\*\*)", + chunk, + flags=re.DOTALL | re.IGNORECASE, + ) + ai_m = re.search( + r"\*\*AI:\*\*\s*\n+(.+?)(?=\n####\s|\n###\s+[^#]|\Z)", + chunk, + flags=re.DOTALL | re.IGNORECASE, + ) + if not user_m: + continue + u = (user_m.group(1) or "").strip() + if not u or u == "(空)": + continue + a = ((ai_m.group(1) if ai_m else "") or "").strip() + out.append((u, a)) + return out diff --git a/api/app/features/evaluation/presenters.py b/api/app/features/evaluation/presenters.py index 7223cea..7a655b4 100644 --- a/api/app/features/evaluation/presenters.py +++ b/api/app/features/evaluation/presenters.py @@ -21,5 +21,6 @@ def run_out(row, turns: list) -> EvalRunOut: conversation_score_total=row.conversation_score_total, memoir_score_total=row.memoir_score_total, composite_score=row.composite_score, + judge_bundle_json=row.judge_bundle_json, turns=[RunTurnOut.model_validate(t) for t in turns], ) diff --git a/api/app/features/evaluation/repo.py b/api/app/features/evaluation/repo.py index 70c4f0e..0e3c105 100644 --- a/api/app/features/evaluation/repo.py +++ b/api/app/features/evaluation/repo.py @@ -204,6 +204,27 @@ async def list_runs_for_experiment( return list(res.scalars().all()) +async def list_runs_for_source_conversation( + db: AsyncSession, + *, + source_conversation_id: str, + limit: int = 80, +) -> list[tuple[EvalRun, EvalCase, EvalExperiment]]: + stmt = ( + select(EvalRun, EvalCase, EvalExperiment) + .join(EvalCase, EvalRun.case_id == EvalCase.id) + .join(EvalExperiment, EvalRun.experiment_id == EvalExperiment.id) + .where(EvalCase.source_conversation_id == source_conversation_id) + .order_by( + EvalRun.completed_at.desc().nulls_last(), + EvalRun.started_at.desc().nulls_last(), + ) + .limit(limit) + ) + res = await db.execute(stmt) + return list(res.all()) + + async def update_run( db: AsyncSession, run: EvalRun, diff --git a/api/app/features/evaluation/router.py b/api/app/features/evaluation/router.py index 848461b..2568480 100644 --- a/api/app/features/evaluation/router.py +++ b/api/app/features/evaluation/router.py @@ -28,10 +28,15 @@ from app.features.evaluation.schemas import ( ImportMarkdownBody, RegressionSetCreate, RegressionSetOut, + SessionDialogueOut, + SessionEvalRunsOut, SessionListItem, SessionListResponse, SessionTranscriptOut, SnapshotFromConversationBody, + UserExportFixtureDetailOut, + UserExportFixtureListOut, + UserExportFixtureTurnOut, VersionCreate, VersionOut, ) @@ -40,6 +45,12 @@ from app.features.evaluation.session_catalog_service import SessionCatalogServic router = APIRouter(tags=["internal-evaluation"]) +@router.get("/ping", include_in_schema=False) +async def eval_api_ping() -> dict[str, str | bool]: + """无鉴权:确认当前进程是 internal_main 且路由已挂载。""" + return {"ok": True, "service": "life-echo-internal-eval"} + + def _eval_http_exc( e: EvaluationNotFoundError | EvaluationBadRequestError, ) -> HTTPException: @@ -123,17 +134,23 @@ async def list_sessions( limit: int = Query(50, ge=1, le=200), user_id: str | None = Query(None), q: str | None = Query(None), + status: str | None = Query( + None, + description="按会话 status 过滤,如 active", + ), ): catalog = SessionCatalogService(db) rows, total = await catalog.list_sessions( - offset=offset, limit=limit, user_id=user_id, q=q + offset=offset, limit=limit, user_id=user_id, q=q, status=status ) return SessionListResponse( items=[ SessionListItem( id=r.id, user_id=r.user_id, + user_phone=r.user_phone, started_at=r.started_at, + last_message_at=r.last_message_at, conversation_stage=r.conversation_stage, current_topic=r.current_topic, status=r.status, @@ -144,6 +161,22 @@ async def list_sessions( ) +@router.get( + "/sessions/{conversation_id}/dialogue", + response_model=SessionDialogueOut, +) +async def get_session_dialogue( + conversation_id: str, + _auth: InternalEvalAuth, + db: Annotated[AsyncSession, Depends(get_async_db)], +): + catalog = SessionCatalogService(db) + out = await catalog.get_session_dialogue(conversation_id) + if not out: + raise HTTPException(status_code=404, detail="conversation not found") + return out + + @router.get( "/sessions/{conversation_id}/transcript", response_model=SessionTranscriptOut ) @@ -164,6 +197,52 @@ async def get_session_transcript( ) +@router.get( + "/sessions/{conversation_id}/evaluation-runs", + response_model=SessionEvalRunsOut, +) +async def list_session_evaluation_runs( + conversation_id: str, + _auth: InternalEvalAuth, + svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)], +): + return await svc.list_session_evaluation_runs(conversation_id) + + +@router.get( + "/fixtures/user-exports", + response_model=UserExportFixtureListOut, +) +async def list_user_export_fixtures( + _auth: InternalEvalAuth, + svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)], +): + return UserExportFixtureListOut(items=svc.list_user_export_fixture_names()) + + +@router.get( + "/fixtures/user-exports/{filename}", + response_model=UserExportFixtureDetailOut, +) +async def get_user_export_fixture( + filename: str, + _auth: InternalEvalAuth, + svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)], +): + try: + turns = svc.load_user_export_fixture_turns(filename) + except ValueError: + raise HTTPException( + status_code=400, detail="invalid fixture filename" + ) from None + except FileNotFoundError: + raise HTTPException(status_code=404, detail="fixture not found") from None + return UserExportFixtureDetailOut( + filename=filename, + turns=[UserExportFixtureTurnOut(user=u, ai=a) for u, a in turns], + ) + + @router.post("/regression-sets/{set_id}/import-markdown", response_model=CaseOut) async def import_markdown_case( set_id: str, diff --git a/api/app/features/evaluation/schemas.py b/api/app/features/evaluation/schemas.py index 86135e3..386f68a 100644 --- a/api/app/features/evaluation/schemas.py +++ b/api/app/features/evaluation/schemas.py @@ -88,10 +88,25 @@ class ExperimentOut(BaseModel): completed_at: datetime | None +class SessionDialogueMessageOut(BaseModel): + model_config = ConfigDict(from_attributes=True) + + role: str + content: str + created_at: datetime | None = None + + +class SessionDialogueOut(BaseModel): + conversation_id: str + messages: list[SessionDialogueMessageOut] + + class SessionListItem(BaseModel): id: str user_id: str + user_phone: str | None = Field(default=None, description="users.phone,列表展示用") started_at: datetime | None + last_message_at: datetime | None = None conversation_stage: str | None current_topic: str | None status: str | None @@ -109,6 +124,20 @@ class SessionTranscriptOut(BaseModel): user_utterances_from_messages: list[str] +class UserExportFixtureTurnOut(BaseModel): + user: str + ai: str + + +class UserExportFixtureListOut(BaseModel): + items: list[str] + + +class UserExportFixtureDetailOut(BaseModel): + filename: str + turns: list[UserExportFixtureTurnOut] + + class SnapshotFromConversationBody(BaseModel): title: str | None = None use_messages: bool = False @@ -157,9 +186,20 @@ class EvalRunOut(BaseModel): conversation_score_total: float | None memoir_score_total: float | None composite_score: float | None + judge_bundle_json: dict[str, Any] | None = None turns: list[RunTurnOut] = [] +class SessionEvalRunItem(BaseModel): + experiment_name: str + run: EvalRunOut + + +class SessionEvalRunsOut(BaseModel): + conversation_id: str + items: list[SessionEvalRunItem] + + class GateVerdictOut(BaseModel): model_config = ConfigDict(from_attributes=True) diff --git a/api/app/features/evaluation/session_catalog_repo.py b/api/app/features/evaluation/session_catalog_repo.py index 9af2eb1..41a4e8a 100644 --- a/api/app/features/evaluation/session_catalog_repo.py +++ b/api/app/features/evaluation/session_catalog_repo.py @@ -4,6 +4,7 @@ from __future__ import annotations from sqlalchemy import func, select from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import joinedload from app.features.conversation.models import Conversation, ConversationMessage, Segment @@ -12,12 +13,14 @@ class SessionCatalogRepo: def __init__(self, db: AsyncSession) -> None: self._db = db - async def count_conversations(self) -> int: + async def count_conversations(self, *, status: str | None = None) -> int: q = ( select(func.count()) .select_from(Conversation) .where(Conversation.deleted_at.is_(None)) ) + if status: + q = q.where(Conversation.status == status) r = await self._db.execute(q) return int(r.scalar() or 0) @@ -28,11 +31,20 @@ class SessionCatalogRepo: limit: int = 50, user_id: str | None = None, q_text: str | None = None, + status: str | None = None, ) -> list[Conversation]: stmt = select(Conversation).where(Conversation.deleted_at.is_(None)) if user_id: stmt = stmt.where(Conversation.user_id == user_id) - stmt = stmt.order_by(Conversation.started_at.desc().nullslast()) + if status: + stmt = stmt.where(Conversation.status == status) + if status == "active": + stmt = stmt.order_by( + Conversation.last_message_at.desc().nullslast(), + Conversation.started_at.desc().nullslast(), + ) + else: + stmt = stmt.order_by(Conversation.started_at.desc().nullslast()) stmt = stmt.offset(offset).limit(limit) # q_text: 简单按 topic 搜索(后续可扩展全文) if q_text: @@ -41,6 +53,7 @@ class SessionCatalogRepo: (Conversation.current_topic.isnot(None)) & (Conversation.current_topic.ilike(like)) ) + stmt = stmt.options(joinedload(Conversation.user)) res = await self._db.execute(stmt) return list(res.scalars().unique().all()) diff --git a/api/app/features/evaluation/session_catalog_service.py b/api/app/features/evaluation/session_catalog_service.py index f3d3da2..a37e153 100644 --- a/api/app/features/evaluation/session_catalog_service.py +++ b/api/app/features/evaluation/session_catalog_service.py @@ -6,6 +6,10 @@ from dataclasses import dataclass from sqlalchemy.ext.asyncio import AsyncSession +from app.features.evaluation.schemas import ( + SessionDialogueMessageOut, + SessionDialogueOut, +) from app.features.evaluation.session_catalog_repo import SessionCatalogRepo @@ -13,7 +17,9 @@ from app.features.evaluation.session_catalog_repo import SessionCatalogRepo class SessionSummary: id: str user_id: str + user_phone: str | None started_at: object | None + last_message_at: object | None conversation_stage: str | None current_topic: str | None status: str | None @@ -38,16 +44,23 @@ class SessionCatalogService: limit: int = 50, user_id: str | None = None, q: str | None = None, + status: str | None = None, ) -> tuple[list[SessionSummary], int]: - total = await self._repo.count_conversations() + total = await self._repo.count_conversations(status=status) rows = await self._repo.list_conversations( - offset=offset, limit=limit, user_id=user_id, q_text=q + offset=offset, + limit=limit, + user_id=user_id, + q_text=q, + status=status, ) out = [ SessionSummary( id=c.id, user_id=c.user_id, + user_phone=c.user.phone if c.user is not None else None, started_at=c.started_at, + last_message_at=c.last_message_at, conversation_stage=c.conversation_stage, current_topic=c.current_topic, status=c.status, @@ -56,6 +69,25 @@ class SessionCatalogService: ] return out, total + async def get_session_dialogue( + self, conversation_id: str + ) -> SessionDialogueOut | None: + c = await self._repo.get_conversation(conversation_id) + if not c or c.deleted_at: + return None + msgs = await self._repo.list_messages_for_conversation(conversation_id) + return SessionDialogueOut( + conversation_id=conversation_id, + messages=[ + SessionDialogueMessageOut( + role=m.role, + content=m.content, + created_at=m.created_at, + ) + for m in msgs + ], + ) + async def get_transcript(self, conversation_id: str) -> SessionTranscript | None: c = await self._repo.get_conversation(conversation_id) if not c or c.deleted_at: diff --git a/api/app/features/evaluation/user_export_fixtures.py b/api/app/features/evaluation/user_export_fixtures.py new file mode 100644 index 0000000..0ef39f3 --- /dev/null +++ b/api/app/features/evaluation/user_export_fixtures.py @@ -0,0 +1,36 @@ +"""只读加载 api/tests/user_exports/*.md,供内部评测台对照(非生产数据路径)。""" + +from __future__ import annotations + +import re +from pathlib import Path + +from app.features.evaluation.importers.user_export_markdown import ( + extract_dialogue_turns_from_export_md, +) + +_SAFE_MD = re.compile(r"^[a-zA-Z0-9][a-zA-Z0-9_.-]*\.md$") + + +def user_exports_dir() -> Path: + # api/app/features/evaluation/user_export_fixtures.py → api/ + return Path(__file__).resolve().parents[3] / "tests" / "user_exports" + + +def list_user_export_fixture_names() -> list[str]: + root = user_exports_dir() + if not root.is_dir(): + return [] + return sorted(p.name for p in root.glob("*.md")) + + +def read_user_export_fixture(filename: str) -> tuple[list[tuple[str, str]], str]: + if not _SAFE_MD.match(filename): + raise ValueError("invalid fixture filename") + root = user_exports_dir() + path = (root / filename).resolve() + if path.parent != root.resolve() or not path.is_file(): + raise FileNotFoundError(filename) + text = path.read_text(encoding="utf-8") + turns = extract_dialogue_turns_from_export_md(text) + return turns, text diff --git a/api/app/internal_main.py b/api/app/internal_main.py index 666822c..c0c34a4 100644 --- a/api/app/internal_main.py +++ b/api/app/internal_main.py @@ -16,6 +16,7 @@ setup_logging() from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import HTMLResponse from fastapi.staticfiles import StaticFiles from app.core.config import settings @@ -53,6 +54,27 @@ internal_app.add_middleware( register_exception_handlers(internal_app) +@internal_app.get("/", include_in_schema=False, response_class=HTMLResponse) +async def internal_eval_landing(): + """浏览器打开 :8001 根路径时提示:界面在 Vite(默认 5174),本进程仅为 API。""" + docs_hint = ( + '

OpenAPI 文档 /docs

' + if settings.internal_eval_enable_docs + else "

(未开启文档;设置 INTERNAL_EVAL_ENABLE_DOCS=1 后可访问 /docs)

" + ) + return f""" +内部评测 API + +

Life Echo · 内部回归评测 API

+

这里是 HTTP API(端口由启动命令决定),没有内置网页。 +浏览「回归评测台」请在仓库执行 ./internal-eval.shcd app-eval-web && npm run dev, +在终端里打开 Vite 给出的地址(一般为 http://127.0.0.1:5174/)。

+

健康检查:/health

+{docs_hint} +

会话与对比接口前缀:/internal/api/evaluation/

+""" + + @internal_app.on_event("startup") async def _startup(): import asyncio diff --git a/api/docs/internal-eval.md b/api/docs/internal-eval.md index c34e967..25b80b1 100644 --- a/api/docs/internal-eval.md +++ b/api/docs/internal-eval.md @@ -4,6 +4,24 @@ ## 启动 +一键脚本 `internal-eval.sh` 与 `development.sh` **不是重复各启一套主站**: + +| | `development.sh` | `internal-eval.sh` | +|---|------------------|---------------------| +| HTTP | 主站 `main:app`(默认 **8000**) | 仅评测 `internal_app`(默认 **8001**) | +| Celery | 会起一个 worker | 默认也会起一个 worker(可与下面「瘦启动」二选一) | + +评测分析只需要 **8001 上的 internal API**;若你已经在跑 `development.sh`(DB/Redis/主站/已有 Celery),不必再起第二份基础设施和 worker: + +```bash +cd api +chmod +x internal-eval.sh +# 确保 .env.development 或 .env 里有 INTERNAL_EVAL_API_KEY +SKIP_INFRA=1 SKIP_INSTALL=1 SKIP_CELERY=1 ./internal-eval.sh # 推荐:只多开 8001 +``` + +全新机器、只跑评测栈时可直接 `./internal-eval.sh`(会起 docker、`uv sync`、迁移、8001 + Celery)。**默认会起 `app-eval-web`,并用 Vite `--open` 尝试打开浏览器**(`http://127.0.0.1:5174/`)。不要前端时设 `START_EVAL_WEB=0`;只要前端但不要弹窗时设 `OPEN_EVAL_WEB=0`。 + 数据库与主服务共用;需配置环境变量后启动专用进程: ```bash diff --git a/api/internal-eval.sh b/api/internal-eval.sh new file mode 100755 index 0000000..4033d7e --- /dev/null +++ b/api/internal-eval.sh @@ -0,0 +1,374 @@ +#!/usr/bin/env bash + +# 仅启动「内部回归评测」栈(app/internal_main.py),不启动主站 consumer API。 +# +# 与 development.sh 的区别: +# - development.sh:main:app + Celery(通常 :8000),面向 App/主业务。 +# - internal-eval.sh:internal_app + Celery(:8001),仅评测/回放/GLM 打分/门禁。 +# 二者共用数据库与 Redis;不会拉起第二份 main:app。 +# +# 若本机已在跑 ./development.sh,只想多开评测 HTTP(推荐,避免第二套 worker/docker): +# SKIP_INFRA=1 SKIP_INSTALL=1 SKIP_CELERY=1 ./internal-eval.sh +# +# 用法:cd api && ./internal-eval.sh +# 可选环境变量: +# SKIP_INFRA=1 已起好 Postgres/Redis 时跳过 docker compose +# SKIP_INSTALL=1 跳过 uv sync +# SKIP_CELERY=1 仅起内部 API(别处已有 Celery worker 时) +# START_EVAL_WEB=0 不起评测前端(默认会起 app-eval-web,需已 npm install) +# OPEN_EVAL_WEB=0 起前端但不自动打开浏览器(默认 Vite --open) +# EVAL_WEB_PORT 打印提示用,默认 5174(与 app-eval-web/vite.config.ts 一致) +# INTERNAL_EVAL_PORT 默认 8001 +# CELERY_POOL 默认 solo(与 development.sh 一致) + +set -euo pipefail + +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${ROOT_DIR}/.." && pwd)" +EVAL_WEB_DIR="${REPO_ROOT}/app-eval-web" + +VENV_DIR="${ROOT_DIR}/.venv" +UVICORN_BIN="${VENV_DIR}/bin/uvicorn" +CELERY_BIN="${VENV_DIR}/bin/celery" + +INTERNAL_EVAL_HOST="${INTERNAL_EVAL_HOST:-0.0.0.0}" +INTERNAL_EVAL_PORT="${INTERNAL_EVAL_PORT:-8001}" +CELERY_POOL="${CELERY_POOL:-solo}" +SKIP_INSTALL="${SKIP_INSTALL:-0}" +SKIP_INFRA="${SKIP_INFRA:-0}" +SKIP_CELERY="${SKIP_CELERY:-0}" +START_EVAL_WEB="${START_EVAL_WEB:-1}" +OPEN_EVAL_WEB="${OPEN_EVAL_WEB:-1}" +EVAL_WEB_PORT="${EVAL_WEB_PORT:-5174}" +SHUTDOWN_TIMEOUT="${SHUTDOWN_TIMEOUT:-12}" + +API_PID="" +CELERY_PID="" +EVAL_WEB_PID="" +CLEANED_UP=0 +INFRA_STARTED=0 + +print_header() { + echo -e "\n${BLUE}========================================${NC}" + echo -e "${BLUE}$1${NC}" + echo -e "${BLUE}========================================${NC}" +} + +print_ok() { + echo -e "${GREEN}✓ $1${NC}" +} + +print_warn() { + echo -e "${YELLOW}⚠ $1${NC}" +} + +print_err() { + echo -e "${RED}✗ $1${NC}" +} + +is_pid_alive() { + local pid="$1" + [[ -n "${pid}" ]] && kill -0 "${pid}" 2>/dev/null +} + +wait_pid_exit() { + local pid="$1" + local timeout="$2" + local waited=0 + + while is_pid_alive "${pid}"; do + if (( waited >= timeout )); then + return 1 + fi + sleep 1 + waited=$((waited + 1)) + done + return 0 +} + +kill_children_term() { + local pid="$1" + local children + + children="$(pgrep -P "${pid}" 2>/dev/null || true)" + if [[ -n "${children}" ]]; then + while IFS= read -r child_pid; do + [[ -z "${child_pid}" ]] && continue + kill_children_term "${child_pid}" + kill -TERM "${child_pid}" 2>/dev/null || true + done <<< "${children}" + fi +} + +stop_process_gracefully() { + local name="$1" + local pid="$2" + local timeout="${3:-10}" + + if ! is_pid_alive "${pid}"; then + print_ok "${name} 已退出" + return 0 + fi + + print_warn "正在停止 ${name}(PID: ${pid})..." + kill_children_term "${pid}" + kill -TERM "${pid}" 2>/dev/null || true + + if wait_pid_exit "${pid}" "${timeout}"; then + print_ok "${name} 已停止" + return 0 + fi + + print_warn "${name} 在 ${timeout}s 内未退出,准备强制结束" + kill -KILL "${pid}" 2>/dev/null || true + wait_pid_exit "${pid}" 3 || true + print_ok "${name} 已强制结束" +} + +cleanup() { + if [[ "${CLEANED_UP}" == "1" ]]; then + return 0 + fi + CLEANED_UP=1 + + print_header "正在关闭内部评测环境" + + if is_pid_alive "${EVAL_WEB_PID}"; then + stop_process_gracefully "eval-web (Vite)" "${EVAL_WEB_PID}" "${SHUTDOWN_TIMEOUT}" + fi + + if is_pid_alive "${API_PID}"; then + stop_process_gracefully "Internal Eval API" "${API_PID}" "${SHUTDOWN_TIMEOUT}" + fi + + if is_pid_alive "${CELERY_PID}"; then + stop_process_gracefully "Celery" "${CELERY_PID}" "${SHUTDOWN_TIMEOUT}" + fi + + if [[ "${INFRA_STARTED}" == "1" ]]; then + print_warn "正在停止 PostgreSQL / Redis 容器..." + ( + cd "${ROOT_DIR}" && docker compose -f docker-compose.dev.yml stop + ) >/dev/null 2>&1 || true + print_ok "PostgreSQL/Redis 容器已停止" + fi +} + +require_cmd() { + local cmd="$1" + if ! command -v "${cmd}" >/dev/null 2>&1; then + print_err "未找到命令: ${cmd}" + exit 1 + fi +} + +start_infra() { + print_header "启动 PostgreSQL 和 Redis" + cd "${ROOT_DIR}" + docker compose -f docker-compose.dev.yml up -d + INFRA_STARTED=1 + print_ok "基础设施已就绪" +} + +wait_postgres_ready() { + local retries=30 + local i=0 + print_header "等待 PostgreSQL 就绪" + cd "${ROOT_DIR}" + while (( i < retries )); do + if docker compose -f docker-compose.dev.yml exec -T postgres \ + pg_isready -U postgres >/dev/null 2>&1; then + print_ok "PostgreSQL 已就绪" + return 0 + fi + sleep 1 + i=$((i + 1)) + done + print_warn "PostgreSQL 在 ${retries}s 内未就绪,迁移可能失败" + return 1 +} + +ensure_venv() { + print_header "检查 Python 虚拟环境" + + if [[ ! -d "${VENV_DIR}" ]]; then + print_warn ".venv 不存在,正在创建" + uv venv "${VENV_DIR}" + fi + + if [[ "${SKIP_INSTALL}" != "1" ]]; then + print_header "安装 Python 依赖" + uv sync + print_ok "依赖安装完成" + else + print_warn "已跳过依赖安装 (SKIP_INSTALL=1)" + fi +} + +ensure_dotenv_from_development() { + print_header "准备本地 .env" + if [[ -f "${ROOT_DIR}/.env.development" ]]; then + cp "${ROOT_DIR}/.env.development" "${ROOT_DIR}/.env" + print_ok "已从 .env.development 同步为 .env" + return 0 + fi + print_warn "未找到 .env.development,将使用现有 .env(若存在)" +} + +check_internal_eval_key() { + print_header "检查内部评测密钥" + if [[ -f "${ROOT_DIR}/.env" ]] && grep -qE '^INTERNAL_EVAL_API_KEY=.+' "${ROOT_DIR}/.env" 2>/dev/null; then + print_ok "已在 .env 中配置 INTERNAL_EVAL_API_KEY" + return 0 + fi + if [[ -n "${INTERNAL_EVAL_API_KEY:-}" ]]; then + print_ok "已从环境变量传入 INTERNAL_EVAL_API_KEY" + return 0 + fi + print_err "未配置 INTERNAL_EVAL_API_KEY:内部评测接口将返回 503。" + print_err "请在 api/.env.development(或 .env)中加入一行,例如:" + print_err " INTERNAL_EVAL_API_KEY=\"your-long-random-secret\"" + exit 1 +} + +check_env_file() { + print_header "检查环境变量文件" + if [[ ! -f "${ROOT_DIR}/.env" ]]; then + print_warn "未找到 .env,应用可能因缺少配置启动失败" + else + print_ok "检测到 .env" + fi +} + +run_migrations() { + print_header "执行数据库迁移" + cd "${ROOT_DIR}" + if uv run alembic upgrade head 2>/dev/null; then + print_ok "Alembic 迁移已就绪" + else + print_warn "Alembic 迁移失败(可能数据库未启动或 DATABASE_URL 未配置),应用启动可能失败" + fi +} + +start_eval_web() { + print_header "启动 app-eval-web (Vite)" + if [[ ! -d "${EVAL_WEB_DIR}" ]]; then + print_err "未找到 ${EVAL_WEB_DIR}" + exit 1 + fi + if [[ ! -d "${EVAL_WEB_DIR}/node_modules" ]]; then + print_err "请先执行: cd app-eval-web && npm install" + exit 1 + fi + require_cmd "npm" + + local api_key="${INTERNAL_EVAL_API_KEY:-}" + if [[ -z "${api_key}" ]] && [[ -f "${ROOT_DIR}/.env" ]]; then + api_key="$(grep -E '^INTERNAL_EVAL_API_KEY=' "${ROOT_DIR}/.env" | head -1 | cut -d= -f2- | tr -d '\r' | sed 's/^"//;s/"$//')" + fi + if [[ -z "${api_key}" ]]; then + print_err "无法解析 INTERNAL_EVAL_API_KEY,无法为 Vite 注入 VITE_EVAL_API_KEY" + exit 1 + fi + + local vite_extra=() + if [[ "${OPEN_EVAL_WEB}" == "1" ]]; then + vite_extra+=(--open) + fi + + # 不设 VITE_EVAL_API_BASE:前端走 Vite proxy(app-eval-web/vite.config.ts)转发到 :${INTERNAL_EVAL_PORT},减少直连/CORS/误指主站问题。 + # 若需直连远端 API:export VITE_EVAL_API_BASE=https://... 后再手动 npm run dev。 + ( + cd "${EVAL_WEB_DIR}" + VITE_EVAL_API_KEY="${api_key}" \ + npm run dev -- --host 127.0.0.1 --port "${EVAL_WEB_PORT}" "${vite_extra[@]}" + ) & + EVAL_WEB_PID=$! + print_ok "eval-web 已启动 (PID: ${EVAL_WEB_PID}) → http://127.0.0.1:${EVAL_WEB_PORT}/" +} + +start_services() { + print_header "启动 Internal Eval API 与 Celery" + cd "${ROOT_DIR}" + + if command -v lsof >/dev/null 2>&1; then + if lsof -nP -iTCP:"${INTERNAL_EVAL_PORT}" -sTCP:LISTEN >/dev/null 2>&1; then + print_err "端口 ${INTERNAL_EVAL_PORT} 已被占用,无法启动内部评测 Uvicorn。" + print_err "请先结束占用进程,或设置 INTERNAL_EVAL_PORT 为其他端口" + exit 1 + fi + fi + + # 与主开发脚本一致:评审/生产 LLM 等从 .env 读取;文档默认关闭,本地可 export INTERNAL_EVAL_ENABLE_DOCS=1 + "${UVICORN_BIN}" app.internal_main:internal_app --reload \ + --reload-exclude 'alembic/**' \ + --reload-exclude 'alembic.ini' \ + --host "${INTERNAL_EVAL_HOST}" --port "${INTERNAL_EVAL_PORT}" & + API_PID=$! + print_ok "Internal Eval API 已启动 (PID: ${API_PID})" + + if [[ "${SKIP_CELERY}" != "1" ]]; then + "${CELERY_BIN}" -A app.tasks.celery_app worker --loglevel=info --pool="${CELERY_POOL}" & + CELERY_PID=$! + print_ok "Celery 已启动 (PID: ${CELERY_PID})" + else + print_warn "已跳过 Celery (SKIP_CELERY=1);实验 run 接口需要 worker 才能执行" + fi + + if [[ "${START_EVAL_WEB}" == "1" ]]; then + start_eval_web + fi + + echo + echo -e "${GREEN}内部评测环境启动完成${NC}" + echo "【请用浏览器打开】评测 Web UI: http://127.0.0.1:${EVAL_WEB_PORT}/ (/internal 会代理到 API :${INTERNAL_EVAL_PORT})" + echo "内部评测 API: http://127.0.0.1:${INTERNAL_EVAL_PORT}/health" + echo "评测 REST 前缀: http://127.0.0.1:${INTERNAL_EVAL_PORT}/internal/api/evaluation" + if [[ "${INTERNAL_EVAL_ENABLE_DOCS:-}" == "1" ]] || grep -qE '^INTERNAL_EVAL_ENABLE_DOCS=true' "${ROOT_DIR}/.env" 2>/dev/null; then + echo "API 文档: http://127.0.0.1:${INTERNAL_EVAL_PORT}/docs" + fi + echo "说明文档: api/docs/internal-eval.md" + echo "按 Ctrl+C 停止所有进程" +} + +main() { + print_header "Life Echo 内部回归评测 — 一键启动" + echo -e "${BLUE}说明:${NC} 不启动主站 API(main:app / 默认 8000);仅启动 internal_main(:${INTERNAL_EVAL_PORT})。" + echo "" + + require_cmd "uv" + + trap cleanup EXIT INT TERM + + if [[ "${SKIP_INFRA}" != "1" ]]; then + require_cmd "docker" + start_infra + wait_postgres_ready || true + else + print_warn "已跳过 docker 基础设施 (SKIP_INFRA=1)" + fi + + ensure_venv + ensure_dotenv_from_development + check_env_file + check_internal_eval_key + run_migrations + start_services + + local wait_pids=("${API_PID}") + if [[ "${SKIP_CELERY}" != "1" ]]; then + wait_pids+=("${CELERY_PID}") + fi + if [[ "${START_EVAL_WEB}" == "1" ]] && [[ -n "${EVAL_WEB_PID}" ]]; then + wait_pids+=("${EVAL_WEB_PID}") + fi + wait "${wait_pids[@]}" +} + +main "$@" diff --git a/api/tests/evaluation/test_importers.py b/api/tests/evaluation/test_importers.py index 2c0bb13..329f4ee 100644 --- a/api/tests/evaluation/test_importers.py +++ b/api/tests/evaluation/test_importers.py @@ -1,5 +1,10 @@ from app.features.evaluation.importers.script_json import parse_script_json +from pathlib import Path + +import pytest + from app.features.evaluation.importers.user_export_markdown import ( + extract_dialogue_turns_from_export_md, extract_user_utterances_from_export_md, ) @@ -27,3 +32,43 @@ hello hi """ assert extract_user_utterances_from_export_md(md) == ["hello"] + + +def test_extract_dialogue_turns_from_export_md() -> None: + md = """ +#### 轮次 1 — x + +**用户:** + +u1 + +**AI:** + +a1 + +#### 轮次 2 — y + +**用户:** + +u2 + +**AI:** + +a2 +""" + turns = extract_dialogue_turns_from_export_md(md) + assert turns == [("u1", "a1"), ("u2", "a2")] + + +def test_extract_dialogue_turns_from_repo_user_export() -> None: + p = ( + Path(__file__).resolve().parents[1] + / "user_exports" + / "13701020203_e27fcd97-fefa-43b8-a7a3-3ecd49ebf5f0.md" + ) + if not p.is_file(): + pytest.skip("user export fixture not present") + text = p.read_text(encoding="utf-8") + turns = extract_dialogue_turns_from_export_md(text) + assert len(turns) >= 5 + assert "你好" in turns[0][0] diff --git a/app-eval-web/src/App.tsx b/app-eval-web/src/App.tsx index 8c906e5..ee4ebbd 100644 --- a/app-eval-web/src/App.tsx +++ b/app-eval-web/src/App.tsx @@ -1,14 +1,41 @@ import { useCallback, useEffect, useState } from "react"; +const envApiBase = ( + import.meta.env.VITE_EVAL_API_BASE as string | undefined +)?.trim() ?? ""; +/** + * 开发 + 未设 VITE_EVAL_API_BASE:用相对路径走 Vite proxy → :8001(见 vite.config.ts)。 + * 生产构建未配 env 时仍回退直连 8001。 + */ const apiBase = - import.meta.env.VITE_EVAL_API_BASE ?? "http://127.0.0.1:8001"; -const apiKey = import.meta.env.VITE_EVAL_API_KEY ?? ""; + envApiBase || (import.meta.env.DEV ? "" : "http://127.0.0.1:8001"); +const apiKey = + (import.meta.env.VITE_EVAL_API_KEY as string | undefined)?.trim() ?? ""; + +const apiBaseHint = + apiBase === "" + ? "(开发)请求经 Vite 代理到 http://127.0.0.1:8001" + : `直连 ${apiBase}`; + +/** 首页会话列表轮询 */ +const SESSION_LIST_POLL_MS = 4000; +/** 对比页左侧线上对话轮询 */ +const DIALOGUE_POLL_MS = 3500; +/** 对比页右侧 GLM / 评测 run 轮询 */ +const SESSION_EVAL_POLL_MS = 8000; +/** 高级页回归集 / 实验列表轮询 */ +const ADMIN_POLL_MS = 8000; + +/** 默认对照用导出快照(api/tests/user_exports/) */ +const DEFAULT_USER_EXPORT_FIXTURE = + "13701020203_e27fcd97-fefa-43b8-a7a3-3ecd49ebf5f0.md"; async function api( path: string, init?: RequestInit, ): Promise<{ ok: boolean; data?: T; error?: string; status: number }> { - const r = await fetch(`${apiBase}${path}`, { + const url = `${apiBase}${path.startsWith("/") ? path : `/${path}`}`; + const r = await fetch(url, { ...init, headers: { "X-Internal-Eval-Key": apiKey, @@ -27,9 +54,13 @@ async function api( return { ok: false, status: r.status, - error: typeof data === "object" && data && "detail" in (data as object) - ? String((data as { detail: unknown }).detail) - : text || r.statusText, + error: + typeof data === "object" && + data && + "detail" in (data as object) && + data !== null + ? String((data as unknown as { detail: unknown }).detail) + : text || r.statusText, }; } return { ok: true, data, status: r.status }; @@ -38,103 +69,351 @@ async function api( type SessionItem = { id: string; user_id: string; + user_phone: string | null; started_at: string | null; + last_message_at: string | null; conversation_stage: string | null; current_topic: string | null; status: string | null; }; +type DialogueMessage = { + role: string; + content: string; + created_at?: string | null; +}; + +type RunTurnOut = { + id: string; + turn_index: number; + user_utterance: string; + assistant_reply: string | null; + duration_ms: number | null; + judge_scores_json: Record | null; + judge_rationale: string | null; +}; + +type EvalRunOut = { + id: string; + experiment_id: string; + case_id: string; + side: string; + status: string; + error_message: string | null; + memoir_markdown: string | null; + conversation_score_total: number | null; + memoir_score_total: number | null; + composite_score: number | null; + judge_bundle_json: Record | null; + turns: RunTurnOut[]; +}; + +type SessionEvalRunItem = { + experiment_name: string; + run: EvalRunOut; +}; + +function fmtScore(n: unknown): string { + if (typeof n === "number" && !Number.isNaN(n)) return n.toFixed(1); + return "—"; +} + +function JsonPreview({ value }: { value: unknown }) { + if (value == null) return ; + return ( +
+      {JSON.stringify(value, null, 2)}
+    
+ ); +} + +const shell: React.CSSProperties = { + minHeight: "100vh", + background: "#0f1419", + color: "#e6edf3", + fontFamily: + 'ui-sans-serif, system-ui, -apple-system, "Segoe UI", Roboto, sans-serif', +}; + +const btn: React.CSSProperties = { + padding: "8px 14px", + borderRadius: 8, + border: "1px solid #30363d", + background: "#21262d", + color: "#e6edf3", + cursor: "pointer", + fontSize: 14, +}; + +const btnPrimary: React.CSSProperties = { + ...btn, + background: "#238636", + borderColor: "#238636", +}; + +function formatTime(iso: string | null | undefined) { + if (!iso) return "—"; + try { + const d = new Date(iso); + return d.toLocaleString(); + } catch { + return iso; + } +} + export default function App() { - const [tab, setTab] = useState< - "sessions" | "sets" | "versions" | "experiments" - >("sessions"); + const [view, setView] = useState<"home" | "session" | "admin">("home"); const [msg, setMsg] = useState(""); const [sessions, setSessions] = useState([]); - const [sets, setSets] = useState<{ id: string; name: string }[]>([]); + const [selectedId, setSelectedId] = useState(null); + + const [dialogue, setDialogue] = useState([]); + const [fallbackUserLines, setFallbackUserLines] = useState([]); + const [loadingLeft, setLoadingLeft] = useState(false); + const [versions, setVersions] = useState<{ id: string; name: string }[]>([]); + + const [sessionEvalItems, setSessionEvalItems] = useState( + [], + ); + const [sessionEvalUpdatedAt, setSessionEvalUpdatedAt] = useState( + null, + ); + + const [adminTab, setAdminTab] = useState< + "sets" | "versions" | "experiments" + >("experiments"); + const [sets, setSets] = useState<{ id: string; name: string }[]>([]); const [experiments, setExperiments] = useState< { id: string; name: string; status: string }[] >([]); const [selSet, setSelSet] = useState(""); const [newSetName, setNewSetName] = useState("默认回归集"); - const [newVerName, setNewVerName] = useState("baseline"); + const [newVerName, setNewVerName] = useState("candidate-v1"); const [verConfig, setVerConfig] = useState("{}"); const [selExp, setSelExp] = useState(null); const [expDetail, setExpDetail] = useState(null); - const [streamLog, setStreamLog] = useState([]); + const [enqueueingExpId, setEnqueueingExpId] = useState(null); + const [evalReachable, setEvalReachable] = useState< + "unknown" | "ok" | "bad" + >("unknown"); + const [sessionsUpdatedAt, setSessionsUpdatedAt] = useState(null); + const [dialogueUpdatedAt, setDialogueUpdatedAt] = useState(null); - const refreshSessions = useCallback(async () => { - const r = await api<{ items: SessionItem[]; total: number }>( - "/internal/api/evaluation/sessions?limit=30", - ); - if (r.ok && r.data) setSessions(r.data.items); - else setMsg(r.error ?? "sessions failed"); - }, []); + const [fixtureFiles, setFixtureFiles] = useState([]); + const [fixtureName, setFixtureName] = useState(""); + const [fixtureTurns, setFixtureTurns] = useState< + { user: string; ai: string }[] + >([]); - const refreshSets = useCallback(async () => { - const r = await api<{ id: string; name: string }[]>( - "/internal/api/evaluation/regression-sets", - ); + /** 近期全部:含已结束会话;仅进行中:status=active(多数字段在用户挂断后为 ended,列表会空) */ + const [sessionFilter, setSessionFilter] = useState<"recent" | "active">( + "recent", + ); + + const refreshSessionList = useCallback(async () => { + const path = + sessionFilter === "active" + ? "/internal/api/evaluation/sessions?status=active&limit=80" + : "/internal/api/evaluation/sessions?limit=80"; + const r = await api<{ items: SessionItem[]; total: number }>(path); if (r.ok && r.data) { - setSets(r.data); - if (!selSet && r.data[0]) setSelSet(r.data[0].id); - } else setMsg(r.error ?? "sets failed"); - }, [selSet]); + setSessions(r.data.items); + setSessionsUpdatedAt(new Date()); + setMsg(""); + } else { + const hint = + r.status === 404 + ? `找不到接口 (404)。请在终端执行: curl -s http://127.0.0.1:8001/internal/api/evaluation/ping (应返回 {"ok":true,...})。若此处也 404,说明 8001 上不是 internal_main。${apiBaseHint};也可删掉 VITE_EVAL_API_BASE 仅用代理。` + : (r.error ?? "加载会话失败"); + setMsg(hint); + } + }, [sessionFilter]); const refreshVersions = useCallback(async () => { const r = await api<{ id: string; name: string }[]>( "/internal/api/evaluation/versions", ); if (r.ok && r.data) setVersions(r.data); - else setMsg(r.error ?? "versions failed"); }, []); - const refreshExperiments = useCallback(async () => { - const r = await api<{ id: string; name: string; status: string }[]>( + const pullSessionEvalRuns = useCallback(async (conversationId: string) => { + const r = await api<{ items: SessionEvalRunItem[] }>( + `/internal/api/evaluation/sessions/${conversationId}/evaluation-runs`, + ); + if (r.ok && r.data?.items) setSessionEvalItems(r.data.items); + else setSessionEvalItems([]); + setSessionEvalUpdatedAt(new Date()); + }, []); + + const pullDialogue = useCallback(async (conversationId: string) => { + const d = await api<{ messages: DialogueMessage[] }>( + `/internal/api/evaluation/sessions/${conversationId}/dialogue`, + ); + if (d.ok && d.data?.messages?.length) { + setDialogue(d.data.messages); + setFallbackUserLines([]); + } else { + const t = await api<{ + user_utterances_from_messages: string[]; + user_utterances_from_segments: string[]; + }>(`/internal/api/evaluation/sessions/${conversationId}/transcript`); + if (t.ok && t.data) { + const lines = + t.data.user_utterances_from_messages.length > 0 + ? t.data.user_utterances_from_messages + : t.data.user_utterances_from_segments; + setDialogue([]); + setFallbackUserLines(lines); + } + } + setDialogueUpdatedAt(new Date()); + }, []); + + const loadSessionPageInitial = useCallback( + (conversationId: string) => { + setLoadingLeft(true); + setDialogue([]); + setFallbackUserLines([]); + setSessionEvalItems([]); + setSessionEvalUpdatedAt(null); + void pullDialogue(conversationId).finally(() => setLoadingLeft(false)); + }, + [pullDialogue], + ); + + const refreshAdminData = useCallback(async () => { + const rs = await api<{ id: string; name: string }[]>( + "/internal/api/evaluation/regression-sets", + ); + if (rs.ok && rs.data) { + const rows = rs.data; + setSets(rows); + setSelSet((cur) => { + if (cur) return cur; + return rows[0]?.id ?? ""; + }); + } + const ex = await api<{ id: string; name: string; status: string }[]>( "/internal/api/evaluation/experiments", ); - if (r.ok && r.data) setExperiments(r.data); - else setMsg(r.error ?? "experiments failed"); + if (ex.ok && ex.data) setExperiments(ex.data); + const vr = await api<{ id: string; name: string }[]>( + "/internal/api/evaluation/versions", + ); + if (vr.ok && vr.data) setVersions(vr.data); }, []); useEffect(() => { - void refreshSets(); - void refreshVersions(); - void refreshExperiments(); - }, [refreshSets, refreshVersions, refreshExperiments]); + void (async () => { + try { + const url = `${apiBase}/internal/api/evaluation/ping`; + const r = await fetch(url); + const j = (await r.json()) as { ok?: boolean; service?: string }; + setEvalReachable(r.ok && j.ok === true ? "ok" : "bad"); + } catch { + setEvalReachable("bad"); + } + })(); + void refreshSessionList(); + const t = setInterval(() => void refreshSessionList(), SESSION_LIST_POLL_MS); + return () => clearInterval(t); + }, [refreshSessionList]); useEffect(() => { - if (tab === "sessions") void refreshSessions(); - }, [tab, refreshSessions]); + void refreshVersions(); + }, [refreshVersions]); - async function createSet() { - const r = await api<{ id: string }>("/internal/api/evaluation/regression-sets", { - method: "POST", - body: JSON.stringify({ name: newSetName, description: "" }), - }); - if (r.ok) { - setMsg(`创建回归集 OK`); - await refreshSets(); - } else setMsg(r.error ?? "fail"); - } + useEffect(() => { + if (view !== "session" || !selectedId) return; + const t = setInterval(() => { + void pullDialogue(selectedId); + }, DIALOGUE_POLL_MS); + return () => clearInterval(t); + }, [view, selectedId, pullDialogue]); - async function snapshotSession(cid: string) { - if (!selSet) { - setMsg("先选择回归集"); + useEffect(() => { + if (view !== "session" || !selectedId) return; + void pullSessionEvalRuns(selectedId); + const t = setInterval(() => { + void pullSessionEvalRuns(selectedId); + }, SESSION_EVAL_POLL_MS); + return () => clearInterval(t); + }, [view, selectedId, pullSessionEvalRuns]); + + useEffect(() => { + if (view !== "session" || !selectedId) return; + void (async () => { + const r = await api<{ items: string[] }>( + "/internal/api/evaluation/fixtures/user-exports", + ); + if (!r.ok || !r.data?.items?.length) { + setFixtureFiles([]); + return; + } + const items = r.data.items; + setFixtureFiles(items); + setFixtureName((cur) => { + if (cur && items.includes(cur)) return cur; + if (items.includes(DEFAULT_USER_EXPORT_FIXTURE)) + return DEFAULT_USER_EXPORT_FIXTURE; + return items[0] ?? ""; + }); + })(); + }, [view, selectedId]); + + useEffect(() => { + if (view !== "session" || !fixtureName) { + setFixtureTurns([]); return; } - const r = await api( - `/internal/api/evaluation/regression-sets/${selSet}/snapshot-from-conversation/${cid}`, - { - method: "POST", - body: JSON.stringify({ - title: "", - use_messages: false, - is_protected: false, - }), - }, + void (async () => { + const r = await api<{ turns: { user: string; ai: string }[] }>( + `/internal/api/evaluation/fixtures/user-exports/${encodeURIComponent(fixtureName)}`, + ); + if (r.ok && r.data?.turns) setFixtureTurns(r.data.turns); + else setFixtureTurns([]); + })(); + }, [view, fixtureName]); + + useEffect(() => { + if (view !== "admin") return; + void refreshAdminData(); + const t = setInterval(() => void refreshAdminData(), ADMIN_POLL_MS); + return () => clearInterval(t); + }, [view, refreshAdminData]); + + function openSession(id: string) { + setSelectedId(id); + setView("session"); + loadSessionPageInitial(id); + } + + async function createSet() { + const r = await api<{ id: string }>( + "/internal/api/evaluation/regression-sets", + { method: "POST", body: JSON.stringify({ name: newSetName, description: "" }) }, ); - setMsg(r.ok ? `已快照 ${cid}` : r.error ?? "fail"); + setMsg(r.ok ? "回归集已创建" : r.error ?? "失败"); + if (r.ok) { + const rs = await api<{ id: string; name: string }[]>( + "/internal/api/evaluation/regression-sets", + ); + if (rs.ok && rs.data) setSets(rs.data); + } } async function createVersion() { @@ -142,7 +421,7 @@ export default function App() { try { cfg = JSON.parse(verConfig || "{}") as Record; } catch { - setMsg("config_json 不是合法 JSON"); + setMsg("config_json 无效"); return; } const r = await api<{ id: string }>("/internal/api/evaluation/versions", { @@ -153,251 +432,916 @@ export default function App() { config_json: cfg, }), }); - if (r.ok) { - setMsg("创建版本 OK"); - await refreshVersions(); - } else setMsg(r.error ?? "fail"); + setMsg(r.ok ? "版本已创建" : r.error ?? "失败"); + if (r.ok) void refreshVersions(); + } + + async function snapshotFromDetail() { + if (!selectedId || !selSet) { + setMsg("在「高级配置」中选回归集 ID"); + return; + } + const r = await api( + `/internal/api/evaluation/regression-sets/${selSet}/snapshot-from-conversation/${selectedId}`, + { + method: "POST", + body: JSON.stringify({ + title: "", + use_messages: true, + is_protected: false, + }), + }, + ); + setMsg(r.ok ? "已快照到回归集" : r.error ?? "失败"); } async function loadExp(eid: string) { setSelExp(eid); - const r = await api(`/internal/api/evaluation/experiments/${eid}`); + const r = await api( + `/internal/api/evaluation/experiments/${eid}`, + ); if (r.ok) setExpDetail(r.data); else setMsg(r.error ?? "fail"); } - async function createExperiment() { - if (versions.length < 2 || sets.length < 1) { - setMsg("至少需要 1 个回归集和 2 个版本(基线/候选)"); - return; + async function enqueueExperimentRun(eid: string) { + setEnqueueingExpId(eid); + try { + const r = await api<{ status?: string }>( + `/internal/api/evaluation/experiments/${eid}/run`, + { method: "POST" }, + ); + setMsg( + r.ok + ? "已提交 Celery 执行:回放 + GLM 评审写入各 run(需 worker 与 LLM 就绪)" + : (r.error ?? "提交失败"), + ); + if (r.ok) void refreshAdminData(); + } finally { + setEnqueueingExpId(null); } - const baselineId = versions[0]!.id; - const candidateId = versions[1]!.id; - const r = await api<{ id: string }>("/internal/api/evaluation/experiments", { - method: "POST", - body: JSON.stringify({ - name: `exp-${Date.now()}`, - regression_set_id: selSet || sets[0]!.id, - baseline_version_id: baselineId, - candidate_version_id: candidateId, - composite_weights_json: { conversation: 0.5, memoir: 0.5 }, - }), - }); - if (r.ok) { - setMsg(`实验已创建 ${(r.data as { id: string }).id}`); - await refreshExperiments(); - } else setMsg(r.error ?? "fail"); - } - - async function runExp(eid: string) { - const r = await api( - `/internal/api/evaluation/experiments/${eid}/run`, - { method: "POST", body: "{}" }, - ); - setMsg(r.ok ? `已入队 Celery: ${eid}` : r.error ?? "fail"); - } - - function subscribeStream(eid: string) { - setStreamLog([]); - if (!apiKey) { - setMsg("请配置 VITE_EVAL_API_KEY 以使用 SSE(query key)"); - return; - } - const url = `${apiBase}/internal/api/evaluation/experiments/${eid}/stream?key=${encodeURIComponent(apiKey)}`; - const es = new EventSource(url); - es.onmessage = (ev) => { - setStreamLog((prev) => [...prev.slice(-20), ev.data]); - try { - const p = JSON.parse(ev.data) as { status?: string }; - if (p.status === "completed" || p.status === "failed") { - es.close(); - void loadExp(eid); - } - } catch { - /* ignore */ - } - }; - es.onerror = () => { - setMsg("SSE 连接错误(检查内网 API / Key)"); - es.close(); - }; } return ( -
-

内部回归评测台

-

- API: {apiBase} · 配置{" "} - VITE_EVAL_API_BASE / VITE_EVAL_API_KEY -

- {msg ?

{msg}

: null} +
+
+ 回归评测台 + + {apiBaseHint} + {evalReachable === "ok" ? ( + · /ping OK + ) : evalReachable === "bad" ? ( + + · 连不上 internal /ping,请起{" "} + uvicorn app.internal_main:internal_app --port 8001 或{" "} + internal-eval.sh + + ) : null} + + · 网页是 5174;8001 仅为 API + + + + + + +
- - - {tab === "sessions" ? ( -
- - - - - - - - - - - {sessions.map((s) => ( - - - - - - - ))} - -
会话 ID用户阶段 -
- {s.id.slice(0, 8)}… - {s.user_id.slice(0, 8)}…{s.conversation_stage ?? "—"} - -
-

- 回归集 ID: - -

-
- ) : null} - - {tab === "sets" ? ( -
- setNewSetName(e.target.value)} - placeholder="回归集名称" - /> - -
    - {sets.map((s) => ( -
  • - {s.id} — {s.name}{" "} - -
  • - ))} -
-
- ) : null} - - {tab === "versions" ? ( -
- setNewVerName(e.target.value)} - placeholder="版本名(如 baseline / candidate)" - /> -