diff --git a/api/app/features/evaluation/admin_service.py b/api/app/features/evaluation/admin_service.py
index a269de7..a58130c 100644
--- a/api/app/features/evaluation/admin_service.py
+++ b/api/app/features/evaluation/admin_service.py
@@ -26,16 +26,25 @@ from app.features.evaluation.models import (
EvalRunTurn,
EvalVersion,
)
+from app.features.evaluation.presenters import run_out
from app.features.evaluation.schemas import (
CaseCreate,
ExperimentCreate,
ImportJsonCaseBody,
ImportMarkdownBody,
RegressionSetCreate,
+ SessionEvalRunItem,
+ SessionEvalRunsOut,
SnapshotFromConversationBody,
VersionCreate,
)
from app.features.evaluation.session_catalog_service import SessionCatalogService
+from app.features.evaluation.user_export_fixtures import (
+ list_user_export_fixture_names as list_user_export_md_filenames,
+)
+from app.features.evaluation.user_export_fixtures import (
+ read_user_export_fixture,
+)
from app.tasks.evaluation_tasks import run_eval_experiment_task
@@ -188,6 +197,23 @@ class EvaluationAdminService:
async def list_experiments(self, *, limit: int) -> list[EvalExperiment]:
return await eval_repo.list_experiments(self._db, limit=limit)
+    async def list_session_evaluation_runs(
+        self, conversation_id: str
+    ) -> SessionEvalRunsOut:
+        """Collect the evaluation runs whose case has this source conversation.
+
+        Args:
+            conversation_id: Passed to the repository as
+                ``source_conversation_id``.
+
+        Returns:
+            A ``SessionEvalRunsOut`` echoing ``conversation_id`` with one item
+            per returned row: the experiment name plus the presented run and
+            its turns.
+        """
+        matches = await eval_repo.list_runs_for_source_conversation(
+            self._db, source_conversation_id=conversation_id
+        )
+        # Turns are loaded one run at a time, in the order the rows arrive.
+        items = [
+            SessionEvalRunItem(
+                experiment_name=experiment.name,
+                run=run_out(
+                    eval_run, await eval_repo.list_turns(self._db, eval_run.id)
+                ),
+            )
+            for eval_run, _case, experiment in matches
+        ]
+        return SessionEvalRunsOut(conversation_id=conversation_id, items=items)
+
async def create_experiment(self, body: ExperimentCreate) -> EvalExperiment:
rs = await eval_repo.get_regression_set(self._db, body.regression_set_id)
if not rs:
@@ -232,7 +258,6 @@ class EvaluationAdminService:
async def experiment_stream_snapshot(
self, experiment_id: str
) -> dict[str, Any] | None:
- from app.features.evaluation.presenters import run_out
from app.features.evaluation.schemas import GateVerdictOut
exp = await eval_repo.get_experiment(self._db, experiment_id)
@@ -250,3 +275,10 @@ class EvaluationAdminService:
"runs": run_payload,
"gate": GateVerdictOut.model_validate(gv).model_dump() if gv else None,
}
+
+    def list_user_export_fixture_names(self) -> list[str]:
+        """Return the fixture file names reported by ``list_user_export_md_filenames``."""
+        return list_user_export_md_filenames()
+
+    def load_user_export_fixture_turns(self, filename: str) -> list[tuple[str, str]]:
+        """Return only the parsed turns of a fixture, discarding its raw text.
+
+        Exceptions raised by ``read_user_export_fixture`` propagate unchanged.
+        """
+        turns, _ = read_user_export_fixture(filename)
+        return turns
diff --git a/api/app/features/evaluation/execution_service.py b/api/app/features/evaluation/execution_service.py
index 39943ce..204012b 100644
--- a/api/app/features/evaluation/execution_service.py
+++ b/api/app/features/evaluation/execution_service.py
@@ -23,6 +23,17 @@ from app.features.evaluation.models import EvalCase, EvalRun, EvalVersion
logger = get_logger(__name__)
+# Default cap on the characters of one markdown body handed to the judge.
+_MAX_JUDGE_MARKDOWN_CHARS = 20_000
+# Caps on how many chapters / stories are judged per run.
+_MAX_EVAL_CHAPTERS = 30
+_MAX_EVAL_STORIES = 40
+
+
+def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) -> str:
+    """Strip ``text`` and cap it at ``max_chars`` characters for the judge.
+
+    Falsy input is treated as an empty string.  When the stripped text is
+    longer than ``max_chars`` it is cut and a truncation notice is appended,
+    so the returned string can exceed ``max_chars`` by the notice's length.
+    """
+    trimmed = text.strip() if text else ""
+    overflow = len(trimmed) > max_chars
+    return f"{trimmed[:max_chars]}\n\n…(已截断供评审)" if overflow else trimmed
+
def _composite(
conv: float | None, mem: float | None, weights: dict[str, Any] | None
@@ -149,7 +160,66 @@ async def execute_eval_run(
memoir_md = simple_memoir_from_transcript(utterances, replies)
mem_out = await judge.judge_memoir(memoir_markdown=memoir_md)
- mem_total = mem_out.total_score if mem_out else None
+
+ chapter_entries: list[dict[str, Any]] = []
+ story_entries: list[dict[str, Any]] = []
+ uid = (case.source_user_id or "").strip()
+ if uid:
+ from app.features.memoir.repo import get_chapters_for_memoir_list
+ from app.features.story.repo import get_stories_for_user
+
+ try:
+ chapters = await get_chapters_for_memoir_list(
+ uid, db, active_only=True, is_new_only=None
+ )
+ for ch in chapters[:_MAX_EVAL_CHAPTERS]:
+ body = (ch.canonical_markdown or "").strip()
+ if not body:
+ continue
+ md = f"# 章节:{ch.title}\n\n{_clip_md_for_judge(body)}"
+ cj = await judge.judge_memoir(memoir_markdown=md)
+ chapter_entries.append(
+ {
+ "id": ch.id,
+ "title": ch.title,
+ "order_index": ch.order_index,
+ "judge": cj.model_dump() if cj else None,
+ }
+ )
+ except Exception as e:
+ logger.warning("eval chapter judges skipped: {}", e)
+
+ try:
+ stories = await get_stories_for_user(db, uid, status="active")
+ for st in stories[:_MAX_EVAL_STORIES]:
+ body = (st.canonical_markdown or "").strip()
+ if not body:
+ continue
+ md = f"# 故事:{st.title}\n\n{_clip_md_for_judge(body)}"
+ sj = await judge.judge_memoir(memoir_markdown=md)
+ story_entries.append(
+ {
+ "id": st.id,
+ "title": st.title,
+ "stage": st.stage,
+ "judge": sj.model_dump() if sj else None,
+ }
+ )
+ except Exception as e:
+ logger.warning("eval story judges skipped: {}", e)
+
+ mem_parts: list[float] = []
+ if mem_out is not None:
+ mem_parts.append(float(mem_out.total_score))
+ for row in chapter_entries:
+ j = row.get("judge")
+ if isinstance(j, dict) and j.get("total_score") is not None:
+ mem_parts.append(float(j["total_score"]))
+ for row in story_entries:
+ j = row.get("judge")
+ if isinstance(j, dict) and j.get("total_score") is not None:
+ mem_parts.append(float(j["total_score"]))
+ mem_total = sum(mem_parts) / len(mem_parts) if mem_parts else None
exp = await eval_repo.get_experiment(db, run.experiment_id)
weights = exp.composite_weights_json if exp else None
@@ -158,6 +228,8 @@ async def execute_eval_run(
bundle: dict[str, Any] = {
"conversation_judge": conv_out.model_dump() if conv_out else None,
"memoir_judge": mem_out.model_dump() if mem_out else None,
+ "chapters": chapter_entries,
+ "stories": story_entries,
}
await eval_repo.update_run(
db,
diff --git a/api/app/features/evaluation/importers/user_export_markdown.py b/api/app/features/evaluation/importers/user_export_markdown.py
index cc03580..92722cb 100644
--- a/api/app/features/evaluation/importers/user_export_markdown.py
+++ b/api/app/features/evaluation/importers/user_export_markdown.py
@@ -17,3 +17,30 @@ def extract_user_utterances_from_export_md(text: str) -> list[str]:
if chunk and chunk != "(空)":
out.append(chunk)
return out
+
+
+def extract_dialogue_turns_from_export_md(text: str) -> list[tuple[str, str]]:
+    """Extract ``(user, ai)`` pairs, one per "轮次" section, from an export.
+
+    The input is the Markdown produced by ``extract_sql_to_user_md``; the
+    pairs are used by the evaluation console for side-by-side comparison.
+
+    Args:
+        text: Full Markdown text of the export.
+
+    Returns:
+        Pairs in document order.  A section is skipped when it has no user
+        block followed by an AI marker, or when the user text is empty or the
+        placeholder "(空)".  The AI text is "" when no AI body is found.
+    """
+    # Anchor on line starts (MULTILINE) instead of a preceding "\n", so a
+    # heading on the very first line of ``text`` still opens a section.
+    chunks = re.split(r"^####\s*轮次\s*\d+[^\n]*", text, flags=re.MULTILINE)
+    out: list[tuple[str, str]] = []
+    # chunks[0] is whatever precedes the first heading; it holds no turn.
+    for chunk in chunks[1:]:
+        user_m = re.search(
+            r"\*\*用户:\*\*\s*\n+(.+?)(?=\n\*\*AI:\*\*)",
+            chunk,
+            flags=re.DOTALL | re.IGNORECASE,
+        )
+        ai_m = re.search(
+            r"\*\*AI:\*\*\s*\n+(.+?)(?=\n####\s|\n###\s+[^#]|\Z)",
+            chunk,
+            flags=re.DOTALL | re.IGNORECASE,
+        )
+        if not user_m:
+            continue
+        u = (user_m.group(1) or "").strip()
+        if not u or u == "(空)":
+            continue
+        a = ((ai_m.group(1) if ai_m else "") or "").strip()
+        out.append((u, a))
+    return out
diff --git a/api/app/features/evaluation/presenters.py b/api/app/features/evaluation/presenters.py
index 7223cea..7a655b4 100644
--- a/api/app/features/evaluation/presenters.py
+++ b/api/app/features/evaluation/presenters.py
@@ -21,5 +21,6 @@ def run_out(row, turns: list) -> EvalRunOut:
conversation_score_total=row.conversation_score_total,
memoir_score_total=row.memoir_score_total,
composite_score=row.composite_score,
+ judge_bundle_json=row.judge_bundle_json,
turns=[RunTurnOut.model_validate(t) for t in turns],
)
diff --git a/api/app/features/evaluation/repo.py b/api/app/features/evaluation/repo.py
index 70c4f0e..0e3c105 100644
--- a/api/app/features/evaluation/repo.py
+++ b/api/app/features/evaluation/repo.py
@@ -204,6 +204,27 @@ async def list_runs_for_experiment(
return list(res.scalars().all())
+async def list_runs_for_source_conversation(
+    db: AsyncSession,
+    *,
+    source_conversation_id: str,
+    limit: int = 80,
+) -> list[tuple[EvalRun, EvalCase, EvalExperiment]]:
+    """List runs, with their case and experiment, for one source conversation.
+
+    Rows are restricted to cases whose ``source_conversation_id`` equals the
+    argument, ordered by ``completed_at`` then ``started_at`` (both
+    descending, NULLs last) and capped at ``limit``.
+    """
+    newest_first = (
+        EvalRun.completed_at.desc().nulls_last(),
+        EvalRun.started_at.desc().nulls_last(),
+    )
+    query = select(EvalRun, EvalCase, EvalExperiment)
+    query = query.join(EvalCase, EvalRun.case_id == EvalCase.id)
+    query = query.join(EvalExperiment, EvalRun.experiment_id == EvalExperiment.id)
+    query = query.where(EvalCase.source_conversation_id == source_conversation_id)
+    query = query.order_by(*newest_first).limit(limit)
+    result = await db.execute(query)
+    return list(result.all())
+
+
async def update_run(
db: AsyncSession,
run: EvalRun,
diff --git a/api/app/features/evaluation/router.py b/api/app/features/evaluation/router.py
index 848461b..2568480 100644
--- a/api/app/features/evaluation/router.py
+++ b/api/app/features/evaluation/router.py
@@ -28,10 +28,15 @@ from app.features.evaluation.schemas import (
ImportMarkdownBody,
RegressionSetCreate,
RegressionSetOut,
+ SessionDialogueOut,
+ SessionEvalRunsOut,
SessionListItem,
SessionListResponse,
SessionTranscriptOut,
SnapshotFromConversationBody,
+ UserExportFixtureDetailOut,
+ UserExportFixtureListOut,
+ UserExportFixtureTurnOut,
VersionCreate,
VersionOut,
)
@@ -40,6 +45,12 @@ from app.features.evaluation.session_catalog_service import SessionCatalogServic
router = APIRouter(tags=["internal-evaluation"])
+@router.get("/ping", include_in_schema=False)
+async def eval_api_ping() -> dict[str, str | bool]:
+    """Unauthenticated probe: confirms this process is internal_main and the router is mounted."""
+    return dict(ok=True, service="life-echo-internal-eval")
+
+
def _eval_http_exc(
e: EvaluationNotFoundError | EvaluationBadRequestError,
) -> HTTPException:
@@ -123,17 +134,23 @@ async def list_sessions(
limit: int = Query(50, ge=1, le=200),
user_id: str | None = Query(None),
q: str | None = Query(None),
+ status: str | None = Query(
+ None,
+ description="按会话 status 过滤,如 active",
+ ),
):
catalog = SessionCatalogService(db)
rows, total = await catalog.list_sessions(
- offset=offset, limit=limit, user_id=user_id, q=q
+ offset=offset, limit=limit, user_id=user_id, q=q, status=status
)
return SessionListResponse(
items=[
SessionListItem(
id=r.id,
user_id=r.user_id,
+ user_phone=r.user_phone,
started_at=r.started_at,
+ last_message_at=r.last_message_at,
conversation_stage=r.conversation_stage,
current_topic=r.current_topic,
status=r.status,
@@ -144,6 +161,22 @@ async def list_sessions(
)
+@router.get(
+    "/sessions/{conversation_id}/dialogue",
+    response_model=SessionDialogueOut,
+)
+async def get_session_dialogue(
+    conversation_id: str,
+    _auth: InternalEvalAuth,
+    db: Annotated[AsyncSession, Depends(get_async_db)],
+):
+    """Return the dialogue of one conversation, or HTTP 404 when the catalog has none."""
+    dialogue = await SessionCatalogService(db).get_session_dialogue(conversation_id)
+    if dialogue:
+        return dialogue
+    raise HTTPException(status_code=404, detail="conversation not found")
+
+
@router.get(
"/sessions/{conversation_id}/transcript", response_model=SessionTranscriptOut
)
@@ -164,6 +197,52 @@ async def get_session_transcript(
)
+@router.get(
+    "/sessions/{conversation_id}/evaluation-runs",
+    response_model=SessionEvalRunsOut,
+)
+async def list_session_evaluation_runs(
+    conversation_id: str,
+    _auth: InternalEvalAuth,
+    svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)],
+):
+    """Return the evaluation runs the admin service finds for ``conversation_id``."""
+    return await svc.list_session_evaluation_runs(conversation_id)
+
+
+@router.get(
+    "/fixtures/user-exports",
+    response_model=UserExportFixtureListOut,
+)
+async def list_user_export_fixtures(
+    _auth: InternalEvalAuth,
+    svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)],
+):
+    """Return the names of the available user-export fixtures."""
+    return UserExportFixtureListOut(items=svc.list_user_export_fixture_names())
+
+
+@router.get(
+    "/fixtures/user-exports/{filename}",
+    response_model=UserExportFixtureDetailOut,
+)
+async def get_user_export_fixture(
+    filename: str,
+    _auth: InternalEvalAuth,
+    svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)],
+):
+    """Return the parsed (user, ai) turns of one user-export fixture.
+
+    A ``ValueError`` from the service becomes HTTP 400 and a
+    ``FileNotFoundError`` becomes HTTP 404; the original exception is not
+    chained onto the HTTP error.
+    """
+    try:
+        pairs = svc.load_user_export_fixture_turns(filename)
+    except ValueError:
+        bad_name = HTTPException(status_code=400, detail="invalid fixture filename")
+        raise bad_name from None
+    except FileNotFoundError:
+        missing = HTTPException(status_code=404, detail="fixture not found")
+        raise missing from None
+    turn_models = []
+    for user_text, ai_text in pairs:
+        turn_models.append(UserExportFixtureTurnOut(user=user_text, ai=ai_text))
+    return UserExportFixtureDetailOut(filename=filename, turns=turn_models)
+
+
@router.post("/regression-sets/{set_id}/import-markdown", response_model=CaseOut)
async def import_markdown_case(
set_id: str,
diff --git a/api/app/features/evaluation/schemas.py b/api/app/features/evaluation/schemas.py
index 86135e3..386f68a 100644
--- a/api/app/features/evaluation/schemas.py
+++ b/api/app/features/evaluation/schemas.py
@@ -88,10 +88,25 @@ class ExperimentOut(BaseModel):
completed_at: datetime | None
+class SessionDialogueMessageOut(BaseModel):
+    """One message inside a ``SessionDialogueOut`` response."""
+
+    # Lets the model be populated from attribute-bearing objects.
+    model_config = ConfigDict(from_attributes=True)
+
+    role: str
+    content: str
+    created_at: datetime | None = None
+
+
+class SessionDialogueOut(BaseModel):
+    """Response body of the session dialogue endpoint: the messages of one conversation."""
+
+    conversation_id: str
+    messages: list[SessionDialogueMessageOut]
+
+
class SessionListItem(BaseModel):
id: str
user_id: str
+ user_phone: str | None = Field(default=None, description="users.phone,列表展示用")
started_at: datetime | None
+ last_message_at: datetime | None = None
conversation_stage: str | None
current_topic: str | None
status: str | None
@@ -109,6 +124,20 @@ class SessionTranscriptOut(BaseModel):
user_utterances_from_messages: list[str]
+class UserExportFixtureTurnOut(BaseModel):
+    """One (user, ai) turn parsed from a user-export fixture."""
+
+    user: str
+    ai: str
+
+
+class UserExportFixtureListOut(BaseModel):
+    """Response body listing the available fixture file names."""
+
+    items: list[str]
+
+
+class UserExportFixtureDetailOut(BaseModel):
+    """Response body for one fixture: its file name and parsed turns."""
+
+    filename: str
+    turns: list[UserExportFixtureTurnOut]
+
+
class SnapshotFromConversationBody(BaseModel):
title: str | None = None
use_messages: bool = False
@@ -157,9 +186,20 @@ class EvalRunOut(BaseModel):
conversation_score_total: float | None
memoir_score_total: float | None
composite_score: float | None
+ judge_bundle_json: dict[str, Any] | None = None
turns: list[RunTurnOut] = []
+class SessionEvalRunItem(BaseModel):
+    """One evaluation run together with the name of its experiment."""
+
+    experiment_name: str
+    run: EvalRunOut
+
+
+class SessionEvalRunsOut(BaseModel):
+    """Response body listing the evaluation runs tied to one conversation."""
+
+    conversation_id: str
+    items: list[SessionEvalRunItem]
+
+
class GateVerdictOut(BaseModel):
model_config = ConfigDict(from_attributes=True)
diff --git a/api/app/features/evaluation/session_catalog_repo.py b/api/app/features/evaluation/session_catalog_repo.py
index 9af2eb1..41a4e8a 100644
--- a/api/app/features/evaluation/session_catalog_repo.py
+++ b/api/app/features/evaluation/session_catalog_repo.py
@@ -4,6 +4,7 @@ from __future__ import annotations
from sqlalchemy import func, select
from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.orm import joinedload
from app.features.conversation.models import Conversation, ConversationMessage, Segment
@@ -12,12 +13,14 @@ class SessionCatalogRepo:
def __init__(self, db: AsyncSession) -> None:
self._db = db
- async def count_conversations(self) -> int:
+ async def count_conversations(self, *, status: str | None = None) -> int:
q = (
select(func.count())
.select_from(Conversation)
.where(Conversation.deleted_at.is_(None))
)
+ if status:
+ q = q.where(Conversation.status == status)
r = await self._db.execute(q)
return int(r.scalar() or 0)
@@ -28,11 +31,20 @@ class SessionCatalogRepo:
limit: int = 50,
user_id: str | None = None,
q_text: str | None = None,
+ status: str | None = None,
) -> list[Conversation]:
stmt = select(Conversation).where(Conversation.deleted_at.is_(None))
if user_id:
stmt = stmt.where(Conversation.user_id == user_id)
- stmt = stmt.order_by(Conversation.started_at.desc().nullslast())
+ if status:
+ stmt = stmt.where(Conversation.status == status)
+ if status == "active":
+ stmt = stmt.order_by(
+ Conversation.last_message_at.desc().nullslast(),
+ Conversation.started_at.desc().nullslast(),
+ )
+ else:
+ stmt = stmt.order_by(Conversation.started_at.desc().nullslast())
stmt = stmt.offset(offset).limit(limit)
# q_text: 简单按 topic 搜索(后续可扩展全文)
if q_text:
@@ -41,6 +53,7 @@ class SessionCatalogRepo:
(Conversation.current_topic.isnot(None))
& (Conversation.current_topic.ilike(like))
)
+ stmt = stmt.options(joinedload(Conversation.user))
res = await self._db.execute(stmt)
return list(res.scalars().unique().all())
diff --git a/api/app/features/evaluation/session_catalog_service.py b/api/app/features/evaluation/session_catalog_service.py
index f3d3da2..a37e153 100644
--- a/api/app/features/evaluation/session_catalog_service.py
+++ b/api/app/features/evaluation/session_catalog_service.py
@@ -6,6 +6,10 @@ from dataclasses import dataclass
from sqlalchemy.ext.asyncio import AsyncSession
+from app.features.evaluation.schemas import (
+ SessionDialogueMessageOut,
+ SessionDialogueOut,
+)
from app.features.evaluation.session_catalog_repo import SessionCatalogRepo
@@ -13,7 +17,9 @@ from app.features.evaluation.session_catalog_repo import SessionCatalogRepo
class SessionSummary:
id: str
user_id: str
+ user_phone: str | None
started_at: object | None
+ last_message_at: object | None
conversation_stage: str | None
current_topic: str | None
status: str | None
@@ -38,16 +44,23 @@ class SessionCatalogService:
limit: int = 50,
user_id: str | None = None,
q: str | None = None,
+ status: str | None = None,
) -> tuple[list[SessionSummary], int]:
- total = await self._repo.count_conversations()
+ total = await self._repo.count_conversations(status=status)
rows = await self._repo.list_conversations(
- offset=offset, limit=limit, user_id=user_id, q_text=q
+ offset=offset,
+ limit=limit,
+ user_id=user_id,
+ q_text=q,
+ status=status,
)
out = [
SessionSummary(
id=c.id,
user_id=c.user_id,
+ user_phone=c.user.phone if c.user is not None else None,
started_at=c.started_at,
+ last_message_at=c.last_message_at,
conversation_stage=c.conversation_stage,
current_topic=c.current_topic,
status=c.status,
@@ -56,6 +69,25 @@ class SessionCatalogService:
]
return out, total
+    async def get_session_dialogue(
+        self, conversation_id: str
+    ) -> SessionDialogueOut | None:
+        """Return the messages of a conversation as a ``SessionDialogueOut``.
+
+        Returns ``None`` when the repository finds no conversation or the
+        conversation has a ``deleted_at`` value.
+        """
+        conversation = await self._repo.get_conversation(conversation_id)
+        usable = bool(conversation) and not conversation.deleted_at
+        if not usable:
+            return None
+        rows = await self._repo.list_messages_for_conversation(conversation_id)
+        messages: list[SessionDialogueMessageOut] = []
+        for row in rows:
+            messages.append(
+                SessionDialogueMessageOut(
+                    role=row.role,
+                    content=row.content,
+                    created_at=row.created_at,
+                )
+            )
+        return SessionDialogueOut(conversation_id=conversation_id, messages=messages)
+
async def get_transcript(self, conversation_id: str) -> SessionTranscript | None:
c = await self._repo.get_conversation(conversation_id)
if not c or c.deleted_at:
diff --git a/api/app/features/evaluation/user_export_fixtures.py b/api/app/features/evaluation/user_export_fixtures.py
new file mode 100644
index 0000000..0ef39f3
--- /dev/null
+++ b/api/app/features/evaluation/user_export_fixtures.py
@@ -0,0 +1,36 @@
+"""只读加载 api/tests/user_exports/*.md,供内部评测台对照(非生产数据路径)。"""
+
+from __future__ import annotations
+
+import re
+from pathlib import Path
+
+from app.features.evaluation.importers.user_export_markdown import (
+ extract_dialogue_turns_from_export_md,
+)
+
+_SAFE_MD = re.compile(r"^[a-zA-Z0-9][a-zA-Z0-9_.-]*\.md$")
+
+
+def user_exports_dir() -> Path:
+    """Return the ``tests/user_exports`` directory under the ``api/`` root."""
+    # parents[3] of api/app/features/evaluation/<this file> is api/.
+    api_root = Path(__file__).resolve().parents[3]
+    return api_root.joinpath("tests", "user_exports")
+
+
+def list_user_export_fixture_names() -> list[str]:
+    """Return the sorted names of the ``*.md`` entries in the fixtures directory.
+
+    Returns an empty list when the directory does not exist.
+    """
+    root = user_exports_dir()
+    names = [entry.name for entry in root.glob("*.md")] if root.is_dir() else []
+    names.sort()
+    return names
+
+
+def read_user_export_fixture(filename: str) -> tuple[list[tuple[str, str]], str]:
+    """Load one fixture and return ``(turns, raw_text)``.
+
+    Args:
+        filename: Bare file name inside the fixtures directory.
+
+    Returns:
+        The (user, ai) turns parsed from the file and the file's full text.
+
+    Raises:
+        ValueError: ``filename`` does not fully match the safe-name pattern
+            (ASCII letters, digits, ``_``, ``.`` and ``-``, ending in ``.md``).
+        FileNotFoundError: The resolved path is not directly inside the
+            fixtures directory, or is not an existing file.
+    """
+    # fullmatch, not match: with match() the pattern's trailing "$" would
+    # still accept a name that ends in "\n".
+    if not _SAFE_MD.fullmatch(filename):
+        raise ValueError("invalid fixture filename")
+    root = user_exports_dir()
+    path = (root / filename).resolve()
+    # Compare after resolve() so the check applies to the final target path.
+    if path.parent != root.resolve() or not path.is_file():
+        raise FileNotFoundError(filename)
+    text = path.read_text(encoding="utf-8")
+    turns = extract_dialogue_turns_from_export_md(text)
+    return turns, text
diff --git a/api/app/internal_main.py b/api/app/internal_main.py
index 666822c..c0c34a4 100644
--- a/api/app/internal_main.py
+++ b/api/app/internal_main.py
@@ -16,6 +16,7 @@ setup_logging()
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import HTMLResponse
from fastapi.staticfiles import StaticFiles
from app.core.config import settings
@@ -53,6 +54,27 @@ internal_app.add_middleware(
register_exception_handlers(internal_app)
+@internal_app.get("/", include_in_schema=False, response_class=HTMLResponse)
+async def internal_eval_landing():
+ """浏览器打开 :8001 根路径时提示:界面在 Vite(默认 5174),本进程仅为 API。"""
+ docs_hint = (
+ '
OpenAPI 文档 /docs
'
+ if settings.internal_eval_enable_docs
+ else "(未开启文档;设置 INTERNAL_EVAL_ENABLE_DOCS=1 后可访问 /docs)
"
+ )
+ return f"""
+内部评测 API
+
+Life Echo · 内部回归评测 API
+这里是 HTTP API(端口由启动命令决定),没有内置网页。
+浏览「回归评测台」请在仓库执行 ./internal-eval.sh 或 cd app-eval-web && npm run dev,
+在终端里打开 Vite 给出的地址(一般为 http://127.0.0.1:5174/)。
+健康检查:/health
+{docs_hint}
+会话与对比接口前缀:/internal/api/evaluation/
+"""
+
+
@internal_app.on_event("startup")
async def _startup():
import asyncio
diff --git a/api/docs/internal-eval.md b/api/docs/internal-eval.md
index c34e967..25b80b1 100644
--- a/api/docs/internal-eval.md
+++ b/api/docs/internal-eval.md
@@ -4,6 +4,24 @@
## 启动
+一键脚本 `internal-eval.sh` 与 `development.sh` **不是重复各启一套主站**:
+
+| | `development.sh` | `internal-eval.sh` |
+|---|------------------|---------------------|
+| HTTP | 主站 `main:app`(默认 **8000**) | 仅评测 `internal_app`(默认 **8001**) |
+| Celery | 会起一个 worker | 默认也会起一个 worker(可与下面「瘦启动」二选一) |
+
+评测分析只需要 **8001 上的 internal API**;若你已经在跑 `development.sh`(DB/Redis/主站/已有 Celery),不必再起第二份基础设施和 worker:
+
+```bash
+cd api
+chmod +x internal-eval.sh
+# 确保 .env.development 或 .env 里有 INTERNAL_EVAL_API_KEY
+SKIP_INFRA=1 SKIP_INSTALL=1 SKIP_CELERY=1 ./internal-eval.sh # 推荐:只多开 8001
+```
+
+全新机器、只跑评测栈时可直接 `./internal-eval.sh`(会起 docker、`uv sync`、迁移、8001 + Celery)。**默认会起 `app-eval-web`,并用 Vite `--open` 尝试打开浏览器**(`http://127.0.0.1:5174/`)。不要前端时设 `START_EVAL_WEB=0`;只要前端但不要弹窗时设 `OPEN_EVAL_WEB=0`。
+
数据库与主服务共用;需配置环境变量后启动专用进程:
```bash
diff --git a/api/internal-eval.sh b/api/internal-eval.sh
new file mode 100755
index 0000000..4033d7e
--- /dev/null
+++ b/api/internal-eval.sh
@@ -0,0 +1,374 @@
+#!/usr/bin/env bash
+
+# 仅启动「内部回归评测」栈(app/internal_main.py),不启动主站 consumer API。
+#
+# 与 development.sh 的区别:
+# - development.sh:main:app + Celery(通常 :8000),面向 App/主业务。
+# - internal-eval.sh:internal_app + Celery(:8001),仅评测/回放/GLM 打分/门禁。
+# 二者共用数据库与 Redis;不会拉起第二份 main:app。
+#
+# 若本机已在跑 ./development.sh,只想多开评测 HTTP(推荐,避免第二套 worker/docker):
+# SKIP_INFRA=1 SKIP_INSTALL=1 SKIP_CELERY=1 ./internal-eval.sh
+#
+# 用法:cd api && ./internal-eval.sh
+# 可选环境变量:
+# SKIP_INFRA=1 已起好 Postgres/Redis 时跳过 docker compose
+# SKIP_INSTALL=1 跳过 uv sync
+# SKIP_CELERY=1 仅起内部 API(别处已有 Celery worker 时)
+# START_EVAL_WEB=0 不起评测前端(默认会起 app-eval-web,需已 npm install)
+# OPEN_EVAL_WEB=0 起前端但不自动打开浏览器(默认 Vite --open)
+# EVAL_WEB_PORT 打印提示用,默认 5174(与 app-eval-web/vite.config.ts 一致)
+# INTERNAL_EVAL_PORT 默认 8001
+# CELERY_POOL 默认 solo(与 development.sh 一致)
+
+set -euo pipefail
+
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m'
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${ROOT_DIR}/.." && pwd)"
+EVAL_WEB_DIR="${REPO_ROOT}/app-eval-web"
+
+VENV_DIR="${ROOT_DIR}/.venv"
+UVICORN_BIN="${VENV_DIR}/bin/uvicorn"
+CELERY_BIN="${VENV_DIR}/bin/celery"
+
+INTERNAL_EVAL_HOST="${INTERNAL_EVAL_HOST:-0.0.0.0}"
+INTERNAL_EVAL_PORT="${INTERNAL_EVAL_PORT:-8001}"
+CELERY_POOL="${CELERY_POOL:-solo}"
+SKIP_INSTALL="${SKIP_INSTALL:-0}"
+SKIP_INFRA="${SKIP_INFRA:-0}"
+SKIP_CELERY="${SKIP_CELERY:-0}"
+START_EVAL_WEB="${START_EVAL_WEB:-1}"
+OPEN_EVAL_WEB="${OPEN_EVAL_WEB:-1}"
+EVAL_WEB_PORT="${EVAL_WEB_PORT:-5174}"
+SHUTDOWN_TIMEOUT="${SHUTDOWN_TIMEOUT:-12}"
+
+API_PID=""
+CELERY_PID=""
+EVAL_WEB_PID=""
+CLEANED_UP=0
+INFRA_STARTED=0
+
+# Print "$1" as a blue banner framed by separator lines.
+print_header() {
+    echo -e "\n${BLUE}========================================${NC}"
+    echo -e "${BLUE}$1${NC}"
+    echo -e "${BLUE}========================================${NC}"
+}
+
+# Print "$1" in green with a check-mark prefix.
+print_ok() {
+    echo -e "${GREEN}✓ $1${NC}"
+}
+
+# Print "$1" in yellow with a warning-sign prefix.
+print_warn() {
+    echo -e "${YELLOW}⚠ $1${NC}"
+}
+
+# Print "$1" in red with a cross prefix (written to stdout, like the others).
+print_err() {
+    echo -e "${RED}✗ $1${NC}"
+}
+
+# Succeed when PID "$1" is non-empty and `kill -0` can signal it.
+is_pid_alive() {
+    local pid="$1"
+    [[ -n "${pid}" ]] && kill -0 "${pid}" 2>/dev/null
+}
+
+# Poll once per second until PID "$1" is gone.
+# Returns 0 once it has exited, 1 if it is still alive after "$2" seconds.
+wait_pid_exit() {
+    local pid="$1"
+    local timeout="$2"
+    local waited=0
+
+    while is_pid_alive "${pid}"; do
+        if (( waited >= timeout )); then
+            return 1
+        fi
+        sleep 1
+        waited=$((waited + 1))
+    done
+    return 0
+}
+
+# Send SIGTERM to every descendant of PID "$1", recursing into each child
+# before signalling it. The PID itself is not signalled here.
+kill_children_term() {
+    local pid="$1"
+    local children
+
+    # pgrep exits non-zero when there are no children; "|| true" keeps set -e quiet.
+    children="$(pgrep -P "${pid}" 2>/dev/null || true)"
+    if [[ -n "${children}" ]]; then
+        while IFS= read -r child_pid; do
+            [[ -z "${child_pid}" ]] && continue
+            kill_children_term "${child_pid}"
+            kill -TERM "${child_pid}" 2>/dev/null || true
+        done <<< "${children}"
+    fi
+}
+
+# Stop process "$2" (display name "$1"): SIGTERM its descendants and itself,
+# wait up to "$3" seconds (default 10), then SIGKILL it if still alive.
+stop_process_gracefully() {
+    local name="$1"
+    local pid="$2"
+    local timeout="${3:-10}"
+
+    if ! is_pid_alive "${pid}"; then
+        print_ok "${name} 已退出"
+        return 0
+    fi
+
+    print_warn "正在停止 ${name}(PID: ${pid})..."
+    kill_children_term "${pid}"
+    kill -TERM "${pid}" 2>/dev/null || true
+
+    if wait_pid_exit "${pid}" "${timeout}"; then
+        print_ok "${name} 已停止"
+        return 0
+    fi
+
+    print_warn "${name} 在 ${timeout}s 内未退出,准备强制结束"
+    kill -KILL "${pid}" 2>/dev/null || true
+    # Give the kernel a moment to reap it; the result is not checked.
+    wait_pid_exit "${pid}" 3 || true
+    print_ok "${name} 已强制结束"
+}
+
+# Teardown handler for the EXIT/INT/TERM trap. Runs at most once (CLEANED_UP
+# guard): stops Vite, the API and Celery if alive, then stops the compose
+# containers only when this script started them (INFRA_STARTED=1).
+cleanup() {
+    if [[ "${CLEANED_UP}" == "1" ]]; then
+        return 0
+    fi
+    CLEANED_UP=1
+
+    print_header "正在关闭内部评测环境"
+
+    if is_pid_alive "${EVAL_WEB_PID}"; then
+        stop_process_gracefully "eval-web (Vite)" "${EVAL_WEB_PID}" "${SHUTDOWN_TIMEOUT}"
+    fi
+
+    if is_pid_alive "${API_PID}"; then
+        stop_process_gracefully "Internal Eval API" "${API_PID}" "${SHUTDOWN_TIMEOUT}"
+    fi
+
+    if is_pid_alive "${CELERY_PID}"; then
+        stop_process_gracefully "Celery" "${CELERY_PID}" "${SHUTDOWN_TIMEOUT}"
+    fi
+
+    if [[ "${INFRA_STARTED}" == "1" ]]; then
+        print_warn "正在停止 PostgreSQL / Redis 容器..."
+        # Subshell keeps the cd local; failures here are ignored on purpose.
+        (
+            cd "${ROOT_DIR}" && docker compose -f docker-compose.dev.yml stop
+        ) >/dev/null 2>&1 || true
+        print_ok "PostgreSQL/Redis 容器已停止"
+    fi
+}
+
+# Exit with status 1 unless command "$1" is found on PATH.
+require_cmd() {
+    local cmd="$1"
+    if ! command -v "${cmd}" >/dev/null 2>&1; then
+        print_err "未找到命令: ${cmd}"
+        exit 1
+    fi
+}
+
+# Bring up the services in docker-compose.dev.yml (detached) and record in
+# INFRA_STARTED that this script started them, so cleanup stops them again.
+start_infra() {
+    print_header "启动 PostgreSQL 和 Redis"
+    cd "${ROOT_DIR}"
+    docker compose -f docker-compose.dev.yml up -d
+    INFRA_STARTED=1
+    print_ok "基础设施已就绪"
+}
+
+# Poll `pg_isready` inside the compose "postgres" service once per second,
+# up to 30 attempts. Returns 0 when it reports ready, 1 otherwise.
+wait_postgres_ready() {
+    local retries=30
+    local i=0
+    print_header "等待 PostgreSQL 就绪"
+    cd "${ROOT_DIR}"
+    while (( i < retries )); do
+        if docker compose -f docker-compose.dev.yml exec -T postgres \
+            pg_isready -U postgres >/dev/null 2>&1; then
+            print_ok "PostgreSQL 已就绪"
+            return 0
+        fi
+        sleep 1
+        i=$((i + 1))
+    done
+    print_warn "PostgreSQL 在 ${retries}s 内未就绪,迁移可能失败"
+    return 1
+}
+
+# Create .venv with `uv venv` when missing, then run `uv sync` unless
+# SKIP_INSTALL=1.
+ensure_venv() {
+    print_header "检查 Python 虚拟环境"
+    # `uv sync` acts on the project in the current directory. With
+    # SKIP_INFRA=1 nothing has changed directory yet, so do it here, as the
+    # other steps do.
+    cd "${ROOT_DIR}"
+
+    if [[ ! -d "${VENV_DIR}" ]]; then
+        print_warn ".venv 不存在,正在创建"
+        uv venv "${VENV_DIR}"
+    fi
+
+    if [[ "${SKIP_INSTALL}" != "1" ]]; then
+        print_header "安装 Python 依赖"
+        uv sync
+        print_ok "依赖安装完成"
+    else
+        print_warn "已跳过依赖安装 (SKIP_INSTALL=1)"
+    fi
+}
+
+# Copy .env.development over .env when the former exists; otherwise leave
+# whatever .env is present untouched.
+# NOTE(review): the copy overwrites an existing .env without a backup —
+# confirm this is intended for developers who keep local edits in .env.
+ensure_dotenv_from_development() {
+    print_header "准备本地 .env"
+    if [[ -f "${ROOT_DIR}/.env.development" ]]; then
+        cp "${ROOT_DIR}/.env.development" "${ROOT_DIR}/.env"
+        print_ok "已从 .env.development 同步为 .env"
+        return 0
+    fi
+    print_warn "未找到 .env.development,将使用现有 .env(若存在)"
+}
+
+# Require INTERNAL_EVAL_API_KEY: accept a non-empty assignment in .env or a
+# non-empty environment variable; otherwise print guidance and exit 1.
+check_internal_eval_key() {
+    print_header "检查内部评测密钥"
+    if [[ -f "${ROOT_DIR}/.env" ]] && grep -qE '^INTERNAL_EVAL_API_KEY=.+' "${ROOT_DIR}/.env" 2>/dev/null; then
+        print_ok "已在 .env 中配置 INTERNAL_EVAL_API_KEY"
+        return 0
+    fi
+    if [[ -n "${INTERNAL_EVAL_API_KEY:-}" ]]; then
+        print_ok "已从环境变量传入 INTERNAL_EVAL_API_KEY"
+        return 0
+    fi
+    print_err "未配置 INTERNAL_EVAL_API_KEY:内部评测接口将返回 503。"
+    print_err "请在 api/.env.development(或 .env)中加入一行,例如:"
+    print_err " INTERNAL_EVAL_API_KEY=\"your-long-random-secret\""
+    exit 1
+}
+
+# Report whether .env exists; a missing file only produces a warning.
+check_env_file() {
+    print_header "检查环境变量文件"
+    if [[ ! -f "${ROOT_DIR}/.env" ]]; then
+        print_warn "未找到 .env,应用可能因缺少配置启动失败"
+    else
+        print_ok "检测到 .env"
+    fi
+}
+
+# Run `alembic upgrade head`. A failure is reported as a warning and does not
+# stop the script.
+run_migrations() {
+    print_header "执行数据库迁移"
+    cd "${ROOT_DIR}"
+    # Capture the output instead of discarding stderr: the success path stays
+    # quiet, and on failure the real Alembic error is shown.
+    local migrate_log
+    if migrate_log="$(uv run alembic upgrade head 2>&1)"; then
+        print_ok "Alembic 迁移已就绪"
+    else
+        print_warn "Alembic 迁移失败(可能数据库未启动或 DATABASE_URL 未配置),应用启动可能失败"
+        printf '%s\n' "${migrate_log}" >&2
+    fi
+}
+
+# Start the app-eval-web Vite dev server in the background and store its PID
+# in EVAL_WEB_PID. Exits 1 when the directory, node_modules, npm or the API
+# key is missing. The key comes from the environment, else from .env.
+start_eval_web() {
+    print_header "启动 app-eval-web (Vite)"
+    if [[ ! -d "${EVAL_WEB_DIR}" ]]; then
+        print_err "未找到 ${EVAL_WEB_DIR}"
+        exit 1
+    fi
+    if [[ ! -d "${EVAL_WEB_DIR}/node_modules" ]]; then
+        print_err "请先执行: cd app-eval-web && npm install"
+        exit 1
+    fi
+    require_cmd "npm"
+
+    local api_key="${INTERNAL_EVAL_API_KEY:-}"
+    if [[ -z "${api_key}" ]] && [[ -f "${ROOT_DIR}/.env" ]]; then
+        # "|| true": with `set -e -o pipefail` a grep miss would abort the
+        # script right here, before the explicit error below is printed.
+        api_key="$(grep -E '^INTERNAL_EVAL_API_KEY=' "${ROOT_DIR}/.env" | head -1 | cut -d= -f2- | tr -d '\r' | sed 's/^"//;s/"$//' || true)"
+    fi
+    if [[ -z "${api_key}" ]]; then
+        print_err "无法解析 INTERNAL_EVAL_API_KEY,无法为 Vite 注入 VITE_EVAL_API_KEY"
+        exit 1
+    fi
+
+    local vite_extra=()
+    if [[ "${OPEN_EVAL_WEB}" == "1" ]]; then
+        vite_extra+=(--open)
+    fi
+
+    # VITE_EVAL_API_BASE is left unset so the frontend goes through the Vite
+    # proxy (app-eval-web/vite.config.ts) to :${INTERNAL_EVAL_PORT}; this avoids
+    # direct-connection/CORS problems and pointing at the main site by mistake.
+    # For a remote API: export VITE_EVAL_API_BASE=https://... and run
+    # `npm run dev` by hand.
+    (
+        cd "${EVAL_WEB_DIR}"
+        # ${arr[@]+...}: bash older than 4.4 treats an empty array as unset
+        # under `set -u`, so a plain "${vite_extra[@]}" aborts when
+        # OPEN_EVAL_WEB=0.
+        VITE_EVAL_API_KEY="${api_key}" \
+            npm run dev -- --host 127.0.0.1 --port "${EVAL_WEB_PORT}" ${vite_extra[@]+"${vite_extra[@]}"}
+    ) &
+    EVAL_WEB_PID=$!
+    print_ok "eval-web 已启动 (PID: ${EVAL_WEB_PID}) → http://127.0.0.1:${EVAL_WEB_PORT}/"
+}
+
+# Launch the internal eval API (uvicorn, with --reload), Celery unless
+# SKIP_CELERY=1, and the Vite frontend when START_EVAL_WEB=1, all in the
+# background, then print the URLs. Exits 1 if lsof shows the API port is
+# already listening.
+start_services() {
+    print_header "启动 Internal Eval API 与 Celery"
+    cd "${ROOT_DIR}"
+
+    # The port check is skipped when lsof is not installed.
+    if command -v lsof >/dev/null 2>&1; then
+        if lsof -nP -iTCP:"${INTERNAL_EVAL_PORT}" -sTCP:LISTEN >/dev/null 2>&1; then
+            print_err "端口 ${INTERNAL_EVAL_PORT} 已被占用,无法启动内部评测 Uvicorn。"
+            print_err "请先结束占用进程,或设置 INTERNAL_EVAL_PORT 为其他端口"
+            exit 1
+        fi
+    fi
+
+    # Same as the main dev script: judge/production LLM settings are read from
+    # .env; docs are off by default, export INTERNAL_EVAL_ENABLE_DOCS=1 locally.
+    "${UVICORN_BIN}" app.internal_main:internal_app --reload \
+        --reload-exclude 'alembic/**' \
+        --reload-exclude 'alembic.ini' \
+        --host "${INTERNAL_EVAL_HOST}" --port "${INTERNAL_EVAL_PORT}" &
+    API_PID=$!
+    print_ok "Internal Eval API 已启动 (PID: ${API_PID})"
+
+    if [[ "${SKIP_CELERY}" != "1" ]]; then
+        "${CELERY_BIN}" -A app.tasks.celery_app worker --loglevel=info --pool="${CELERY_POOL}" &
+        CELERY_PID=$!
+        print_ok "Celery 已启动 (PID: ${CELERY_PID})"
+    else
+        print_warn "已跳过 Celery (SKIP_CELERY=1);实验 run 接口需要 worker 才能执行"
+    fi
+
+    if [[ "${START_EVAL_WEB}" == "1" ]]; then
+        start_eval_web
+    fi
+
+    echo
+    echo -e "${GREEN}内部评测环境启动完成${NC}"
+    echo "【请用浏览器打开】评测 Web UI: http://127.0.0.1:${EVAL_WEB_PORT}/ (/internal 会代理到 API :${INTERNAL_EVAL_PORT})"
+    echo "内部评测 API: http://127.0.0.1:${INTERNAL_EVAL_PORT}/health"
+    echo "评测 REST 前缀: http://127.0.0.1:${INTERNAL_EVAL_PORT}/internal/api/evaluation"
+    if [[ "${INTERNAL_EVAL_ENABLE_DOCS:-}" == "1" ]] || grep -qE '^INTERNAL_EVAL_ENABLE_DOCS=true' "${ROOT_DIR}/.env" 2>/dev/null; then
+        echo "API 文档: http://127.0.0.1:${INTERNAL_EVAL_PORT}/docs"
+    fi
+    echo "说明文档: api/docs/internal-eval.md"
+    echo "按 Ctrl+C 停止所有进程"
+}
+
+# Entry point: check tools, install the cleanup trap, optionally start the
+# docker infrastructure, prepare the venv and .env, run migrations, start the
+# services, then block until the started background processes exit.
+main() {
+    print_header "Life Echo 内部回归评测 — 一键启动"
+    echo -e "${BLUE}说明:${NC} 不启动主站 API(main:app / 默认 8000);仅启动 internal_main(:${INTERNAL_EVAL_PORT})。"
+    echo ""
+
+    require_cmd "uv"
+
+    trap cleanup EXIT INT TERM
+
+    if [[ "${SKIP_INFRA}" != "1" ]]; then
+        require_cmd "docker"
+        start_infra
+        # A not-ready database only warns; later steps still run.
+        wait_postgres_ready || true
+    else
+        print_warn "已跳过 docker 基础设施 (SKIP_INFRA=1)"
+    fi
+
+    ensure_venv
+    ensure_dotenv_from_development
+    check_env_file
+    check_internal_eval_key
+    run_migrations
+    start_services
+
+    # API_PID is always present, so the array is never empty.
+    local wait_pids=("${API_PID}")
+    if [[ "${SKIP_CELERY}" != "1" ]]; then
+        wait_pids+=("${CELERY_PID}")
+    fi
+    if [[ "${START_EVAL_WEB}" == "1" ]] && [[ -n "${EVAL_WEB_PID}" ]]; then
+        wait_pids+=("${EVAL_WEB_PID}")
+    fi
+    wait "${wait_pids[@]}"
+}
+
+main "$@"
diff --git a/api/tests/evaluation/test_importers.py b/api/tests/evaluation/test_importers.py
index 2c0bb13..329f4ee 100644
--- a/api/tests/evaluation/test_importers.py
+++ b/api/tests/evaluation/test_importers.py
@@ -1,5 +1,10 @@
from app.features.evaluation.importers.script_json import parse_script_json
+from pathlib import Path
+
+import pytest
+
from app.features.evaluation.importers.user_export_markdown import (
+ extract_dialogue_turns_from_export_md,
extract_user_utterances_from_export_md,
)
@@ -27,3 +32,43 @@ hello
hi
"""
assert extract_user_utterances_from_export_md(md) == ["hello"]
+
+
+def test_extract_dialogue_turns_from_export_md() -> None:
+ md = """
+#### 轮次 1 — x
+
+**用户:**
+
+u1
+
+**AI:**
+
+a1
+
+#### 轮次 2 — y
+
+**用户:**
+
+u2
+
+**AI:**
+
+a2
+"""
+ turns = extract_dialogue_turns_from_export_md(md)  # two rounds in the markdown above
+ assert turns == [("u1", "a1"), ("u2", "a2")]  # (user, ai) per round, in order
+
+
+def test_extract_dialogue_turns_from_repo_user_export() -> None:
+    """Parse a user-export fixture from tests/user_exports; skipped when the file is absent."""
+    tests_root = Path(__file__).resolve().parents[1]
+    fixture_name = "13701020203_e27fcd97-fefa-43b8-a7a3-3ecd49ebf5f0.md"
+    fixture = tests_root / "user_exports" / fixture_name
+    if not fixture.is_file():
+        pytest.skip("user export fixture not present")
+    parsed = extract_dialogue_turns_from_export_md(
+        fixture.read_text(encoding="utf-8")
+    )
+    assert len(parsed) >= 5
+    assert "你好" in parsed[0][0]
diff --git a/app-eval-web/src/App.tsx b/app-eval-web/src/App.tsx
index 8c906e5..ee4ebbd 100644
--- a/app-eval-web/src/App.tsx
+++ b/app-eval-web/src/App.tsx
@@ -1,14 +1,41 @@
import { useCallback, useEffect, useState } from "react";
+const envApiBase = (
+ import.meta.env.VITE_EVAL_API_BASE as string | undefined
+)?.trim() ?? "";
+/**
+ * 开发 + 未设 VITE_EVAL_API_BASE:用相对路径走 Vite proxy → :8001(见 vite.config.ts)。
+ * 生产构建未配 env 时仍回退直连 8001。
+ */
const apiBase =
- import.meta.env.VITE_EVAL_API_BASE ?? "http://127.0.0.1:8001";
-const apiKey = import.meta.env.VITE_EVAL_API_KEY ?? "";
+ envApiBase || (import.meta.env.DEV ? "" : "http://127.0.0.1:8001");
+const apiKey =
+ (import.meta.env.VITE_EVAL_API_KEY as string | undefined)?.trim() ?? "";
+
+const apiBaseHint =
+ apiBase === ""
+ ? "(开发)请求经 Vite 代理到 http://127.0.0.1:8001"
+ : `直连 ${apiBase}`;
+
+/** Poll interval (ms) for the home-page session list. */
+const SESSION_LIST_POLL_MS = 4000;
+/** Poll interval (ms) for the live dialogue on the left side of the compare page. */
+const DIALOGUE_POLL_MS = 3500;
+/** Poll interval (ms) for the GLM / evaluation runs on the right side of the compare page. */
+const SESSION_EVAL_POLL_MS = 8000;
+/** Poll interval (ms) for the regression-set / experiment lists on the advanced (admin) page. */
+const ADMIN_POLL_MS = 8000;
+
+/** Default export snapshot used for comparison (under api/tests/user_exports/). */
+const DEFAULT_USER_EXPORT_FIXTURE =
+ "13701020203_e27fcd97-fefa-43b8-a7a3-3ecd49ebf5f0.md";
async function api(
path: string,
init?: RequestInit,
): Promise<{ ok: boolean; data?: T; error?: string; status: number }> {
- const r = await fetch(`${apiBase}${path}`, {
+ const url = `${apiBase}${path.startsWith("/") ? path : `/${path}`}`;
+ const r = await fetch(url, {
...init,
headers: {
"X-Internal-Eval-Key": apiKey,
@@ -27,9 +54,13 @@ async function api(
return {
ok: false,
status: r.status,
- error: typeof data === "object" && data && "detail" in (data as object)
- ? String((data as { detail: unknown }).detail)
- : text || r.statusText,
+ error:
+ typeof data === "object" &&
+ data &&
+ "detail" in (data as object) &&
+ data !== null
+ ? String((data as unknown as { detail: unknown }).detail)
+ : text || r.statusText,
};
}
return { ok: true, data, status: r.status };
@@ -38,103 +69,351 @@ async function api(
type SessionItem = {
id: string;
user_id: string;
+ user_phone: string | null;
started_at: string | null;
+ last_message_at: string | null;
conversation_stage: string | null;
current_topic: string | null;
status: string | null;
};
+type DialogueMessage = {
+ role: string;
+ content: string;
+ created_at?: string | null;
+};
+
+type RunTurnOut = {
+ id: string;
+ turn_index: number;
+ user_utterance: string;
+ assistant_reply: string | null;
+ duration_ms: number | null;
+ judge_scores_json: Record | null;
+ judge_rationale: string | null;
+};
+
+type EvalRunOut = {
+ id: string;
+ experiment_id: string;
+ case_id: string;
+ side: string;
+ status: string;
+ error_message: string | null;
+ memoir_markdown: string | null;
+ conversation_score_total: number | null;
+ memoir_score_total: number | null;
+ composite_score: number | null;
+ judge_bundle_json: Record | null;
+ turns: RunTurnOut[];
+};
+
+type SessionEvalRunItem = {
+ experiment_name: string;
+ run: EvalRunOut;
+};
+
+/** Format a numeric score to one decimal place; non-numbers and NaN render as an em dash. */
+function fmtScore(n: unknown): string {
+  return typeof n === "number" && !Number.isNaN(n) ? n.toFixed(1) : "—";
+}
+
+function JsonPreview({ value }: { value: unknown }) {
+ if (value == null) return —;
+ return (
+
+ {JSON.stringify(value, null, 2)}
+
+ );
+}
+
+const shell: React.CSSProperties = {
+ minHeight: "100vh",
+ background: "#0f1419",
+ color: "#e6edf3",
+ fontFamily:
+ 'ui-sans-serif, system-ui, -apple-system, "Segoe UI", Roboto, sans-serif',
+};
+
+const btn: React.CSSProperties = {
+ padding: "8px 14px",
+ borderRadius: 8,
+ border: "1px solid #30363d",
+ background: "#21262d",
+ color: "#e6edf3",
+ cursor: "pointer",
+ fontSize: 14,
+};
+
+const btnPrimary: React.CSSProperties = {
+ ...btn,
+ background: "#238636",
+ borderColor: "#238636",
+};
+
+/** Render an ISO timestamp in the viewer's locale; "—" when absent, the raw input when unparseable. */
+function formatTime(iso: string | null | undefined) {
+  if (!iso) return "—";
+  const d = new Date(iso);
+  // `new Date(bad)` does not throw; it yields an Invalid Date, so a try/catch
+  // fallback is never reached and the UI would show "Invalid Date" instead.
+  if (Number.isNaN(d.getTime())) return iso;
+  return d.toLocaleString();
+}
+
export default function App() {
- const [tab, setTab] = useState<
- "sessions" | "sets" | "versions" | "experiments"
- >("sessions");
+ const [view, setView] = useState<"home" | "session" | "admin">("home");
const [msg, setMsg] = useState("");
const [sessions, setSessions] = useState([]);
- const [sets, setSets] = useState<{ id: string; name: string }[]>([]);
+ const [selectedId, setSelectedId] = useState(null);
+
+ const [dialogue, setDialogue] = useState([]);
+ const [fallbackUserLines, setFallbackUserLines] = useState([]);
+ const [loadingLeft, setLoadingLeft] = useState(false);
+
const [versions, setVersions] = useState<{ id: string; name: string }[]>([]);
+
+ const [sessionEvalItems, setSessionEvalItems] = useState(
+ [],
+ );
+ const [sessionEvalUpdatedAt, setSessionEvalUpdatedAt] = useState(
+ null,
+ );
+
+ const [adminTab, setAdminTab] = useState<
+ "sets" | "versions" | "experiments"
+ >("experiments");
+ const [sets, setSets] = useState<{ id: string; name: string }[]>([]);
const [experiments, setExperiments] = useState<
{ id: string; name: string; status: string }[]
>([]);
const [selSet, setSelSet] = useState("");
const [newSetName, setNewSetName] = useState("默认回归集");
- const [newVerName, setNewVerName] = useState("baseline");
+ const [newVerName, setNewVerName] = useState("candidate-v1");
const [verConfig, setVerConfig] = useState("{}");
const [selExp, setSelExp] = useState(null);
const [expDetail, setExpDetail] = useState(null);
- const [streamLog, setStreamLog] = useState([]);
+ const [enqueueingExpId, setEnqueueingExpId] = useState(null);
+ const [evalReachable, setEvalReachable] = useState<
+ "unknown" | "ok" | "bad"
+ >("unknown");
+ const [sessionsUpdatedAt, setSessionsUpdatedAt] = useState(null);
+ const [dialogueUpdatedAt, setDialogueUpdatedAt] = useState(null);
- const refreshSessions = useCallback(async () => {
- const r = await api<{ items: SessionItem[]; total: number }>(
- "/internal/api/evaluation/sessions?limit=30",
- );
- if (r.ok && r.data) setSessions(r.data.items);
- else setMsg(r.error ?? "sessions failed");
- }, []);
+ const [fixtureFiles, setFixtureFiles] = useState([]);
+ const [fixtureName, setFixtureName] = useState("");
+ const [fixtureTurns, setFixtureTurns] = useState<
+ { user: string; ai: string }[]
+ >([]);
- const refreshSets = useCallback(async () => {
- const r = await api<{ id: string; name: string }[]>(
- "/internal/api/evaluation/regression-sets",
- );
+ /** 近期全部:含已结束会话;仅进行中:status=active(多数字段在用户挂断后为 ended,列表会空) */
+ const [sessionFilter, setSessionFilter] = useState<"recent" | "active">(
+ "recent",
+ );
+
+ const refreshSessionList = useCallback(async () => {
+ const path =
+ sessionFilter === "active"
+ ? "/internal/api/evaluation/sessions?status=active&limit=80"
+ : "/internal/api/evaluation/sessions?limit=80";
+ const r = await api<{ items: SessionItem[]; total: number }>(path);
if (r.ok && r.data) {
- setSets(r.data);
- if (!selSet && r.data[0]) setSelSet(r.data[0].id);
- } else setMsg(r.error ?? "sets failed");
- }, [selSet]);
+ setSessions(r.data.items);
+ setSessionsUpdatedAt(new Date());
+ setMsg("");
+ } else {
+ const hint =
+ r.status === 404
+ ? `找不到接口 (404)。请在终端执行: curl -s http://127.0.0.1:8001/internal/api/evaluation/ping (应返回 {"ok":true,...})。若此处也 404,说明 8001 上不是 internal_main。${apiBaseHint};也可删掉 VITE_EVAL_API_BASE 仅用代理。`
+ : (r.error ?? "加载会话失败");
+ setMsg(hint);
+ }
+ }, [sessionFilter]);
const refreshVersions = useCallback(async () => {
const r = await api<{ id: string; name: string }[]>(
"/internal/api/evaluation/versions",
);
if (r.ok && r.data) setVersions(r.data);
- else setMsg(r.error ?? "versions failed");
}, []);
- const refreshExperiments = useCallback(async () => {
- const r = await api<{ id: string; name: string; status: string }[]>(
+ const pullSessionEvalRuns = useCallback(async (conversationId: string) => {
+ const r = await api<{ items: SessionEvalRunItem[] }>(
+ `/internal/api/evaluation/sessions/${conversationId}/evaluation-runs`,
+ );
+ if (r.ok && r.data?.items) setSessionEvalItems(r.data.items);
+ else setSessionEvalItems([]);
+ setSessionEvalUpdatedAt(new Date());
+ }, []);
+
+ const pullDialogue = useCallback(async (conversationId: string) => {
+ const d = await api<{ messages: DialogueMessage[] }>(
+ `/internal/api/evaluation/sessions/${conversationId}/dialogue`,
+ );
+ if (d.ok && d.data?.messages?.length) {
+ setDialogue(d.data.messages);
+ setFallbackUserLines([]);
+ } else {
+ const t = await api<{
+ user_utterances_from_messages: string[];
+ user_utterances_from_segments: string[];
+ }>(`/internal/api/evaluation/sessions/${conversationId}/transcript`);
+ if (t.ok && t.data) {
+ const lines =
+ t.data.user_utterances_from_messages.length > 0
+ ? t.data.user_utterances_from_messages
+ : t.data.user_utterances_from_segments;
+ setDialogue([]);
+ setFallbackUserLines(lines);
+ }
+ }
+ setDialogueUpdatedAt(new Date());
+ }, []);
+
+ const loadSessionPageInitial = useCallback(
+ (conversationId: string) => {
+ setLoadingLeft(true);
+ setDialogue([]);
+ setFallbackUserLines([]);
+ setSessionEvalItems([]);
+ setSessionEvalUpdatedAt(null);
+ void pullDialogue(conversationId).finally(() => setLoadingLeft(false));
+ },
+ [pullDialogue],
+ );
+
+ const refreshAdminData = useCallback(async () => {
+ const rs = await api<{ id: string; name: string }[]>(
+ "/internal/api/evaluation/regression-sets",
+ );
+ if (rs.ok && rs.data) {
+ const rows = rs.data;
+ setSets(rows);
+ setSelSet((cur) => {
+ if (cur) return cur;
+ return rows[0]?.id ?? "";
+ });
+ }
+ const ex = await api<{ id: string; name: string; status: string }[]>(
"/internal/api/evaluation/experiments",
);
- if (r.ok && r.data) setExperiments(r.data);
- else setMsg(r.error ?? "experiments failed");
+ if (ex.ok && ex.data) setExperiments(ex.data);
+ const vr = await api<{ id: string; name: string }[]>(
+ "/internal/api/evaluation/versions",
+ );
+ if (vr.ok && vr.data) setVersions(vr.data);
}, []);
useEffect(() => {
- void refreshSets();
- void refreshVersions();
- void refreshExperiments();
- }, [refreshSets, refreshVersions, refreshExperiments]);
+ void (async () => {
+ try {
+ const url = `${apiBase}/internal/api/evaluation/ping`;
+ const r = await fetch(url);
+ const j = (await r.json()) as { ok?: boolean; service?: string };
+ setEvalReachable(r.ok && j.ok === true ? "ok" : "bad");
+ } catch {
+ setEvalReachable("bad");
+ }
+ })();
+ void refreshSessionList();
+ const t = setInterval(() => void refreshSessionList(), SESSION_LIST_POLL_MS);
+ return () => clearInterval(t);
+ }, [refreshSessionList]);
useEffect(() => {
- if (tab === "sessions") void refreshSessions();
- }, [tab, refreshSessions]);
+ void refreshVersions();
+ }, [refreshVersions]);
- async function createSet() {
- const r = await api<{ id: string }>("/internal/api/evaluation/regression-sets", {
- method: "POST",
- body: JSON.stringify({ name: newSetName, description: "" }),
- });
- if (r.ok) {
- setMsg(`创建回归集 OK`);
- await refreshSets();
- } else setMsg(r.error ?? "fail");
- }
+ useEffect(() => {
+ if (view !== "session" || !selectedId) return;
+ const t = setInterval(() => {
+ void pullDialogue(selectedId);
+ }, DIALOGUE_POLL_MS);
+ return () => clearInterval(t);
+ }, [view, selectedId, pullDialogue]);
- async function snapshotSession(cid: string) {
- if (!selSet) {
- setMsg("先选择回归集");
+ useEffect(() => {
+ if (view !== "session" || !selectedId) return;
+ void pullSessionEvalRuns(selectedId);
+ const t = setInterval(() => {
+ void pullSessionEvalRuns(selectedId);
+ }, SESSION_EVAL_POLL_MS);
+ return () => clearInterval(t);
+ }, [view, selectedId, pullSessionEvalRuns]);
+
+ useEffect(() => {
+ if (view !== "session" || !selectedId) return;
+ void (async () => {
+ const r = await api<{ items: string[] }>(
+ "/internal/api/evaluation/fixtures/user-exports",
+ );
+ if (!r.ok || !r.data?.items?.length) {
+ setFixtureFiles([]);
+ return;
+ }
+ const items = r.data.items;
+ setFixtureFiles(items);
+ setFixtureName((cur) => {
+ if (cur && items.includes(cur)) return cur;
+ if (items.includes(DEFAULT_USER_EXPORT_FIXTURE))
+ return DEFAULT_USER_EXPORT_FIXTURE;
+ return items[0] ?? "";
+ });
+ })();
+ }, [view, selectedId]);
+
+ useEffect(() => {
+ if (view !== "session" || !fixtureName) {
+ setFixtureTurns([]);
return;
}
- const r = await api(
- `/internal/api/evaluation/regression-sets/${selSet}/snapshot-from-conversation/${cid}`,
- {
- method: "POST",
- body: JSON.stringify({
- title: "",
- use_messages: false,
- is_protected: false,
- }),
- },
+ void (async () => {
+ const r = await api<{ turns: { user: string; ai: string }[] }>(
+ `/internal/api/evaluation/fixtures/user-exports/${encodeURIComponent(fixtureName)}`,
+ );
+ if (r.ok && r.data?.turns) setFixtureTurns(r.data.turns);
+ else setFixtureTurns([]);
+ })();
+ }, [view, fixtureName]);
+
+ useEffect(() => {
+ if (view !== "admin") return;
+ void refreshAdminData();
+ const t = setInterval(() => void refreshAdminData(), ADMIN_POLL_MS);
+ return () => clearInterval(t);
+ }, [view, refreshAdminData]);
+
+ function openSession(id: string) {
+ setSelectedId(id);
+ setView("session");
+ loadSessionPageInitial(id);
+ }
+
+ async function createSet() {
+ const r = await api<{ id: string }>(
+ "/internal/api/evaluation/regression-sets",
+ { method: "POST", body: JSON.stringify({ name: newSetName, description: "" }) },
);
- setMsg(r.ok ? `已快照 ${cid}` : r.error ?? "fail");
+ setMsg(r.ok ? "回归集已创建" : r.error ?? "失败");
+ if (r.ok) {
+ const rs = await api<{ id: string; name: string }[]>(
+ "/internal/api/evaluation/regression-sets",
+ );
+ if (rs.ok && rs.data) setSets(rs.data);
+ }
}
async function createVersion() {
@@ -142,7 +421,7 @@ export default function App() {
try {
cfg = JSON.parse(verConfig || "{}") as Record;
} catch {
- setMsg("config_json 不是合法 JSON");
+ setMsg("config_json 无效");
return;
}
const r = await api<{ id: string }>("/internal/api/evaluation/versions", {
@@ -153,251 +432,916 @@ export default function App() {
config_json: cfg,
}),
});
- if (r.ok) {
- setMsg("创建版本 OK");
- await refreshVersions();
- } else setMsg(r.error ?? "fail");
+ setMsg(r.ok ? "版本已创建" : r.error ?? "失败");
+ if (r.ok) void refreshVersions();
+ }
+
+ async function snapshotFromDetail() {
+ if (!selectedId || !selSet) {
+ setMsg("在「高级配置」中选回归集 ID");
+ return;
+ }
+ const r = await api(
+ `/internal/api/evaluation/regression-sets/${selSet}/snapshot-from-conversation/${selectedId}`,
+ {
+ method: "POST",
+ body: JSON.stringify({
+ title: "",
+ use_messages: true,
+ is_protected: false,
+ }),
+ },
+ );
+ setMsg(r.ok ? "已快照到回归集" : r.error ?? "失败");
}
async function loadExp(eid: string) {
setSelExp(eid);
- const r = await api(`/internal/api/evaluation/experiments/${eid}`);
+ const r = await api(
+ `/internal/api/evaluation/experiments/${eid}`,
+ );
if (r.ok) setExpDetail(r.data);
else setMsg(r.error ?? "fail");
}
- async function createExperiment() {
- if (versions.length < 2 || sets.length < 1) {
- setMsg("至少需要 1 个回归集和 2 个版本(基线/候选)");
- return;
+ async function enqueueExperimentRun(eid: string) {
+ setEnqueueingExpId(eid);
+ try {
+ const r = await api<{ status?: string }>(
+ `/internal/api/evaluation/experiments/${eid}/run`,
+ { method: "POST" },
+ );
+ setMsg(
+ r.ok
+ ? "已提交 Celery 执行:回放 + GLM 评审写入各 run(需 worker 与 LLM 就绪)"
+ : (r.error ?? "提交失败"),
+ );
+ if (r.ok) void refreshAdminData();
+ } finally {
+ setEnqueueingExpId(null);
}
- const baselineId = versions[0]!.id;
- const candidateId = versions[1]!.id;
- const r = await api<{ id: string }>("/internal/api/evaluation/experiments", {
- method: "POST",
- body: JSON.stringify({
- name: `exp-${Date.now()}`,
- regression_set_id: selSet || sets[0]!.id,
- baseline_version_id: baselineId,
- candidate_version_id: candidateId,
- composite_weights_json: { conversation: 0.5, memoir: 0.5 },
- }),
- });
- if (r.ok) {
- setMsg(`实验已创建 ${(r.data as { id: string }).id}`);
- await refreshExperiments();
- } else setMsg(r.error ?? "fail");
- }
-
- async function runExp(eid: string) {
- const r = await api(
- `/internal/api/evaluation/experiments/${eid}/run`,
- { method: "POST", body: "{}" },
- );
- setMsg(r.ok ? `已入队 Celery: ${eid}` : r.error ?? "fail");
- }
-
- function subscribeStream(eid: string) {
- setStreamLog([]);
- if (!apiKey) {
- setMsg("请配置 VITE_EVAL_API_KEY 以使用 SSE(query key)");
- return;
- }
- const url = `${apiBase}/internal/api/evaluation/experiments/${eid}/stream?key=${encodeURIComponent(apiKey)}`;
- const es = new EventSource(url);
- es.onmessage = (ev) => {
- setStreamLog((prev) => [...prev.slice(-20), ev.data]);
- try {
- const p = JSON.parse(ev.data) as { status?: string };
- if (p.status === "completed" || p.status === "failed") {
- es.close();
- void loadExp(eid);
- }
- } catch {
- /* ignore */
- }
- };
- es.onerror = () => {
- setMsg("SSE 连接错误(检查内网 API / Key)");
- es.close();
- };
}
return (
-
-
内部回归评测台
-
- API: {apiBase} · 配置{" "}
- VITE_EVAL_API_BASE / VITE_EVAL_API_KEY
-
- {msg ?
{msg}
: null}
+
+
+ 回归评测台
+
+ {apiBaseHint}
+ {evalReachable === "ok" ? (
+ · /ping OK
+ ) : evalReachable === "bad" ? (
+
+ · 连不上 internal /ping,请起{" "}
+ uvicorn app.internal_main:internal_app --port 8001 或{" "}
+ internal-eval.sh
+
+ ) : null}
+
+ · 网页是 5174;8001 仅为 API
+
+
+
+
+
+
+
-
);
diff --git a/app-eval-web/src/index.css b/app-eval-web/src/index.css
index 387ea1f..e2b573b 100644
--- a/app-eval-web/src/index.css
+++ b/app-eval-web/src/index.css
@@ -1,8 +1,8 @@
:root {
font-family: system-ui, sans-serif;
line-height: 1.5;
- color: #1a1a1a;
- background: #f5f5f7;
+ color: #e6edf3;
+ background: #0f1419;
}
* {
box-sizing: border-box;
diff --git a/app-eval-web/vite.config.ts b/app-eval-web/vite.config.ts
index a930ad5..bbb4607 100644
--- a/app-eval-web/vite.config.ts
+++ b/app-eval-web/vite.config.ts
@@ -1,9 +1,19 @@
import react from "@vitejs/plugin-react";
import { defineConfig } from "vite";
+/**
+ * 开发时可将 VITE_EVAL_API_BASE 留空,前端请求 /internal/... 由 Vite 转发到 8001,
+ * 避免连错端口、CORS 或浏览器策略导致看似 404。
+ */
export default defineConfig({
plugins: [react()],
server: {
port: 5174,
+ proxy: {
+ "/internal": {
+ target: "http://127.0.0.1:8001",
+ changeOrigin: true,
+ },
+ },
},
});