feat(evaluation): session catalog, user export import, and eval web UI

- Extend evaluation API: schemas, router, repo, admin and execution services
- Improve user export markdown importer; add fixtures and importer tests
- Session catalog repo/service updates; internal app wiring and docs
- Add internal-eval.sh helper; refresh app-eval-web (App, styles, Vite)
2026-04-06 13:45:04 +08:00
parent b75edacb5f
commit ca8bcc8489
17 changed files with 2062 additions and 296 deletions
--- a/api/app/features/evaluation/admin_service.py
+++ b/api/app/features/evaluation/admin_service.py
@@ -26,16 +26,25 @@ from app.features.evaluation.models import (
    EvalRunTurn,
    EvalVersion,
 )
+from app.features.evaluation.presenters import run_out
 from app.features.evaluation.schemas import (
    CaseCreate,
    ExperimentCreate,
    ImportJsonCaseBody,
    ImportMarkdownBody,
    RegressionSetCreate,
+    SessionEvalRunItem,
+    SessionEvalRunsOut,
    SnapshotFromConversationBody,
    VersionCreate,
 )
 from app.features.evaluation.session_catalog_service import SessionCatalogService
+from app.features.evaluation.user_export_fixtures import (
+    list_user_export_fixture_names as list_user_export_md_filenames,
+)
+from app.features.evaluation.user_export_fixtures import (
+    read_user_export_fixture,
+)
 from app.tasks.evaluation_tasks import run_eval_experiment_task


@@ -188,6 +197,23 @@ class EvaluationAdminService:
    async def list_experiments(self, *, limit: int) -> list[EvalExperiment]:
        return await eval_repo.list_experiments(self._db, limit=limit)

+    async def list_session_evaluation_runs(
+        self, conversation_id: str
+    ) -> SessionEvalRunsOut:
+        """Collect the evaluation runs whose case was sourced from a conversation.
+
+        The repo yields (run, case, experiment) rows; each run is serialized
+        with ``run_out`` together with its turns and labelled with the name of
+        its experiment.
+        """
+        triples = await eval_repo.list_runs_for_source_conversation(
+            self._db, source_conversation_id=conversation_id
+        )
+        # Turns are loaded one run at a time, in the order the repo returned the rows.
+        run_items = [
+            SessionEvalRunItem(
+                run=run_out(
+                    eval_run,
+                    await eval_repo.list_turns(self._db, eval_run.id),
+                ),
+                experiment_name=experiment.name,
+            )
+            for eval_run, _case, experiment in triples
+        ]
+        return SessionEvalRunsOut(conversation_id=conversation_id, items=run_items)
+
    async def create_experiment(self, body: ExperimentCreate) -> EvalExperiment:
        rs = await eval_repo.get_regression_set(self._db, body.regression_set_id)
        if not rs:
@@ -232,7 +258,6 @@ class EvaluationAdminService:
    async def experiment_stream_snapshot(
        self, experiment_id: str
    ) -> dict[str, Any] | None:
-        from app.features.evaluation.presenters import run_out
        from app.features.evaluation.schemas import GateVerdictOut

        exp = await eval_repo.get_experiment(self._db, experiment_id)
@@ -250,3 +275,10 @@ class EvaluationAdminService:
            "runs": run_payload,
            "gate": GateVerdictOut.model_validate(gv).model_dump() if gv else None,
        }
+
+    def list_user_export_fixture_names(self) -> list[str]:
+        """Return the user-export fixture file names, as reported by the fixtures module."""
+        return list_user_export_md_filenames()
+
+    def load_user_export_fixture_turns(self, filename: str) -> list[tuple[str, str]]:
+        """Return the parsed turns of one fixture.
+
+        ``read_user_export_fixture`` returns a pair; only its first element
+        (the turns) is passed on, the second is discarded.
+        """
+        return read_user_export_fixture(filename)[0]
--- a/api/app/features/evaluation/execution_service.py
+++ b/api/app/features/evaluation/execution_service.py
@@ -23,6 +23,17 @@ from app.features.evaluation.models import EvalCase, EvalRun, EvalVersion

 logger = get_logger(__name__)

+# Default character cap applied by _clip_md_for_judge.
+_MAX_JUDGE_MARKDOWN_CHARS = 20_000
+# Upper bounds on how many chapters / stories are sliced off for judging per run.
+_MAX_EVAL_CHAPTERS = 30
+_MAX_EVAL_STORIES = 40
+
+
+def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) -> str:
+    """Strip ``text`` and cap it at ``max_chars`` characters.
+
+    A falsy ``text`` is treated as the empty string. When the stripped text is
+    longer than ``max_chars`` it is cut there and a truncation notice is
+    appended after a blank line; otherwise the stripped text is returned as is.
+    """
+    stripped = (text or "").strip()
+    if len(stripped) > max_chars:
+        return f"{stripped[:max_chars]}\n\n…（已截断供评审）"
+    return stripped
+

 def _composite(
    conv: float | None, mem: float | None, weights: dict[str, Any] | None
@@ -149,7 +160,66 @@ async def execute_eval_run(

    memoir_md = simple_memoir_from_transcript(utterances, replies)
    mem_out = await judge.judge_memoir(memoir_markdown=memoir_md)
-    mem_total = mem_out.total_score if mem_out else None
+
+    chapter_entries: list[dict[str, Any]] = []
+    story_entries: list[dict[str, Any]] = []
+    uid = (case.source_user_id or "").strip()
+    if uid:
+        from app.features.memoir.repo import get_chapters_for_memoir_list
+        from app.features.story.repo import get_stories_for_user
+
+        try:
+            chapters = await get_chapters_for_memoir_list(
+                uid, db, active_only=True, is_new_only=None
+            )
+            for ch in chapters[:_MAX_EVAL_CHAPTERS]:
+                body = (ch.canonical_markdown or "").strip()
+                if not body:
+                    continue
+                md = f"# 章节：{ch.title}\n\n{_clip_md_for_judge(body)}"
+                cj = await judge.judge_memoir(memoir_markdown=md)
+                chapter_entries.append(
+                    {
+                        "id": ch.id,
+                        "title": ch.title,
+                        "order_index": ch.order_index,
+                        "judge": cj.model_dump() if cj else None,
+                    }
+                )
+        except Exception as e:
+            logger.warning("eval chapter judges skipped: {}", e)
+
+        try:
+            stories = await get_stories_for_user(db, uid, status="active")
+            for st in stories[:_MAX_EVAL_STORIES]:
+                body = (st.canonical_markdown or "").strip()
+                if not body:
+                    continue
+                md = f"# 故事：{st.title}\n\n{_clip_md_for_judge(body)}"
+                sj = await judge.judge_memoir(memoir_markdown=md)
+                story_entries.append(
+                    {
+                        "id": st.id,
+                        "title": st.title,
+                        "stage": st.stage,
+                        "judge": sj.model_dump() if sj else None,
+                    }
+                )
+        except Exception as e:
+            logger.warning("eval story judges skipped: {}", e)
+
+    mem_parts: list[float] = []
+    if mem_out is not None:
+        mem_parts.append(float(mem_out.total_score))
+    for row in chapter_entries:
+        j = row.get("judge")
+        if isinstance(j, dict) and j.get("total_score") is not None:
+            mem_parts.append(float(j["total_score"]))
+    for row in story_entries:
+        j = row.get("judge")
+        if isinstance(j, dict) and j.get("total_score") is not None:
+            mem_parts.append(float(j["total_score"]))
+    mem_total = sum(mem_parts) / len(mem_parts) if mem_parts else None

    exp = await eval_repo.get_experiment(db, run.experiment_id)
    weights = exp.composite_weights_json if exp else None
@@ -158,6 +228,8 @@ async def execute_eval_run(
    bundle: dict[str, Any] = {
        "conversation_judge": conv_out.model_dump() if conv_out else None,
        "memoir_judge": mem_out.model_dump() if mem_out else None,
+        "chapters": chapter_entries,
+        "stories": story_entries,
    }
    await eval_repo.update_run(
        db,
--- a/api/app/features/evaluation/importers/user_export_markdown.py
+++ b/api/app/features/evaluation/importers/user_export_markdown.py
@@ -17,3 +17,30 @@ def extract_user_utterances_from_export_md(text: str) -> list[str]:
        if chunk and chunk != "（空）":
            out.append(chunk)
    return out
+
+
+def extract_dialogue_turns_from_export_md(text: str) -> list[tuple[str, str]]:
+    """
+    Extract (user, AI) pairs, one per "轮次" (turn) heading, from a Markdown
+    export produced by extract_sql_to_user_md, for side-by-side review in the
+    evaluation console.
+
+    Turns without a user section, or whose user text is empty or the
+    placeholder "（空）", are skipped. A turn with no AI text yields an empty
+    string for the AI side.
+    """
+    # Anchor the heading on line starts (MULTILINE) instead of a literal leading
+    # "\n", so a turn heading on the very first line of ``text`` is not lost.
+    chunks = re.split(r"^####\s*轮次\s*\d+[^\n]*", text, flags=re.MULTILINE)
+    out: list[tuple[str, str]] = []
+    # chunks[0] is whatever precedes the first turn heading.
+    for chunk in chunks[1:]:
+        user_m = re.search(
+            r"\*\*用户:\*\*\s*\n+(.+?)(?=\n\*\*AI:\*\*)",
+            chunk,
+            flags=re.DOTALL | re.IGNORECASE,
+        )
+        if not user_m:
+            continue
+        u = (user_m.group(1) or "").strip()
+        if not u or u == "（空）":
+            continue
+        # The AI text runs until the next level-3/4 heading or the end of the chunk.
+        ai_m = re.search(
+            r"\*\*AI:\*\*\s*\n+(.+?)(?=\n####\s|\n###\s+[^#]|\Z)",
+            chunk,
+            flags=re.DOTALL | re.IGNORECASE,
+        )
+        a = ((ai_m.group(1) if ai_m else "") or "").strip()
+        out.append((u, a))
+    return out
--- a/api/app/features/evaluation/presenters.py
+++ b/api/app/features/evaluation/presenters.py
@@ -21,5 +21,6 @@ def run_out(row, turns: list) -> EvalRunOut:
        conversation_score_total=row.conversation_score_total,
        memoir_score_total=row.memoir_score_total,
        composite_score=row.composite_score,
+        judge_bundle_json=row.judge_bundle_json,
        turns=[RunTurnOut.model_validate(t) for t in turns],
    )
--- a/api/app/features/evaluation/repo.py
+++ b/api/app/features/evaluation/repo.py
@@ -204,6 +204,27 @@ async def list_runs_for_experiment(
    return list(res.scalars().all())


+async def list_runs_for_source_conversation(
+    db: AsyncSession,
+    *,
+    source_conversation_id: str,
+    limit: int = 80,
+) -> list[tuple[EvalRun, EvalCase, EvalExperiment]]:
+    """Return up to ``limit`` (run, case, experiment) rows for one source conversation.
+
+    Runs are joined to their case and experiment and filtered on the case's
+    ``source_conversation_id``. Rows are ordered by ``completed_at`` then
+    ``started_at``, both descending with NULLs last.
+    """
+    newest_first = (
+        EvalRun.completed_at.desc().nulls_last(),
+        EvalRun.started_at.desc().nulls_last(),
+    )
+    query = (
+        select(EvalRun, EvalCase, EvalExperiment)
+        .join(EvalCase, EvalRun.case_id == EvalCase.id)
+        .join(EvalExperiment, EvalRun.experiment_id == EvalExperiment.id)
+        .where(EvalCase.source_conversation_id == source_conversation_id)
+        .order_by(*newest_first)
+        .limit(limit)
+    )
+    result = await db.execute(query)
+    return list(result.all())
+
+
 async def update_run(
    db: AsyncSession,
    run: EvalRun,
--- a/api/app/features/evaluation/router.py
+++ b/api/app/features/evaluation/router.py
@@ -28,10 +28,15 @@ from app.features.evaluation.schemas import (
    ImportMarkdownBody,
    RegressionSetCreate,
    RegressionSetOut,
+    SessionDialogueOut,
+    SessionEvalRunsOut,
    SessionListItem,
    SessionListResponse,
    SessionTranscriptOut,
    SnapshotFromConversationBody,
+    UserExportFixtureDetailOut,
+    UserExportFixtureListOut,
+    UserExportFixtureTurnOut,
    VersionCreate,
    VersionOut,
 )
@@ -40,6 +45,12 @@ from app.features.evaluation.session_catalog_service import SessionCatalogServic
 router = APIRouter(tags=["internal-evaluation"])


+@router.get("/ping", include_in_schema=False)
+async def eval_api_ping() -> dict[str, str | bool]:
+    """Unauthenticated probe: confirms this process is internal_main and the router is mounted."""
+    payload: dict[str, str | bool] = {"ok": True, "service": "life-echo-internal-eval"}
+    return payload
+
+
 def _eval_http_exc(
    e: EvaluationNotFoundError | EvaluationBadRequestError,
 ) -> HTTPException:
@@ -123,17 +134,23 @@ async def list_sessions(
    limit: int = Query(50, ge=1, le=200),
    user_id: str | None = Query(None),
    q: str | None = Query(None),
+    status: str | None = Query(
+        None,
+        description="按会话 status 过滤，如 active",
+    ),
 ):
    catalog = SessionCatalogService(db)
    rows, total = await catalog.list_sessions(
-        offset=offset, limit=limit, user_id=user_id, q=q
+        offset=offset, limit=limit, user_id=user_id, q=q, status=status
    )
    return SessionListResponse(
        items=[
            SessionListItem(
                id=r.id,
                user_id=r.user_id,
+                user_phone=r.user_phone,
                started_at=r.started_at,
+                last_message_at=r.last_message_at,
                conversation_stage=r.conversation_stage,
                current_topic=r.current_topic,
                status=r.status,
@@ -144,6 +161,22 @@ async def list_sessions(
    )


+# Returns the dialogue of one conversation; answers 404 when the catalog
+# service has nothing to return for the id.
+@router.get(
+    "/sessions/{conversation_id}/dialogue",
+    response_model=SessionDialogueOut,
+)
+async def get_session_dialogue(
+    conversation_id: str,
+    _auth: InternalEvalAuth,
+    db: Annotated[AsyncSession, Depends(get_async_db)],
+):
+    dialogue = await SessionCatalogService(db).get_session_dialogue(conversation_id)
+    if dialogue:
+        return dialogue
+    raise HTTPException(status_code=404, detail="conversation not found")
+
+
@router.get(
    "/sessions/{conversation_id}/transcript", response_model=SessionTranscriptOut
 )
@@ -164,6 +197,52 @@ async def get_session_transcript(
    )


+# Lists the evaluation runs linked to a conversation; all work is delegated
+# to EvaluationAdminService.
+@router.get(
+    "/sessions/{conversation_id}/evaluation-runs",
+    response_model=SessionEvalRunsOut,
+)
+async def list_session_evaluation_runs(
+    conversation_id: str,
+    _auth: InternalEvalAuth,
+    svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)],
+):
+    runs_out = await svc.list_session_evaluation_runs(conversation_id)
+    return runs_out
+
+
+# Lists the file names of the available user-export fixtures.
+@router.get(
+    "/fixtures/user-exports",
+    response_model=UserExportFixtureListOut,
+)
+async def list_user_export_fixtures(
+    _auth: InternalEvalAuth,
+    svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)],
+):
+    fixture_names = svc.list_user_export_fixture_names()
+    return UserExportFixtureListOut(items=fixture_names)
+
+
+# Returns the parsed (user, ai) turns of one fixture file.
+# ValueError from the service maps to 400, FileNotFoundError to 404.
+@router.get(
+    "/fixtures/user-exports/{filename}",
+    response_model=UserExportFixtureDetailOut,
+)
+async def get_user_export_fixture(
+    filename: str,
+    _auth: InternalEvalAuth,
+    svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)],
+):
+    try:
+        parsed_turns = svc.load_user_export_fixture_turns(filename)
+    except ValueError:
+        # "from None" hides the internal exception chain from the HTTP error.
+        raise HTTPException(
+            status_code=400, detail="invalid fixture filename"
+        ) from None
+    except FileNotFoundError:
+        raise HTTPException(status_code=404, detail="fixture not found") from None
+    turn_models = [
+        UserExportFixtureTurnOut(user=user_text, ai=ai_text)
+        for user_text, ai_text in parsed_turns
+    ]
+    return UserExportFixtureDetailOut(filename=filename, turns=turn_models)
+
+
@router.post("/regression-sets/{set_id}/import-markdown", response_model=CaseOut)
 async def import_markdown_case(
    set_id: str,
--- a/api/app/features/evaluation/schemas.py
+++ b/api/app/features/evaluation/schemas.py
@@ -88,10 +88,25 @@ class ExperimentOut(BaseModel):
    completed_at: datetime | None


+# One message inside a SessionDialogueOut: who spoke (role), what was said
+# (content) and, when known, when it was created.
+class SessionDialogueMessageOut(BaseModel):
+    # from_attributes lets the model be validated from attribute-bearing objects.
+    model_config = ConfigDict(from_attributes=True)
+
+    role: str
+    content: str
+    created_at: datetime | None = None
+
+
+# Response body of the session dialogue endpoint: the conversation id plus its messages.
+class SessionDialogueOut(BaseModel):
+    conversation_id: str
+    messages: list[SessionDialogueMessageOut]
+
+
 class SessionListItem(BaseModel):
    id: str
    user_id: str
+    user_phone: str | None = Field(default=None, description="users.phone，列表展示用")
    started_at: datetime | None
+    last_message_at: datetime | None = None
    conversation_stage: str | None
    current_topic: str | None
    status: str | None
@@ -109,6 +124,20 @@ class SessionTranscriptOut(BaseModel):
    user_utterances_from_messages: list[str]


+# One dialogue turn parsed from a user-export fixture: the user text and the AI text.
+class UserExportFixtureTurnOut(BaseModel):
+    user: str
+    ai: str
+
+
+# Response body listing the available user-export fixture file names.
+class UserExportFixtureListOut(BaseModel):
+    items: list[str]
+
+
+# Response body for a single fixture: the requested file name and its parsed turns.
+class UserExportFixtureDetailOut(BaseModel):
+    filename: str
+    turns: list[UserExportFixtureTurnOut]
+
+
 class SnapshotFromConversationBody(BaseModel):
    title: str | None = None
    use_messages: bool = False
@@ -157,9 +186,20 @@ class EvalRunOut(BaseModel):
    conversation_score_total: float | None
    memoir_score_total: float | None
    composite_score: float | None
+    judge_bundle_json: dict[str, Any] | None = None
    turns: list[RunTurnOut] = []


+# One evaluation run of a session, labelled with the name of the experiment it belongs to.
+class SessionEvalRunItem(BaseModel):
+    experiment_name: str
+    run: EvalRunOut
+
+
+# Response body of the per-session evaluation-runs endpoint.
+class SessionEvalRunsOut(BaseModel):
+    conversation_id: str
+    items: list[SessionEvalRunItem]
+
+
 class GateVerdictOut(BaseModel):
    model_config = ConfigDict(from_attributes=True)

--- a/api/app/features/evaluation/session_catalog_repo.py
+++ b/api/app/features/evaluation/session_catalog_repo.py
@@ -4,6 +4,7 @@ from __future__ import annotations

 from sqlalchemy import func, select
 from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.orm import joinedload

 from app.features.conversation.models import Conversation, ConversationMessage, Segment

@@ -12,12 +13,14 @@ class SessionCatalogRepo:
    def __init__(self, db: AsyncSession) -> None:
        self._db = db

-    async def count_conversations(self) -> int:
+    async def count_conversations(self, *, status: str | None = None) -> int:
        q = (
            select(func.count())
            .select_from(Conversation)
            .where(Conversation.deleted_at.is_(None))
        )
+        if status:
+            q = q.where(Conversation.status == status)
        r = await self._db.execute(q)
        return int(r.scalar() or 0)

@@ -28,10 +31,19 @@ class SessionCatalogRepo:
        limit: int = 50,
        user_id: str | None = None,
        q_text: str | None = None,
+        status: str | None = None,
    ) -> list[Conversation]:
        stmt = select(Conversation).where(Conversation.deleted_at.is_(None))
        if user_id:
            stmt = stmt.where(Conversation.user_id == user_id)
+        if status:
+            stmt = stmt.where(Conversation.status == status)
+        if status == "active":
+            stmt = stmt.order_by(
+                Conversation.last_message_at.desc().nullslast(),
+                Conversation.started_at.desc().nullslast(),
+            )
+        else:
            stmt = stmt.order_by(Conversation.started_at.desc().nullslast())
        stmt = stmt.offset(offset).limit(limit)
        # q_text: 简单按 topic 搜索（后续可扩展全文）
@@ -41,6 +53,7 @@ class SessionCatalogRepo:
                (Conversation.current_topic.isnot(None))
                & (Conversation.current_topic.ilike(like))
            )
+        stmt = stmt.options(joinedload(Conversation.user))
        res = await self._db.execute(stmt)
        return list(res.scalars().unique().all())

--- a/api/app/features/evaluation/session_catalog_service.py
+++ b/api/app/features/evaluation/session_catalog_service.py
@@ -6,6 +6,10 @@ from dataclasses import dataclass

 from sqlalchemy.ext.asyncio import AsyncSession

+from app.features.evaluation.schemas import (
+    SessionDialogueMessageOut,
+    SessionDialogueOut,
+)
 from app.features.evaluation.session_catalog_repo import SessionCatalogRepo


@@ -13,7 +17,9 @@ from app.features.evaluation.session_catalog_repo import SessionCatalogRepo
 class SessionSummary:
    id: str
    user_id: str
+    user_phone: str | None
    started_at: object | None
+    last_message_at: object | None
    conversation_stage: str | None
    current_topic: str | None
    status: str | None
@@ -38,16 +44,23 @@ class SessionCatalogService:
        limit: int = 50,
        user_id: str | None = None,
        q: str | None = None,
+        status: str | None = None,
    ) -> tuple[list[SessionSummary], int]:
-        total = await self._repo.count_conversations()
+        total = await self._repo.count_conversations(status=status)
        rows = await self._repo.list_conversations(
-            offset=offset, limit=limit, user_id=user_id, q_text=q
+            offset=offset,
+            limit=limit,
+            user_id=user_id,
+            q_text=q,
+            status=status,
        )
        out = [
            SessionSummary(
                id=c.id,
                user_id=c.user_id,
+                user_phone=c.user.phone if c.user is not None else None,
                started_at=c.started_at,
+                last_message_at=c.last_message_at,
                conversation_stage=c.conversation_stage,
                current_topic=c.current_topic,
                status=c.status,
@@ -56,6 +69,25 @@ class SessionCatalogService:
        ]
        return out, total

+    async def get_session_dialogue(
+        self, conversation_id: str
+    ) -> SessionDialogueOut | None:
+        """Return the messages of a conversation as a ``SessionDialogueOut``.
+
+        Returns ``None`` when the conversation does not exist or carries a
+        ``deleted_at`` value.
+        """
+        conversation = await self._repo.get_conversation(conversation_id)
+        if not conversation or conversation.deleted_at:
+            return None
+        stored = await self._repo.list_messages_for_conversation(conversation_id)
+        dialogue = [
+            SessionDialogueMessageOut(
+                role=msg.role,
+                content=msg.content,
+                created_at=msg.created_at,
+            )
+            for msg in stored
+        ]
+        return SessionDialogueOut(conversation_id=conversation_id, messages=dialogue)
+
    async def get_transcript(self, conversation_id: str) -> SessionTranscript | None:
        c = await self._repo.get_conversation(conversation_id)
        if not c or c.deleted_at:
--- a/api/app/features/evaluation/user_export_fixtures.py
+++ b/api/app/features/evaluation/user_export_fixtures.py
@@ -0,0 +1,36 @@
+"""只读加载 api/tests/user_exports/*.md，供内部评测台对照（非生产数据路径）。"""
+
+from __future__ import annotations
+
+import re
+from pathlib import Path
+
+from app.features.evaluation.importers.user_export_markdown import (
+    extract_dialogue_turns_from_export_md,
+)
+
+_SAFE_MD = re.compile(r"^[a-zA-Z0-9][a-zA-Z0-9_.-]*\.md$")
+
+
+def user_exports_dir() -> Path:
+    """Return the ``tests/user_exports`` directory under the api/ root."""
+    # This module lives in api/app/features/evaluation/, so parents[3] is api/.
+    api_root = Path(__file__).resolve().parents[3]
+    return api_root / "tests" / "user_exports"
+
+
+def list_user_export_fixture_names() -> list[str]:
+    """Return the sorted names of the loadable ``*.md`` fixtures.
+
+    Returns an empty list when the fixtures directory does not exist.
+    """
+    root = user_exports_dir()
+    if not root.is_dir():
+        return []
+    # Only advertise regular files whose name read_user_export_fixture() accepts;
+    # otherwise the list offers entries that can only fail with
+    # "invalid fixture filename" (or a directory that happens to end in .md).
+    return sorted(
+        p.name
+        for p in root.glob("*.md")
+        if p.is_file() and _SAFE_MD.match(p.name)
+    )
+
+
+def read_user_export_fixture(filename: str) -> tuple[list[tuple[str, str]], str]:
+    """Load one fixture and return ``(turns, raw_markdown)``.
+
+    Raises:
+        ValueError: ``filename`` does not match the safe ``*.md`` name pattern.
+        FileNotFoundError: the resolved path is not a file located directly
+            inside the fixtures directory.
+    """
+    if _SAFE_MD.match(filename) is None:
+        raise ValueError("invalid fixture filename")
+    fixtures_root = user_exports_dir()
+    candidate = (fixtures_root / filename).resolve()
+    # After resolving, the file must still sit directly inside the fixtures directory.
+    inside_root = candidate.parent == fixtures_root.resolve()
+    if not (inside_root and candidate.is_file()):
+        raise FileNotFoundError(filename)
+    raw_markdown = candidate.read_text(encoding="utf-8")
+    return extract_dialogue_turns_from_export_md(raw_markdown), raw_markdown
--- a/api/app/internal_main.py
+++ b/api/app/internal_main.py
@@ -16,6 +16,7 @@ setup_logging()

 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import HTMLResponse
 from fastapi.staticfiles import StaticFiles

 from app.core.config import settings
@@ -53,6 +54,27 @@ internal_app.add_middleware(
 register_exception_handlers(internal_app)


+@internal_app.get("/", include_in_schema=False, response_class=HTMLResponse)
+async def internal_eval_landing():
+    """Landing page for a browser opening the root path (:8001).
+
+    Explains that the UI is served by Vite (default 5174) and that this
+    process is the API only. The docs link is shown only when
+    ``settings.internal_eval_enable_docs`` is set; otherwise a hint on how to
+    enable it is rendered.
+    """
+    docs_hint = (
+        '<p><a href="/docs">OpenAPI 文档 /docs</a></p>'
+        if settings.internal_eval_enable_docs
+        else "<p>（未开启文档；设置 INTERNAL_EVAL_ENABLE_DOCS=1 后可访问 /docs）</p>"
+    )
+    return f"""<!DOCTYPE html>
+<html lang="zh-CN"><head><meta charset="utf-8"/><title>内部评测 API</title></head>
+<body style="font-family:system-ui,sans-serif;max-width:44rem;margin:2rem auto;line-height:1.5">
+<h1>Life Echo · 内部回归评测 API</h1>
+<p>这里是 <strong>HTTP API</strong>（端口由启动命令决定），<strong>没有内置网页</strong>。
+浏览「回归评测台」请在仓库执行 <code>./internal-eval.sh</code> 或 <code>cd app-eval-web && npm run dev</code>，
+在终端里打开 Vite 给出的地址（一般为 <strong>http://127.0.0.1:5174/</strong>）。</p>
+<p>健康检查：<a href="/health">/health</a></p>
+{docs_hint}
+<p>会话与对比接口前缀：<code>/internal/api/evaluation/</code></p>
+</body></html>"""
+
+
@internal_app.on_event("startup")
 async def _startup():
    import asyncio
--- a/api/docs/internal-eval.md
+++ b/api/docs/internal-eval.md
@@ -4,6 +4,24 @@

 ## 启动

+一键脚本 `internal-eval.sh` 与 `development.sh` **不是重复各启一套主站**：
+
+| | `development.sh` | `internal-eval.sh` |
+|---|------------------|---------------------|
+| HTTP | 主站 `main:app`（默认 **8000**） | 仅评测 `internal_app`（默认 **8001**） |
+| Celery | 会起一个 worker | 默认也会起一个 worker（可与下面「瘦启动」二选一） |
+
+评测分析只需要 **8001 上的 internal API**；若你已经在跑 `development.sh`（DB/Redis/主站/已有 Celery），不必再起第二份基础设施和 worker：
+
+```bash
+cd api
+chmod +x internal-eval.sh
+# 确保 .env.development 或 .env 里有 INTERNAL_EVAL_API_KEY
+SKIP_INFRA=1 SKIP_INSTALL=1 SKIP_CELERY=1 ./internal-eval.sh   # 推荐：只多开 8001
+```
+
+全新机器、只跑评测栈时可直接 `./internal-eval.sh`（会起 docker、`uv sync`、迁移、8001 + Celery）。**默认会起 `app-eval-web`，并用 Vite `--open` 尝试打开浏览器**（`http://127.0.0.1:5174/`）。不要前端时设 `START_EVAL_WEB=0`；只要前端但不要弹窗时设 `OPEN_EVAL_WEB=0`。
+
 数据库与主服务共用；需配置环境变量后启动专用进程：

 ```bash
--- a/api/internal-eval.sh
+++ b/api/internal-eval.sh
@@ -0,0 +1,374 @@
+#!/usr/bin/env bash
+
+# 仅启动「内部回归评测」栈（app/internal_main.py），不启动主站 consumer API。
+#
+# 与 development.sh 的区别：
+#   - development.sh：main:app + Celery（通常 :8000），面向 App/主业务。
+#   - internal-eval.sh：internal_app + Celery（:8001），仅评测/回放/GLM 打分/门禁。
+#   二者共用数据库与 Redis；不会拉起第二份 main:app。
+#
+# 若本机已在跑 ./development.sh，只想多开评测 HTTP（推荐，避免第二套 worker/docker）：
+#   SKIP_INFRA=1 SKIP_INSTALL=1 SKIP_CELERY=1 ./internal-eval.sh
+#
+# 用法：cd api && ./internal-eval.sh
+# 可选环境变量：
+#   SKIP_INFRA=1          已起好 Postgres/Redis 时跳过 docker compose
+#   SKIP_INSTALL=1        跳过 uv sync
+#   SKIP_CELERY=1         仅起内部 API（别处已有 Celery worker 时）
+#   START_EVAL_WEB=0      不起评测前端（默认会起 app-eval-web，需已 npm install）
+#   OPEN_EVAL_WEB=0       起前端但不自动打开浏览器（默认 Vite --open）
+#   EVAL_WEB_PORT         打印提示用，默认 5174（与 app-eval-web/vite.config.ts 一致）
+#   INTERNAL_EVAL_PORT    默认 8001
+#   CELERY_POOL           默认 solo（与 development.sh 一致）
+
+set -euo pipefail
+
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m'
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${ROOT_DIR}/.." && pwd)"
+EVAL_WEB_DIR="${REPO_ROOT}/app-eval-web"
+
+VENV_DIR="${ROOT_DIR}/.venv"
+UVICORN_BIN="${VENV_DIR}/bin/uvicorn"
+CELERY_BIN="${VENV_DIR}/bin/celery"
+
+INTERNAL_EVAL_HOST="${INTERNAL_EVAL_HOST:-0.0.0.0}"
+INTERNAL_EVAL_PORT="${INTERNAL_EVAL_PORT:-8001}"
+CELERY_POOL="${CELERY_POOL:-solo}"
+SKIP_INSTALL="${SKIP_INSTALL:-0}"
+SKIP_INFRA="${SKIP_INFRA:-0}"
+SKIP_CELERY="${SKIP_CELERY:-0}"
+START_EVAL_WEB="${START_EVAL_WEB:-1}"
+OPEN_EVAL_WEB="${OPEN_EVAL_WEB:-1}"
+EVAL_WEB_PORT="${EVAL_WEB_PORT:-5174}"
+SHUTDOWN_TIMEOUT="${SHUTDOWN_TIMEOUT:-12}"
+
+API_PID=""
+CELERY_PID=""
+EVAL_WEB_PID=""
+CLEANED_UP=0
+INFRA_STARTED=0
+
+print_header() {
+  echo -e "\n${BLUE}========================================${NC}"
+  echo -e "${BLUE}$1${NC}"
+  echo -e "${BLUE}========================================${NC}"
+}
+
+print_ok() {
+  echo -e "${GREEN}✓ $1${NC}"
+}
+
+print_warn() {
+  echo -e "${YELLOW}⚠ $1${NC}"
+}
+
+print_err() {
+  echo -e "${RED}✗ $1${NC}"
+}
+
+is_pid_alive() {
+  local pid="$1"
+  [[ -n "${pid}" ]] && kill -0 "${pid}" 2>/dev/null
+}
+
+# Poll once per second until the process is gone.
+#   $1  pid to watch
+#   $2  maximum number of seconds to wait
+# Returns 0 once the pid is no longer alive, 1 if it is still alive after the timeout.
+wait_pid_exit() {
+  local pid="$1"
+  local timeout="$2"
+  local waited=0
+
+  while is_pid_alive "${pid}"; do
+    if (( waited >= timeout )); then
+      return 1
+    fi
+    sleep 1
+    waited=$((waited + 1))
+  done
+  return 0
+}
+
+# Send SIGTERM to every descendant of $1 (the pid itself is not signalled here).
+# Recurses depth-first, so grandchildren are signalled before their parent.
+kill_children_term() {
+  local pid="$1"
+  local children
+
+  # pgrep exits non-zero when there are no children; "|| true" keeps set -e quiet.
+  children="$(pgrep -P "${pid}" 2>/dev/null || true)"
+  if [[ -n "${children}" ]]; then
+    while IFS= read -r child_pid; do
+      [[ -z "${child_pid}" ]] && continue
+      kill_children_term "${child_pid}"
+      kill -TERM "${child_pid}" 2>/dev/null || true
+    done <<< "${children}"
+  fi
+}
+
+# Stop a process tree: SIGTERM first, SIGKILL if it does not exit in time.
+#   $1  display name used in log lines
+#   $2  pid of the process
+#   $3  seconds to wait after SIGTERM (default 10)
+stop_process_gracefully() {
+  local name="$1"
+  local pid="$2"
+  local timeout="${3:-10}"
+
+  if ! is_pid_alive "${pid}"; then
+    print_ok "${name} 已退出"
+    return 0
+  fi
+
+  print_warn "正在停止 ${name}（PID: ${pid}）..."
+  # Descendants are signalled before the process itself.
+  kill_children_term "${pid}"
+  kill -TERM "${pid}" 2>/dev/null || true
+
+  if wait_pid_exit "${pid}" "${timeout}"; then
+    print_ok "${name} 已停止"
+    return 0
+  fi
+
+  print_warn "${name} 在 ${timeout}s 内未退出，准备强制结束"
+  kill -KILL "${pid}" 2>/dev/null || true
+  # Allow up to 3 more seconds; the success line below is printed either way.
+  wait_pid_exit "${pid}" 3 || true
+  print_ok "${name} 已强制结束"
+}
+
+# Trap handler: stop everything this script started.
+# Order: eval-web (Vite), internal API, Celery, then the docker containers.
+cleanup() {
+  # The trap is registered for EXIT, INT and TERM, so this can be entered
+  # more than once; only the first call does any work.
+  if [[ "${CLEANED_UP}" == "1" ]]; then
+    return 0
+  fi
+  CLEANED_UP=1
+
+  print_header "正在关闭内部评测环境"
+
+  if is_pid_alive "${EVAL_WEB_PID}"; then
+    stop_process_gracefully "eval-web (Vite)" "${EVAL_WEB_PID}" "${SHUTDOWN_TIMEOUT}"
+  fi
+
+  if is_pid_alive "${API_PID}"; then
+    stop_process_gracefully "Internal Eval API" "${API_PID}" "${SHUTDOWN_TIMEOUT}"
+  fi
+
+  if is_pid_alive "${CELERY_PID}"; then
+    stop_process_gracefully "Celery" "${CELERY_PID}" "${SHUTDOWN_TIMEOUT}"
+  fi
+
+  # Containers are stopped only when start_infra ran in this invocation.
+  if [[ "${INFRA_STARTED}" == "1" ]]; then
+    print_warn "正在停止 PostgreSQL / Redis 容器..."
+    (
+      cd "${ROOT_DIR}" && docker compose -f docker-compose.dev.yml stop
+    ) >/dev/null 2>&1 || true
+    print_ok "PostgreSQL/Redis 容器已停止"
+  fi
+}
+
+require_cmd() {
+  local cmd="$1"
+  if ! command -v "${cmd}" >/dev/null 2>&1; then
+    print_err "未找到命令: ${cmd}"
+    exit 1
+  fi
+}
+
+start_infra() {
+  print_header "启动 PostgreSQL 和 Redis"
+  cd "${ROOT_DIR}"
+  docker compose -f docker-compose.dev.yml up -d
+  INFRA_STARTED=1
+  print_ok "基础设施已就绪"
+}
+
+wait_postgres_ready() {
+  local retries=30
+  local i=0
+  print_header "等待 PostgreSQL 就绪"
+  cd "${ROOT_DIR}"
+  while (( i < retries )); do
+    if docker compose -f docker-compose.dev.yml exec -T postgres \
+      pg_isready -U postgres >/dev/null 2>&1; then
+      print_ok "PostgreSQL 已就绪"
+      return 0
+    fi
+    sleep 1
+    i=$((i + 1))
+  done
+  print_warn "PostgreSQL 在 ${retries}s 内未就绪，迁移可能失败"
+  return 1
+}
+
+# Make sure the project virtualenv exists and, unless SKIP_INSTALL=1, sync dependencies.
+ensure_venv() {
+  print_header "检查 Python 虚拟环境"
+
+  # uv resolves the project from the working directory. With SKIP_INFRA=1 no
+  # earlier step has entered ROOT_DIR, so do it here like the other steps do.
+  cd "${ROOT_DIR}"
+
+  if [[ ! -d "${VENV_DIR}" ]]; then
+    print_warn ".venv 不存在，正在创建"
+    uv venv "${VENV_DIR}"
+  fi
+
+  if [[ "${SKIP_INSTALL}" != "1" ]]; then
+    print_header "安装 Python 依赖"
+    uv sync
+    print_ok "依赖安装完成"
+  else
+    print_warn "已跳过依赖安装 (SKIP_INSTALL=1)"
+  fi
+}
+
+ensure_dotenv_from_development() {
+  print_header "准备本地 .env"
+  if [[ -f "${ROOT_DIR}/.env.development" ]]; then
+    cp "${ROOT_DIR}/.env.development" "${ROOT_DIR}/.env"
+    print_ok "已从 .env.development 同步为 .env"
+    return 0
+  fi
+  print_warn "未找到 .env.development，将使用现有 .env（若存在）"
+}
+
+check_internal_eval_key() {
+  print_header "检查内部评测密钥"
+  if [[ -f "${ROOT_DIR}/.env" ]] && grep -qE '^INTERNAL_EVAL_API_KEY=.+' "${ROOT_DIR}/.env" 2>/dev/null; then
+    print_ok "已在 .env 中配置 INTERNAL_EVAL_API_KEY"
+    return 0
+  fi
+  if [[ -n "${INTERNAL_EVAL_API_KEY:-}" ]]; then
+    print_ok "已从环境变量传入 INTERNAL_EVAL_API_KEY"
+    return 0
+  fi
+  print_err "未配置 INTERNAL_EVAL_API_KEY：内部评测接口将返回 503。"
+  print_err "请在 api/.env.development（或 .env）中加入一行，例如："
+  print_err "  INTERNAL_EVAL_API_KEY=\"your-long-random-secret\""
+  exit 1
+}
+
+check_env_file() {
+  print_header "检查环境变量文件"
+  if [[ ! -f "${ROOT_DIR}/.env" ]]; then
+    print_warn "未找到 .env，应用可能因缺少配置启动失败"
+  else
+    print_ok "检测到 .env"
+  fi
+}
+
+# Run "alembic upgrade head"; a failure is reported but does not abort the script.
+run_migrations() {
+  print_header "执行数据库迁移"
+  cd "${ROOT_DIR}"
+
+  # Stay quiet on success, but keep alembic's stderr so a failure can be
+  # diagnosed instead of being reduced to the generic warning below.
+  local err_log
+  err_log="$(mktemp)"
+  if uv run alembic upgrade head 2>"${err_log}"; then
+    print_ok "Alembic 迁移已就绪"
+  else
+    print_warn "Alembic 迁移失败（可能数据库未启动或 DATABASE_URL 未配置），应用启动可能失败"
+    cat "${err_log}" >&2 || true
+  fi
+  rm -f "${err_log}"
+}
+
+# Start the app-eval-web Vite dev server in the background and record its pid
+# in EVAL_WEB_PID. Exits the script when the frontend directory, its
+# node_modules, npm, or the API key is missing.
+start_eval_web() {
+  print_header "启动 app-eval-web (Vite)"
+  if [[ ! -d "${EVAL_WEB_DIR}" ]]; then
+    print_err "未找到 ${EVAL_WEB_DIR}"
+    exit 1
+  fi
+  if [[ ! -d "${EVAL_WEB_DIR}/node_modules" ]]; then
+    print_err "请先执行: cd app-eval-web && npm install"
+    exit 1
+  fi
+  require_cmd "npm"
+
+  # Prefer the key from the environment; otherwise take the first matching
+  # line of .env, dropping a trailing CR and one pair of surrounding double quotes.
+  local api_key="${INTERNAL_EVAL_API_KEY:-}"
+  if [[ -z "${api_key}" ]] && [[ -f "${ROOT_DIR}/.env" ]]; then
+    api_key="$(grep -E '^INTERNAL_EVAL_API_KEY=' "${ROOT_DIR}/.env" | head -1 | cut -d= -f2- | tr -d '\r' | sed 's/^"//;s/"$//')"
+  fi
+  if [[ -z "${api_key}" ]]; then
+    print_err "无法解析 INTERNAL_EVAL_API_KEY，无法为 Vite 注入 VITE_EVAL_API_KEY"
+    exit 1
+  fi
+
+  local vite_extra=()
+  if [[ "${OPEN_EVAL_WEB}" == "1" ]]; then
+    vite_extra+=(--open)
+  fi
+
+  # VITE_EVAL_API_BASE is left unset on purpose: the frontend goes through the
+  # Vite proxy (app-eval-web/vite.config.ts), which avoids direct-connection,
+  # CORS and wrong-backend problems.
+  # To talk to a remote API directly: export VITE_EVAL_API_BASE=https://... and
+  # run "npm run dev" by hand.
+  (
+    cd "${EVAL_WEB_DIR}"
+    # The ${arr[@]+...} form keeps an empty array (OPEN_EVAL_WEB=0) from
+    # aborting with "unbound variable" under set -u on bash older than 4.4.
+    VITE_EVAL_API_KEY="${api_key}" \
+      npm run dev -- --host 127.0.0.1 --port "${EVAL_WEB_PORT}" ${vite_extra[@]+"${vite_extra[@]}"}
+  ) &
+  EVAL_WEB_PID=$!
+  print_ok "eval-web 已启动 (PID: ${EVAL_WEB_PID}) → http://127.0.0.1:${EVAL_WEB_PORT}/"
+}
+
+start_services() {
+  print_header "启动 Internal Eval API 与 Celery"
+  cd "${ROOT_DIR}"
+
+  if command -v lsof >/dev/null 2>&1; then
+    if lsof -nP -iTCP:"${INTERNAL_EVAL_PORT}" -sTCP:LISTEN >/dev/null 2>&1; then
+      print_err "端口 ${INTERNAL_EVAL_PORT} 已被占用，无法启动内部评测 Uvicorn。"
+      print_err "请先结束占用进程，或设置 INTERNAL_EVAL_PORT 为其他端口"
+      exit 1
+    fi
+  fi
+
+  # 与主开发脚本一致：评审/生产 LLM 等从 .env 读取；文档默认关闭，本地可 export INTERNAL_EVAL_ENABLE_DOCS=1
+  "${UVICORN_BIN}" app.internal_main:internal_app --reload \
+    --reload-exclude 'alembic/**' \
+    --reload-exclude 'alembic.ini' \
+    --host "${INTERNAL_EVAL_HOST}" --port "${INTERNAL_EVAL_PORT}" &
+  API_PID=$!
+  print_ok "Internal Eval API 已启动 (PID: ${API_PID})"
+
+  if [[ "${SKIP_CELERY}" != "1" ]]; then
+    "${CELERY_BIN}" -A app.tasks.celery_app worker --loglevel=info --pool="${CELERY_POOL}" &
+    CELERY_PID=$!
+    print_ok "Celery 已启动 (PID: ${CELERY_PID})"
+  else
+    print_warn "已跳过 Celery (SKIP_CELERY=1)；实验 run 接口需要 worker 才能执行"
+  fi
+
+  if [[ "${START_EVAL_WEB}" == "1" ]]; then
+    start_eval_web
+  fi
+
+  echo
+  echo -e "${GREEN}内部评测环境启动完成${NC}"
+  echo "【请用浏览器打开】评测 Web UI:  http://127.0.0.1:${EVAL_WEB_PORT}/ （/internal 会代理到 API :${INTERNAL_EVAL_PORT}）"
+  echo "内部评测 API: http://127.0.0.1:${INTERNAL_EVAL_PORT}/health"
+  echo "评测 REST 前缀: http://127.0.0.1:${INTERNAL_EVAL_PORT}/internal/api/evaluation"
+  if [[ "${INTERNAL_EVAL_ENABLE_DOCS:-}" == "1" ]] || grep -qE '^INTERNAL_EVAL_ENABLE_DOCS=true' "${ROOT_DIR}/.env" 2>/dev/null; then
+    echo "API 文档:     http://127.0.0.1:${INTERNAL_EVAL_PORT}/docs"
+  fi
+  echo "说明文档:     api/docs/internal-eval.md"
+  echo "按 Ctrl+C 停止所有进程"
+}
+
+# Entry point: optional docker infra, venv, .env, key check, migrations, then
+# the long-running services; blocks until the started processes exit.
+main() {
+  print_header "Life Echo 内部回归评测 — 一键启动"
+  echo -e "${BLUE}说明:${NC} 不启动主站 API（main:app / 默认 8000）；仅启动 internal_main（:${INTERNAL_EVAL_PORT}）。"
+  echo ""
+
+  require_cmd "uv"
+
+  # Registered before anything is started so a later failure still cleans up.
+  trap cleanup EXIT INT TERM
+
+  if [[ "${SKIP_INFRA}" != "1" ]]; then
+    require_cmd "docker"
+    start_infra
+    # A Postgres that is not ready only produces a warning; startup continues.
+    wait_postgres_ready || true
+  else
+    print_warn "已跳过 docker 基础设施 (SKIP_INFRA=1)"
+  fi
+
+  ensure_venv
+  ensure_dotenv_from_development
+  check_env_file
+  check_internal_eval_key
+  run_migrations
+  start_services
+
+  # Wait on every background process that was actually started.
+  local wait_pids=("${API_PID}")
+  if [[ "${SKIP_CELERY}" != "1" ]]; then
+    wait_pids+=("${CELERY_PID}")
+  fi
+  if [[ "${START_EVAL_WEB}" == "1" ]] && [[ -n "${EVAL_WEB_PID}" ]]; then
+    wait_pids+=("${EVAL_WEB_PID}")
+  fi
+  wait "${wait_pids[@]}"
+}
+
+main "$@"
--- a/api/tests/evaluation/test_importers.py
+++ b/api/tests/evaluation/test_importers.py
@@ -1,5 +1,10 @@
 from app.features.evaluation.importers.script_json import parse_script_json
+from pathlib import Path
+
+import pytest
+
 from app.features.evaluation.importers.user_export_markdown import (
+    extract_dialogue_turns_from_export_md,
    extract_user_utterances_from_export_md,
 )

@@ -27,3 +32,43 @@ hello
 hi
 """
    assert extract_user_utterances_from_export_md(md) == ["hello"]
+
+
+def test_extract_dialogue_turns_from_export_md() -> None:
+    md = """
+#### 轮次 1 — x
+
+**用户:**
+
+u1
+
+**AI:**
+
+a1
+
+#### 轮次 2 — y
+
+**用户:**
+
+u2
+
+**AI:**
+
+a2
+"""
+    turns = extract_dialogue_turns_from_export_md(md)
+    assert turns == [("u1", "a1"), ("u2", "a2")]
+
+
+def test_extract_dialogue_turns_from_repo_user_export() -> None:
+    p = (
+        Path(__file__).resolve().parents[1]
+        / "user_exports"
+        / "13701020203_e27fcd97-fefa-43b8-a7a3-3ecd49ebf5f0.md"
+    )
+    if not p.is_file():
+        pytest.skip("user export fixture not present")
+    text = p.read_text(encoding="utf-8")
+    turns = extract_dialogue_turns_from_export_md(text)
+    assert len(turns) >= 5
+    assert "你好" in turns[0][0]
--- a/app-eval-web/src/App.tsx
+++ b/app-eval-web/src/App.tsx
--- a/app-eval-web/src/index.css
+++ b/app-eval-web/src/index.css
@@ -1,8 +1,8 @@
 :root {
  font-family: system-ui, sans-serif;
  line-height: 1.5;
-  color: #1a1a1a;
-  background: #f5f5f7;
+  color: #e6edf3;
+  background: #0f1419;
 }
 * {
  box-sizing: border-box;
--- a/app-eval-web/vite.config.ts
+++ b/app-eval-web/vite.config.ts
@@ -1,9 +1,19 @@
 import react from "@vitejs/plugin-react";
 import { defineConfig } from "vite";

+/**
+ * In development VITE_EVAL_API_BASE may be left empty: frontend requests to
+ * /internal/... are forwarded by Vite to port 8001, which avoids hitting the
+ * wrong port, CORS problems, or browser policies that surface as apparent 404s.
+ */
 export default defineConfig({
  plugins: [react()],
  server: {
    port: 5174,
+    proxy: {
+      // NOTE(review): the target port is fixed at 8001 here, while
+      // api/internal-eval.sh lets INTERNAL_EVAL_PORT override the API port —
+      // confirm the two stay in sync.
+      "/internal": {
+        target: "http://127.0.0.1:8001",
+        changeOrigin: true,
+      },
+    },
  },
 });