feat/ 导出开发容器内的数据用于评估

2026-04-03 14:44:46 +08:00
parent 828a29748e
commit b75edacb5f
51 changed files with 5999 additions and 57 deletions
--- a/api/tests/evaluation/test_gating_service.py
+++ b/api/tests/evaluation/test_gating_service.py
@@ -0,0 +1,54 @@
+from unittest.mock import MagicMock
+
+from app.features.evaluation.gating_service import compute_gate
+
+
+def _case(cid: str, protected: bool = False):
+    """Build a mock evaluation case.
+
+    The mock carries ``id``, ``title`` (always ``None``) and ``is_protected``.
+    """
+    return MagicMock(id=cid, title=None, is_protected=protected)
+
+
+def _run(case_id: str, side: str, composite: float, status: str = "completed"):
+    """Build a mock run; ``composite`` is stored as ``composite_score``."""
+    mock_run = MagicMock()
+    mock_run.configure_mock(
+        case_id=case_id, side=side, status=status, composite_score=composite
+    )
+    return mock_run
+
+
+def test_gate_passes_when_mean_up_and_no_protected_regression() -> None:
+    """Gate passes when each candidate beats its baseline and nothing is protected."""
+    scores = {"1": (50, 60), "2": (40, 55)}  # case id -> (baseline, candidate)
+    runs = []
+    for case_id, (baseline, candidate) in scores.items():
+        runs.append(_run(case_id, "baseline", baseline))
+        runs.append(_run(case_id, "candidate", candidate))
+    cases = [_case(case_id) for case_id in scores]
+    gate = compute_gate(cases=cases, runs=runs, regression_threshold=2.0)
+    assert gate.passed
+    assert gate.mean_delta > 0
+
+
+def test_gate_fails_on_protected_regression() -> None:
+    """A protected case whose candidate scores lower must fail the gate."""
+    # Candidate scores 5 points below baseline; the threshold passed below is 2.0.
+    runs = [_run("1", "baseline", 80.0), _run("1", "candidate", 75.0)]
+    gate = compute_gate(
+        cases=[_case("1", protected=True)], runs=runs, regression_threshold=2.0
+    )
+    assert not gate.passed
+    assert len(gate.protected_regressions) == 1
+
+
+def test_gate_fails_when_mean_not_higher() -> None:
+    """An unprotected case whose candidate (69) trails baseline (70) fails the gate."""
+    baseline = _run("1", "baseline", 70.0)
+    candidate = _run("1", "candidate", 69.0)
+    gate = compute_gate(
+        cases=[_case("1")], runs=[baseline, candidate], regression_threshold=2.0
+    )
+    assert not gate.passed
--- a/api/tests/evaluation/test_importers.py
+++ b/api/tests/evaluation/test_importers.py
@@ -0,0 +1,29 @@
+from app.features.evaluation.importers.script_json import parse_script_json
+from app.features.evaluation.importers.user_export_markdown import (
+    extract_user_utterances_from_export_md,
+)
+
+
+def test_parse_script_json_list() -> None:
+    """A bare JSON array yields its items as utterances and empty metadata."""
+    utterances, metadata = parse_script_json('["a", "b"]')
+    assert (utterances, metadata) == (["a", "b"], {})
+
+
+def test_parse_script_json_object() -> None:
+    """The ``utterances`` key is extracted; the remaining keys become metadata."""
+    utterances, metadata = parse_script_json('{"utterances":["x"],"foo":1}')
+    assert (utterances, metadata) == (["x"], {"foo": 1})
+
+
+def test_extract_user_lines_from_export_md() -> None:  # checks user-utterance extraction
+    md = """
+**用户:**
+
+hello
+
+**AI:**
+
+hi
+"""
+    assert extract_user_utterances_from_export_md(md) == ["hello"]  # "hi" must be absent
--- a/api/tests/evaluation/test_internal_router_auth.py
+++ b/api/tests/evaluation/test_internal_router_auth.py
@@ -0,0 +1,65 @@
+"""内部路由在未配密钥时应 503。"""
+
+import pytest
+from httpx import ASGITransport, AsyncClient
+
+from app.features.evaluation.internal_auth import get_internal_eval_principal
+from app.features.evaluation.router import router
+
+
+@pytest.mark.asyncio
+async def test_internal_eval_list_sets_requires_config(monkeypatch: pytest.MonkeyPatch):
+    """Listing regression sets returns 503 when the internal eval key is empty."""
+    from fastapi import FastAPI
+
+    # Empty string simulates an unconfigured key.
+    key_path = "app.core.config.settings.internal_eval_api_key"
+    monkeypatch.setattr(key_path, "", raising=False)
+
+    api = FastAPI()
+    api.include_router(router, prefix="/internal/api/evaluation")
+    asgi = ASGITransport(app=api)
+    async with AsyncClient(transport=asgi, base_url="http://t") as http:
+        response = await http.get("/internal/api/evaluation/regression-sets")
+    assert response.status_code == 503
+
+
+@pytest.mark.asyncio
+async def test_internal_eval_with_override_lists_empty(monkeypatch: pytest.MonkeyPatch):
+    """With auth and DB stubbed, listing regression sets yields 200 and ``[]``."""
+    from unittest.mock import AsyncMock, MagicMock
+
+    from fastapi import FastAPI
+
+    from app.core.db import get_async_db
+    from app.features.evaluation.internal_auth import InternalEvalPrincipal
+
+    key_path = "app.core.config.settings.internal_eval_api_key"
+    monkeypatch.setattr(key_path, "secret", raising=False)
+
+    # Session whose execute() result yields no rows via scalars().unique().all().
+    empty_result = MagicMock()
+    empty_result.scalars.return_value.unique.return_value.all.return_value = []
+    fake_session = AsyncMock(execute=AsyncMock(return_value=empty_result))
+
+    async def _fake_principal():
+        return InternalEvalPrincipal()
+
+    async def _fake_db():
+        yield fake_session
+
+    api = FastAPI()
+    api.include_router(router, prefix="/internal/api/evaluation")
+    # Swap the real auth and DB dependencies for the fakes defined in this test.
+    api.dependency_overrides[get_internal_eval_principal] = _fake_principal
+    api.dependency_overrides[get_async_db] = _fake_db
+
+    asgi = ASGITransport(app=api)
+    async with AsyncClient(transport=asgi, base_url="http://t") as http:
+        response = await http.get(
+            "/internal/api/evaluation/regression-sets",
+            headers={"X-Internal-Eval-Key": "secret"},
+        )
+
+    assert response.status_code == 200
+    assert response.json() == []