feat(evaluation): session catalog, user export import, and eval web UI
- Extend evaluation API: schemas, router, repo, admin and execution services
- Improve user export markdown importer; add fixtures and importer tests
- Session catalog repo/service updates; internal app wiring and docs
- Add internal-eval.sh helper; refresh app-eval-web (App, styles, Vite)
This commit is contained in:
@@ -23,6 +23,17 @@ from app.features.evaluation.models import EvalCase, EvalRun, EvalVersion
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
_MAX_JUDGE_MARKDOWN_CHARS = 20_000
|
||||
_MAX_EVAL_CHAPTERS = 30
|
||||
_MAX_EVAL_STORIES = 40
|
||||
|
||||
|
||||
def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) -> str:
|
||||
s = (text or "").strip()
|
||||
if len(s) <= max_chars:
|
||||
return s
|
||||
return f"{s[:max_chars]}\n\n…(已截断供评审)"
|
||||
|
||||
|
||||
def _composite(
|
||||
conv: float | None, mem: float | None, weights: dict[str, Any] | None
|
||||
@@ -149,7 +160,66 @@ async def execute_eval_run(
|
||||
|
||||
memoir_md = simple_memoir_from_transcript(utterances, replies)
|
||||
mem_out = await judge.judge_memoir(memoir_markdown=memoir_md)
|
||||
mem_total = mem_out.total_score if mem_out else None
|
||||
|
||||
chapter_entries: list[dict[str, Any]] = []
|
||||
story_entries: list[dict[str, Any]] = []
|
||||
uid = (case.source_user_id or "").strip()
|
||||
if uid:
|
||||
from app.features.memoir.repo import get_chapters_for_memoir_list
|
||||
from app.features.story.repo import get_stories_for_user
|
||||
|
||||
try:
|
||||
chapters = await get_chapters_for_memoir_list(
|
||||
uid, db, active_only=True, is_new_only=None
|
||||
)
|
||||
for ch in chapters[:_MAX_EVAL_CHAPTERS]:
|
||||
body = (ch.canonical_markdown or "").strip()
|
||||
if not body:
|
||||
continue
|
||||
md = f"# 章节:{ch.title}\n\n{_clip_md_for_judge(body)}"
|
||||
cj = await judge.judge_memoir(memoir_markdown=md)
|
||||
chapter_entries.append(
|
||||
{
|
||||
"id": ch.id,
|
||||
"title": ch.title,
|
||||
"order_index": ch.order_index,
|
||||
"judge": cj.model_dump() if cj else None,
|
||||
}
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning("eval chapter judges skipped: {}", e)
|
||||
|
||||
try:
|
||||
stories = await get_stories_for_user(db, uid, status="active")
|
||||
for st in stories[:_MAX_EVAL_STORIES]:
|
||||
body = (st.canonical_markdown or "").strip()
|
||||
if not body:
|
||||
continue
|
||||
md = f"# 故事:{st.title}\n\n{_clip_md_for_judge(body)}"
|
||||
sj = await judge.judge_memoir(memoir_markdown=md)
|
||||
story_entries.append(
|
||||
{
|
||||
"id": st.id,
|
||||
"title": st.title,
|
||||
"stage": st.stage,
|
||||
"judge": sj.model_dump() if sj else None,
|
||||
}
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning("eval story judges skipped: {}", e)
|
||||
|
||||
mem_parts: list[float] = []
|
||||
if mem_out is not None:
|
||||
mem_parts.append(float(mem_out.total_score))
|
||||
for row in chapter_entries:
|
||||
j = row.get("judge")
|
||||
if isinstance(j, dict) and j.get("total_score") is not None:
|
||||
mem_parts.append(float(j["total_score"]))
|
||||
for row in story_entries:
|
||||
j = row.get("judge")
|
||||
if isinstance(j, dict) and j.get("total_score") is not None:
|
||||
mem_parts.append(float(j["total_score"]))
|
||||
mem_total = sum(mem_parts) / len(mem_parts) if mem_parts else None
|
||||
|
||||
exp = await eval_repo.get_experiment(db, run.experiment_id)
|
||||
weights = exp.composite_weights_json if exp else None
|
||||
@@ -158,6 +228,8 @@ async def execute_eval_run(
|
||||
bundle: dict[str, Any] = {
|
||||
"conversation_judge": conv_out.model_dump() if conv_out else None,
|
||||
"memoir_judge": mem_out.model_dump() if mem_out else None,
|
||||
"chapters": chapter_entries,
|
||||
"stories": story_entries,
|
||||
}
|
||||
await eval_repo.update_run(
|
||||
db,
|
||||
|
||||
Reference in New Issue
Block a user