feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整
- 回忆录细项上限收紧为合计 100 分,去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线;无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据(会话/用户聚合、截断)
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD;移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试
This commit is contained in:
@@ -10,6 +10,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.core.dependencies import get_eval_judge_langchain_llm
|
||||
from app.core.logging import get_logger
|
||||
from app.features.conversation import repo as conversation_repo
|
||||
from app.features.evaluation.errors import (
|
||||
EvaluationBadRequestError,
|
||||
EvaluationNotFoundError,
|
||||
@@ -27,6 +28,8 @@ logger = get_logger(__name__)
|
||||
_MAX_JUDGE_MARKDOWN_CHARS = 20_000
|
||||
_MAX_EVAL_CHAPTERS = 30
|
||||
_MAX_EVAL_STORIES = 40
|
||||
_MAX_EVIDENCE_CONVERSATIONS = 8
|
||||
_MAX_EVIDENCE_TRANSCRIPT_CHARS = 16_000
|
||||
|
||||
|
||||
def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) -> str:
|
||||
@@ -48,6 +51,41 @@ def _transcript_from_export_turns(turns: list[tuple[str, str]]) -> str:
|
||||
return "\n\n".join(parts)
|
||||
|
||||
|
||||
def _trim_evidence_text(text: str, max_chars: int = _MAX_EVIDENCE_TRANSCRIPT_CHARS) -> str:
    """Strip *text* and cap it at *max_chars* characters of evidence.

    Falsy input (``None`` or ``""``) yields ``""``. When the stripped text
    is longer than *max_chars*, only its first *max_chars* characters are
    kept and a truncation marker is appended, so a truncated result is
    longer than *max_chars* by the length of that marker.

    Args:
        text: Raw transcript text; may be empty or ``None``.
        max_chars: Maximum number of characters of *text* to keep.

    Returns:
        The stripped text, truncated and marked if it was too long.
    """
    cleaned = text.strip() if text else ""
    if len(cleaned) > max_chars:
        # The marker tells the judge that the evidence was cut short.
        return cleaned[:max_chars] + "\n\n…(访谈证据已截断)"
    return cleaned
|
||||
|
||||
|
||||
async def _conversation_transcript_for_manual(
    db: AsyncSession, conversation_id: str
) -> str:
    """Render one conversation's messages as a labelled plain-text transcript.

    Messages are loaded through ``conversation_repo`` and emitted in the
    order the repository returns them. Messages whose content is empty
    after stripping are skipped. A message whose role is ``"human"``
    (case-insensitive) is labelled ``用户`` and kept verbatim; every other
    role is labelled ``AI`` and its text is passed through
    ``_assistant_text_for_eval_display`` first.

    Args:
        db: Session handed to the repository call.
        conversation_id: Identifier of the conversation to render.

    Returns:
        The labelled turns joined by blank lines, or ``""`` when there is
        nothing to show.
    """
    messages = await conversation_repo.get_conversation_messages(conversation_id, db)
    turns: list[str] = []
    for message in messages:
        is_human = (message.role or "").lower() == "human"
        text = (message.content or "").strip()
        if not text:
            continue
        if is_human:
            turns.append(f"用户: {text}")
        else:
            # Assistant output is normalised for display before judging.
            turns.append(f"AI: {_assistant_text_for_eval_display(text)}")
    return "\n\n".join(turns)
|
||||
|
||||
|
||||
async def _user_transcript_evidence(db: AsyncSession, user_id: str) -> str:
    """Build a combined interview transcript for a user, for use as evidence.

    Takes the first ``_MAX_EVIDENCE_CONVERSATIONS`` conversations returned
    by the repository, walks them in reverse order, and renders each one
    with ``_conversation_transcript_for_manual``. Conversations with an
    empty transcript are left out. Each remaining transcript gets a
    ``## 会话 <id>`` heading, and the joined text is passed through
    ``_trim_evidence_text``.

    NOTE(review): the reversal presumably turns a newest-first listing into
    chronological order — confirm against ``get_user_conversations``.

    Args:
        db: Session handed to the repository and transcript helpers.
        user_id: Identifier of the user whose conversations are collected.

    Returns:
        The combined evidence text, or ``""`` if the user has no
        conversations.
    """
    conversations = await conversation_repo.get_user_conversations(user_id, db)
    if not conversations:
        return ""

    selected = conversations[:_MAX_EVIDENCE_CONVERSATIONS]
    sections: list[str] = []
    for conversation in reversed(selected):
        conversation_id = str(conversation.id)
        transcript = await _conversation_transcript_for_manual(db, conversation_id)
        if not transcript:
            continue
        sections.append(f"## 会话 {conversation_id}\n{transcript}")

    combined = "\n\n".join(sections)
    return _trim_evidence_text(combined)
|
||||
|
||||
|
||||
def _normalize_title_key(title: str) -> str:
|
||||
t = (title or "").strip().lower()
|
||||
t = re.sub(r"^#+\s*", "", t)
|
||||
@@ -271,6 +309,7 @@ class EvalJudgeManualService:
|
||||
judge_llm = get_eval_judge_langchain_llm()
|
||||
judge = EvalJudgeService(judge_llm)
|
||||
baselines = list(baseline_sections or [])
|
||||
evidence_transcript = await _user_transcript_evidence(self._db, uid)
|
||||
|
||||
chapter_results: list[dict[str, Any]] = []
|
||||
try:
|
||||
@@ -281,7 +320,7 @@ class EvalJudgeManualService:
|
||||
body = (ch.canonical_markdown or "").strip()
|
||||
if not body:
|
||||
continue
|
||||
bl = _baseline_for_chapter_title(baselines, ch.title or "", i)
|
||||
bl = _baseline_for_chapter_title(baselines, str(ch.title or ""), i)
|
||||
baseline_excerpt = ""
|
||||
if bl and (bl.body or "").strip():
|
||||
baseline_excerpt = _clip_md_for_judge(bl.body, max_chars=6000)
|
||||
@@ -289,7 +328,14 @@ class EvalJudgeManualService:
|
||||
if baseline_excerpt:
|
||||
md += f"## 导出基线(节选)\n\n{baseline_excerpt}\n\n"
|
||||
md += f"## 当前成稿\n\n{_clip_md_for_judge(body)}"
|
||||
cj = await judge.judge_memoir(memoir_markdown=md)
|
||||
cj = await judge.judge_memoir(
|
||||
memoir_markdown=md,
|
||||
source_transcript=evidence_transcript,
|
||||
reference_memoir_markdown=baseline_excerpt,
|
||||
evidence_notes=(
|
||||
"严格按文档打分;真实性、事实覆盖率、可追溯性必须优先对照该用户历史访谈证据。"
|
||||
),
|
||||
)
|
||||
chapter_results.append(
|
||||
{
|
||||
"id": ch.id,
|
||||
@@ -310,7 +356,13 @@ class EvalJudgeManualService:
|
||||
if not body:
|
||||
continue
|
||||
md = f"# 故事:{st.title}\n\n{_clip_md_for_judge(body)}"
|
||||
sj = await judge.judge_memoir(memoir_markdown=md)
|
||||
sj = await judge.judge_memoir(
|
||||
memoir_markdown=md,
|
||||
source_transcript=evidence_transcript,
|
||||
evidence_notes=(
|
||||
"严格按文档打分;真实性、事实覆盖率、可追溯性必须优先对照该用户历史访谈证据。"
|
||||
),
|
||||
)
|
||||
story_results.append(
|
||||
{
|
||||
"id": st.id,
|
||||
|
||||
Reference in New Issue
Block a user