feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整

- 回忆录细项上限收紧为合计 100 分,去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线;无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据(会话/用户聚合、截断)
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD;移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试
This commit is contained in:
Kevin
2026-04-07 10:34:59 +08:00
parent ea97427767
commit 5972b0e721
9 changed files with 616 additions and 235 deletions

View File

@@ -26,6 +26,8 @@ logger = get_logger(__name__)
_MAX_JUDGE_MARKDOWN_CHARS = 20_000
_MAX_EVAL_CHAPTERS = 30
_MAX_EVAL_STORIES = 40
_MAX_EVIDENCE_CONVERSATIONS = 8
_MAX_EVIDENCE_TRANSCRIPT_CHARS = 16_000
def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) -> str:
@@ -56,6 +58,50 @@ def _assistant_text_for_eval_display(raw: str) -> str:
return (raw or "").replace("[SPLIT]", "\n")
def _trim_evidence_text(text: str, max_chars: int = _MAX_EVIDENCE_TRANSCRIPT_CHARS) -> str:
    """Strip *text* and cap it at *max_chars* characters.

    Args:
        text: Raw evidence text; a falsy value is treated as an empty string.
        max_chars: Maximum number of characters kept from the stripped text.

    Returns:
        The stripped text unchanged when it fits within *max_chars*; otherwise
        its first *max_chars* characters followed by a blank line and a
        truncation notice (so the result is longer than *max_chars*).
    """
    cleaned = (text or "").strip()
    if len(cleaned) > max_chars:
        head = cleaned[:max_chars]
        # The notice tells the reader of the evidence that content was cut here.
        return head + "\n\n…(访谈证据已截断)"
    return cleaned
def _dialogue_transcript_from_pairs(pairs: list[tuple[str, str]]) -> str:
    """Render ``(role, content)`` pairs as a labelled plain-text transcript.

    Args:
        pairs: Message tuples in display order. A role equal to ``"human"``
            is labelled as the user; every other role is labelled ``AI``.

    Returns:
        One ``"<label>: <text>"`` entry per non-blank message, separated by
        blank lines. Messages whose content is empty after stripping are
        omitted. Non-human content is passed through
        ``_assistant_text_for_eval_display`` before rendering.
    """
    rendered: list[str] = []
    for speaker, raw in pairs:
        text = (raw or "").strip()
        if text:
            if speaker == "human":
                rendered.append(f"用户: {text}")
            else:
                rendered.append(f"AI: {_assistant_text_for_eval_display(text)}")
    return "\n\n".join(rendered)
async def _conversation_transcript_for_eval(
    db: AsyncSession, conversation_id: str
) -> str:
    """Load one conversation's messages and render them as a transcript.

    Args:
        db: Session handed to the conversation repository.
        conversation_id: Identifier of the conversation to load.

    Returns:
        The transcript produced by ``_dialogue_transcript_from_pairs`` from
        the rows returned by the repository.
    """
    # Imported at call time, as in the original (presumably to avoid an import
    # cycle between feature packages -- confirm before hoisting).
    from app.features.conversation import repo as conversation_repo

    messages = await conversation_repo.get_conversation_messages(conversation_id, db)
    pairs: list[tuple[str, str]] = []
    for message in messages:
        # Role is lower-cased so the "human" comparison downstream is
        # case-insensitive; missing values become empty strings.
        role = str(message.role or "").lower()
        content = str(message.content or "")
        pairs.append((role, content))
    return _dialogue_transcript_from_pairs(pairs)
async def _user_transcript_evidence(db: AsyncSession, user_id: str) -> str:
    """Aggregate a user's conversations into one bounded evidence text.

    Args:
        db: Session handed to the conversation repository.
        user_id: Identifier of the user whose conversations are collected.

    Returns:
        An empty string when the user has no conversations; otherwise the
        non-empty transcripts, each under a ``## 会话 <id>`` heading, joined by
        blank lines and passed through ``_trim_evidence_text``.
    """
    from app.features.conversation import repo as conversation_repo

    conversations = await conversation_repo.get_user_conversations(user_id, db)
    if not conversations:
        return ""

    # Only the first _MAX_EVIDENCE_CONVERSATIONS entries are used, emitted in
    # reverse of the repository order.
    # NOTE(review): presumably the repo returns newest-first, making the
    # output chronological -- confirm against the repository query.
    selected = conversations[:_MAX_EVIDENCE_CONVERSATIONS]
    sections: list[str] = []
    for conversation in reversed(selected):
        conversation_id = str(conversation.id)
        transcript = await _conversation_transcript_for_eval(db, conversation_id)
        if not transcript:
            continue
        sections.append(f"## 会话 {conversation_id}\n{transcript}")
    combined = "\n\n".join(sections)
    return _trim_evidence_text(combined)
async def execute_eval_run(
db: AsyncSession,
*,
@@ -150,7 +196,7 @@ async def execute_eval_run(
rationale = tj.rationale if tj else None
await eval_repo.add_turn(
db,
run_id=run.id,
run_id=str(run.id),
turn_index=idx,
user_utterance=u,
assistant_reply=replies[idx],
@@ -166,11 +212,36 @@ async def execute_eval_run(
conv_total = conv_out.total_score if conv_out else None
memoir_md = simple_memoir_from_transcript(utterances, replies)
mem_out = await judge.judge_memoir(memoir_markdown=memoir_md)
source_transcript = _trim_evidence_text(full_transcript)
reference_memoir = (case.reference_memoir_markdown or "").strip()
mem_out = await judge.judge_memoir(
memoir_markdown=memoir_md,
source_transcript=source_transcript,
reference_memoir_markdown=reference_memoir,
evidence_notes="严格按文档核对真实性、覆盖率、可追溯性;以原始访谈为主,参考基线仅作辅助。",
)
chapter_entries: list[dict[str, Any]] = []
story_entries: list[dict[str, Any]] = []
uid = (case.source_user_id or "").strip()
source_conversation_id = (case.source_conversation_id or "").strip()
evidence_transcript = source_transcript
if source_conversation_id:
try:
conversation_evidence = await _conversation_transcript_for_eval(
db, source_conversation_id
)
if conversation_evidence:
evidence_transcript = _trim_evidence_text(conversation_evidence)
except Exception as e:
logger.warning("eval source conversation evidence skipped: {}", e)
elif uid:
try:
user_evidence = await _user_transcript_evidence(db, uid)
if user_evidence:
evidence_transcript = user_evidence
except Exception as e:
logger.warning("eval user transcript evidence skipped: {}", e)
if uid:
from app.features.memoir.repo import get_chapters_for_memoir_list
from app.features.story.repo import get_stories_for_user
@@ -184,7 +255,14 @@ async def execute_eval_run(
if not body:
continue
md = f"# 章节:{ch.title}\n\n{_clip_md_for_judge(body)}"
cj = await judge.judge_memoir(memoir_markdown=md)
cj = await judge.judge_memoir(
memoir_markdown=md,
source_transcript=evidence_transcript,
reference_memoir_markdown=reference_memoir,
evidence_notes=(
"这是用户现有章节的严格评审;真实性、覆盖率、可追溯性必须对照原始访谈证据。"
),
)
chapter_entries.append(
{
"id": ch.id,
@@ -203,7 +281,14 @@ async def execute_eval_run(
if not body:
continue
md = f"# 故事:{st.title}\n\n{_clip_md_for_judge(body)}"
sj = await judge.judge_memoir(memoir_markdown=md)
sj = await judge.judge_memoir(
memoir_markdown=md,
source_transcript=evidence_transcript,
reference_memoir_markdown=reference_memoir,
evidence_notes=(
"这是用户现有故事的严格评审;真实性、覆盖率、可追溯性必须对照原始访谈证据。"
),
)
story_entries.append(
{
"id": st.id,
@@ -228,8 +313,12 @@ async def execute_eval_run(
mem_parts.append(float(j["total_score"]))
mem_total = sum(mem_parts) / len(mem_parts) if mem_parts else None
exp = await eval_repo.get_experiment(db, run.experiment_id)
weights = exp.composite_weights_json if exp else None
exp = await eval_repo.get_experiment(db, str(run.experiment_id))
weights = (
exp.composite_weights_json
if exp and isinstance(exp.composite_weights_json, dict)
else None
)
comp = _composite(conv_total, mem_total, weights)
bundle: dict[str, Any] = {
@@ -257,13 +346,13 @@ async def _finalize_experiment_gate(db: AsyncSession, experiment_id: str) -> Non
exp = await eval_repo.get_experiment(db, experiment_id)
if not exp:
return
cases = await eval_repo.list_cases(db, exp.regression_set_id)
cases = await eval_repo.list_cases(db, str(exp.regression_set_id))
incomplete = [r for r in runs if r.status not in ("completed", "failed")]
incomplete = [r for r in runs if str(r.status) not in ("completed", "failed")]
if incomplete:
return
failed = [r for r in runs if r.status == "failed"]
failed = [r for r in runs if str(r.status) == "failed"]
if failed:
await eval_repo.update_experiment(
db,
@@ -301,10 +390,10 @@ async def execute_experiment_full(experiment_id: str) -> None:
await eval_repo.update_experiment(db, exp, status="running")
await db.commit()
cases = await eval_repo.list_cases(db, exp.regression_set_id)
base_v = await eval_repo.get_version(db, exp.baseline_version_id)
cand_v = await eval_repo.get_version(db, exp.candidate_version_id)
if not base_v or not cand_v:
cases = await eval_repo.list_cases(db, str(exp.regression_set_id))
base_v = await eval_repo.get_version(db, str(exp.baseline_version_id))
cand_v = await eval_repo.get_version(db, str(exp.candidate_version_id))
if base_v is None or cand_v is None:
await eval_repo.update_experiment(
db,
exp,
@@ -317,12 +406,12 @@ async def execute_experiment_full(experiment_id: str) -> None:
for case in cases:
for side, ver in ("baseline", base_v), ("candidate", cand_v):
run = await eval_repo.get_run(db, experiment_id, case.id, side)
run = await eval_repo.get_run(db, experiment_id, str(case.id), side)
if not run:
run = await eval_repo.create_run(
db,
experiment_id=experiment_id,
case_id=case.id,
case_id=str(case.id),
side=side,
)
await db.commit()

View File

@@ -10,6 +10,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.core.dependencies import get_eval_judge_langchain_llm
from app.core.logging import get_logger
from app.features.conversation import repo as conversation_repo
from app.features.evaluation.errors import (
EvaluationBadRequestError,
EvaluationNotFoundError,
@@ -27,6 +28,8 @@ logger = get_logger(__name__)
_MAX_JUDGE_MARKDOWN_CHARS = 20_000
_MAX_EVAL_CHAPTERS = 30
_MAX_EVAL_STORIES = 40
_MAX_EVIDENCE_CONVERSATIONS = 8
_MAX_EVIDENCE_TRANSCRIPT_CHARS = 16_000
def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) -> str:
@@ -48,6 +51,41 @@ def _transcript_from_export_turns(turns: list[tuple[str, str]]) -> str:
return "\n\n".join(parts)
def _trim_evidence_text(text: str, max_chars: int = _MAX_EVIDENCE_TRANSCRIPT_CHARS) -> str:
    """Return *text* stripped and limited to *max_chars* characters.

    Args:
        text: Evidence text; falsy input is handled as an empty string.
        max_chars: Number of characters retained from the stripped text.

    Returns:
        The stripped text when it is at most *max_chars* long. Longer text is
        cut to *max_chars* characters and a blank line plus a truncation
        notice is appended after the cut.
    """
    stripped = (text or "").strip()
    fits = len(stripped) <= max_chars
    if fits:
        return stripped
    kept = stripped[:max_chars]
    return f"{kept}\n\n…(访谈证据已截断)"
async def _conversation_transcript_for_manual(
    db: AsyncSession, conversation_id: str
) -> str:
    """Load one conversation and render it as a labelled transcript.

    Args:
        db: Session handed to the conversation repository.
        conversation_id: Identifier of the conversation to load.

    Returns:
        One ``"<label>: <text>"`` entry per non-blank message, separated by
        blank lines. Rows whose lower-cased role equals ``"human"`` are
        labelled as the user; all other rows are labelled ``AI`` and their
        text is passed through ``_assistant_text_for_eval_display``.
    """
    messages = await conversation_repo.get_conversation_messages(conversation_id, db)
    rendered: list[str] = []
    for message in messages:
        speaker = (message.role or "").lower()
        text = (message.content or "").strip()
        if text:
            if speaker == "human":
                rendered.append(f"用户: {text}")
            else:
                rendered.append(f"AI: {_assistant_text_for_eval_display(text)}")
    return "\n\n".join(rendered)
async def _user_transcript_evidence(db: AsyncSession, user_id: str) -> str:
    """Build a bounded interview-evidence text from a user's conversations.

    Args:
        db: Session handed to the conversation repository.
        user_id: Identifier of the user whose conversations are collected.

    Returns:
        An empty string when the repository returns no conversations;
        otherwise each non-empty transcript under a ``## 会话 <id>`` heading,
        joined by blank lines and passed through ``_trim_evidence_text``.
    """
    conversations = await conversation_repo.get_user_conversations(user_id, db)
    if not conversations:
        return ""

    # At most _MAX_EVIDENCE_CONVERSATIONS entries from the head of the result
    # are rendered, in reverse of the repository order.
    # NOTE(review): presumably newest-first input, giving chronological
    # output -- confirm against the repository query.
    head = conversations[:_MAX_EVIDENCE_CONVERSATIONS]
    blocks: list[str] = []
    for item in reversed(head):
        item_id = str(item.id)
        rendered = await _conversation_transcript_for_manual(db, item_id)
        if not rendered:
            continue
        blocks.append(f"## 会话 {item_id}\n{rendered}")
    return _trim_evidence_text("\n\n".join(blocks))
def _normalize_title_key(title: str) -> str:
t = (title or "").strip().lower()
t = re.sub(r"^#+\s*", "", t)
@@ -271,6 +309,7 @@ class EvalJudgeManualService:
judge_llm = get_eval_judge_langchain_llm()
judge = EvalJudgeService(judge_llm)
baselines = list(baseline_sections or [])
evidence_transcript = await _user_transcript_evidence(self._db, uid)
chapter_results: list[dict[str, Any]] = []
try:
@@ -281,7 +320,7 @@ class EvalJudgeManualService:
body = (ch.canonical_markdown or "").strip()
if not body:
continue
bl = _baseline_for_chapter_title(baselines, ch.title or "", i)
bl = _baseline_for_chapter_title(baselines, str(ch.title or ""), i)
baseline_excerpt = ""
if bl and (bl.body or "").strip():
baseline_excerpt = _clip_md_for_judge(bl.body, max_chars=6000)
@@ -289,7 +328,14 @@ class EvalJudgeManualService:
if baseline_excerpt:
md += f"## 导出基线(节选)\n\n{baseline_excerpt}\n\n"
md += f"## 当前成稿\n\n{_clip_md_for_judge(body)}"
cj = await judge.judge_memoir(memoir_markdown=md)
cj = await judge.judge_memoir(
memoir_markdown=md,
source_transcript=evidence_transcript,
reference_memoir_markdown=baseline_excerpt,
evidence_notes=(
"严格按文档打分;真实性、事实覆盖率、可追溯性必须优先对照该用户历史访谈证据。"
),
)
chapter_results.append(
{
"id": ch.id,
@@ -310,7 +356,13 @@ class EvalJudgeManualService:
if not body:
continue
md = f"# 故事:{st.title}\n\n{_clip_md_for_judge(body)}"
sj = await judge.judge_memoir(memoir_markdown=md)
sj = await judge.judge_memoir(
memoir_markdown=md,
source_transcript=evidence_transcript,
evidence_notes=(
"严格按文档打分;真实性、事实覆盖率、可追溯性必须优先对照该用户历史访谈证据。"
),
)
story_results.append(
{
"id": st.id,

View File

@@ -1,43 +1,199 @@
"""评审 LLM 结构化输出json_object"""
"""评审 LLM 结构化输出json_object
成稿(回忆录)子项上限已自洽为 **总分 100**(由原 110 分表等比例收紧整数档,见附件 rubric
"""
from __future__ import annotations
from pydantic import BaseModel, Field
from typing import Self
from pydantic import BaseModel, Field, model_validator
class TurnJudgeOutput(BaseModel):
"""单轮对话质量(情绪 + 流畅度/重复抑制 + 总分)。"""
"""单轮 / 整段对话质量(情绪强化版 100 分15 个细项)。"""
# 一、情绪价值与陪伴感30
emotion_carry: float = Field(ge=0, le=10, description="情绪承接能力")
empathy_depth: float = Field(ge=0, le=8, description="共情深度")
emotion_safety: float = Field(ge=0, le=6, description="情绪安全感")
emotion_guidance: float = Field(ge=0, le=6, description="情绪引导能力")
# 二、信息获取能力25
fact_mining: float = Field(ge=0, le=8, description="关键事实挖掘")
info_completeness_guide: float = Field(ge=0, le=8, description="信息完整性引导")
info_depth_mining: float = Field(ge=0, le=9, description="信息深度挖掘")
# 三、人物建模能力15
persona_understanding: float = Field(ge=0, le=7, description="人物理解")
persona_consistency_verify: float = Field(ge=0, le=4, description="人物一致性验证")
persona_expression_guide: float = Field(ge=0, le=4, description="人物表达引导")
# 四、结构化引导15
interview_structure: float = Field(ge=0, le=6, description="访谈结构")
context_memory: float = Field(ge=0, le=5, description="上下文记忆")
rhythm_control: float = Field(ge=0, le=4, description="节奏控制")
# 五、提问质量15
question_quality: float = Field(ge=0, le=7, description="问题质量")
follow_up_depth: float = Field(ge=0, le=5, description="追问能力")
non_leading: float = Field(ge=0, le=3, description="非引导性")
total_score: float = Field(ge=0, le=100)
rationale: str = ""
# 与历史 JSON 对齐的一级聚合分(由细项派生,可缺省由模型填写)
emotion_score: float = Field(default=0, ge=0, le=30)
information_score: float = Field(default=0, ge=0, le=20)
structure_score: float = Field(default=0, ge=0, le=10)
question_score: float = Field(default=0, ge=0, le=10)
persona_score: float = Field(default=0, ge=0, le=10)
repetition_score: float = Field(default=0, ge=0, le=10)
naturalness_score: float = Field(default=0, ge=0, le=10)
rationale: str = ""
information_score: float = Field(default=0, ge=0, le=25)
persona_score: float = Field(default=0, ge=0, le=15)
structure_score: float = Field(default=0, ge=0, le=15)
question_score: float = Field(default=0, ge=0, le=15)
@model_validator(mode="after")
def _sync_aggregates_and_total(self) -> Self:
    """Check ``total_score`` against the leaf items and recompute aggregates.

    The five first-level scores are derived from the fifteen leaf fields;
    whatever values were supplied for the aggregate fields are overwritten.

    Returns:
        The same model instance, with the aggregate fields replaced.

    Raises:
        ValueError: When ``total_score`` differs from the sum of the leaf
            items by more than 0.51.
    """
    emotion_sum = (
        self.emotion_carry + self.empathy_depth + self.emotion_safety + self.emotion_guidance
    )
    information_sum = self.fact_mining + self.info_completeness_guide + self.info_depth_mining
    persona_sum = (
        self.persona_understanding
        + self.persona_consistency_verify
        + self.persona_expression_guide
    )
    structure_sum = self.interview_structure + self.context_memory + self.rhythm_control
    question_sum = self.question_quality + self.follow_up_depth + self.non_leading

    leaf_total = emotion_sum + information_sum + persona_sum + structure_sum + question_sum
    if abs(leaf_total - self.total_score) > 0.51:
        raise ValueError(
            f"total_score ({self.total_score}) 与细项合计 ({leaf_total:.2f}) 不一致"
        )

    # Written through object.__setattr__ rather than plain attribute
    # assignment (presumably to bypass pydantic's assignment handling --
    # confirm against the model config).
    recomputed = (
        ("emotion_score", emotion_sum),
        ("information_score", information_sum),
        ("persona_score", persona_sum),
        ("structure_score", structure_sum),
        ("question_score", question_sum),
    )
    for field_name, value in recomputed:
        object.__setattr__(self, field_name, value)
    return self
class ConversationJudgeOutput(BaseModel):
"""整条对话 transcript 的综合分。"""
total_score: float = Field(ge=0, le=100)
dimension_scores: dict[str, float] = Field(default_factory=dict)
rationale: str = ""
# 整条 transcript 与单轮使用同一套细项
ConversationJudgeOutput = TurnJudgeOutput
class MemoirJudgeOutput(BaseModel):
"""成稿回忆录评分。"""
"""成稿回忆录评分(总分 100子项上限见 rubric"""
# 一、真实性与覆盖(小计最高 23由原 25 收紧)
mem_fidelity: float = Field(ge=0, le=9, description="记忆忠实度")
mem_factual_accuracy: float = Field(ge=0, le=5, description="事实准确性")
mem_factual_coverage: float = Field(ge=0, le=5, description="事实覆盖率")
mem_traceability: float = Field(ge=0, le=4, description="记忆可追溯性")
# 二、信息质量(小计最高 14由原 15 收紧)
info_slot_coverage: float = Field(ge=0, le=6, description="槽位覆盖度")
info_sufficiency: float = Field(ge=0, le=4, description="信息充分性")
info_density: float = Field(ge=0, le=4, description="信息密度")
# 三、叙事结构(小计最高 14由原 15 收紧)
narr_structure: float = Field(ge=0, le=6, description="故事结构")
narr_paragraphs: float = Field(ge=0, le=5, description="段落组织")
narr_pacing: float = Field(ge=0, le=3, description="节奏控制")
# 四、语言与文笔(小计最高 18由原 20 及六项上限一并收紧)
lang_fluency: float = Field(ge=0, le=3, description="语言流畅度")
lang_conciseness: float = Field(ge=0, le=3, description="表达精炼度")
lang_literary: float = Field(ge=0, le=4, description="文笔质量")
lang_controlled_expansion: float = Field(ge=0, le=4, description="控制性扩写能力")
lang_detail: float = Field(ge=0, le=2, description="细节还原与强化")
lang_style: float = Field(ge=0, le=2, description="风格一致性")
# 五、情感表达(小计最高 9由原 10 收紧)
emo_authenticity: float = Field(ge=0, le=5, description="情感真实度")
emo_depth: float = Field(ge=0, le=4, description="情感深度")
# 六、人物建模(小计最高 9由原 10 收紧)
char_understanding: float = Field(ge=0, le=4, description="人物理解")
char_consistency: float = Field(ge=0, le=3, description="人物一致性")
char_integration: float = Field(ge=0, le=2, description="人物融入度")
# 七、连贯性(小计最高 4由原 5 收紧)
coh_timeline: float = Field(ge=0, le=2, description="时间线一致性")
coh_cross_chapter: float = Field(ge=0, le=2, description="跨章节关联")
# 八、表达丰富度(小计最高 5
rich_analogy: float = Field(ge=0, le=3, description="类比与引用")
rich_diversity: float = Field(ge=0, le=2, description="表达多样性")
# 九、出版就绪度(小计最高 4由原 5 收紧)
pub_editorial_cost: float = Field(ge=0, le=2, description="编辑成本")
pub_completeness: float = Field(ge=0, le=2, description="完整度")
total_score: float = Field(ge=0, le=100)
authenticity_score: float = Field(default=0, ge=0, le=25)
information_score: float = Field(default=0, ge=0, le=15)
narrative_score: float = Field(default=0, ge=0, le=15)
language_score: float = Field(default=0, ge=0, le=20)
emotion_score: float = Field(default=0, ge=0, le=10)
character_score: float = Field(default=0, ge=0, le=10)
coherence_score: float = Field(default=0, ge=0, le=5)
richness_score: float = Field(default=0, ge=0, le=5)
publish_ready_score: float = Field(default=0, ge=0, le=5)
rationale: str = ""
authenticity_score: float = Field(default=0, ge=0, le=23)
information_score: float = Field(default=0, ge=0, le=14)
narrative_score: float = Field(default=0, ge=0, le=14)
language_score: float = Field(default=0, ge=0, le=18)
emotion_score: float = Field(default=0, ge=0, le=9)
character_score: float = Field(default=0, ge=0, le=9)
coherence_score: float = Field(default=0, ge=0, le=4)
richness_score: float = Field(default=0, ge=0, le=5)
publish_ready_score: float = Field(default=0, ge=0, le=4)
@model_validator(mode="after")
def _sync_aggregates_and_total(self) -> Self:
    """Check ``total_score`` against the leaf items and recompute aggregates.

    The nine first-level scores are derived from the leaf fields; whatever
    values were supplied for the aggregate fields are overwritten.

    Returns:
        The same model instance, with the aggregate fields replaced.

    Raises:
        ValueError: When ``total_score`` differs from the sum of the leaf
            items by more than 0.51.
    """
    authenticity_sum = (
        self.mem_fidelity
        + self.mem_factual_accuracy
        + self.mem_factual_coverage
        + self.mem_traceability
    )
    information_sum = self.info_slot_coverage + self.info_sufficiency + self.info_density
    narrative_sum = self.narr_structure + self.narr_paragraphs + self.narr_pacing
    language_sum = (
        self.lang_fluency
        + self.lang_conciseness
        + self.lang_literary
        + self.lang_controlled_expansion
        + self.lang_detail
        + self.lang_style
    )
    emotion_sum = self.emo_authenticity + self.emo_depth
    character_sum = self.char_understanding + self.char_consistency + self.char_integration
    coherence_sum = self.coh_timeline + self.coh_cross_chapter
    richness_sum = self.rich_analogy + self.rich_diversity
    publish_sum = self.pub_editorial_cost + self.pub_completeness

    leaf_total = (
        authenticity_sum
        + information_sum
        + narrative_sum
        + language_sum
        + emotion_sum
        + character_sum
        + coherence_sum
        + richness_sum
        + publish_sum
    )
    if abs(leaf_total - self.total_score) > 0.51:
        raise ValueError(
            f"total_score ({self.total_score}) 与分项合计 ({leaf_total:.2f}) 不一致"
        )

    # Written through object.__setattr__ rather than plain attribute
    # assignment (presumably to bypass pydantic's assignment handling --
    # confirm against the model config).
    recomputed = (
        ("authenticity_score", authenticity_sum),
        ("information_score", information_sum),
        ("narrative_score", narrative_sum),
        ("language_score", language_sum),
        ("emotion_score", emotion_sum),
        ("character_score", character_sum),
        ("coherence_score", coherence_sum),
        ("richness_score", richness_sum),
        ("publish_ready_score", publish_sum),
    )
    for field_name, value in recomputed:
        object.__setattr__(self, field_name, value)
    return self

View File

@@ -26,6 +26,37 @@ _CONV_MAX = 8192
_CONV_JUDGE_JSON_MAX = 2048
_MEMOIR_MAX = 12000
_COMPARE_STREAM_MAX = 6144
_MEMOIR_EVIDENCE_MAX = 12000
def _build_memoir_judge_prompt(
    *,
    memoir_markdown: str,
    source_transcript: str = "",
    reference_memoir_markdown: str = "",
    evidence_notes: str = "",
) -> str:
    """Assemble an evidence-aware memoir judging prompt.

    The prompt is the rubric instructions followed by optional reviewer
    notes, the interview evidence (or an explicit "no evidence" warning),
    an optional reference memoir, and finally the memoir under review.

    Args:
        memoir_markdown: Memoir text to be judged; capped at ``_MEMOIR_MAX``
            characters. A falsy value is treated as an empty string.
        source_transcript: Interview evidence; capped at
            ``_MEMOIR_EVIDENCE_MAX`` characters. When blank, a warning
            telling the judge to score authenticity items conservatively is
            emitted instead.
        reference_memoir_markdown: Optional baseline memoir; capped at
            ``_MEMOIR_EVIDENCE_MAX`` characters and omitted when blank.
        evidence_notes: Optional reviewer instructions; capped at 1200
            characters and omitted when blank.

    Returns:
        The prompt sections joined by newlines.
    """

    def _clip(text: str, limit: int) -> str:
        # Mark every cut made here. Without the marker the judge cannot tell
        # that evidence or memoir text is incomplete; slicing would also drop
        # any truncation notice a caller appended beyond the limit.
        if len(text) <= limit:
            return text
        return f"{text[:limit]}\n\n…(内容已截断)"

    source = (source_transcript or "").strip()
    reference = (reference_memoir_markdown or "").strip()
    notes = (evidence_notes or "").strip()
    # Guard like the other inputs so a missing memoir body yields an empty
    # section rather than a TypeError on slicing.
    memoir = memoir_markdown or ""

    sections = [MEMOIR_JUDGE_INSTRUCTIONS, ""]
    if notes:
        sections.extend(["【评审说明】", notes[:1200], ""])
    if source:
        sections.extend(["【原始访谈/证据】", _clip(source, _MEMOIR_EVIDENCE_MAX), ""])
    else:
        sections.extend(
            [
                "【原始访谈/证据】",
                "无可用原始访谈证据。对于记忆忠实度、事实准确性、事实覆盖率、记忆可追溯性,必须保守打分,不得凭空高分。",
                "",
            ]
        )
    if reference:
        sections.extend(
            ["【参考基线/导出成稿】", _clip(reference, _MEMOIR_EVIDENCE_MAX), ""]
        )
    sections.extend(["【当前回忆录正文】", _clip(memoir, _MEMOIR_MAX)])
    return "\n".join(sections)
class EvalJudgeService:
@@ -124,7 +155,7 @@ class EvalJudgeService:
{r_json}
请依次撰写:
1) 两段对话在整体体验上的主要差异(共情、追问、重复感、自然度等);
1) 两段对话在整体体验上的主要差异(情绪承接、信息挖掘、人物建模、访谈结构、提问质量、上下文与重复盘问等);
2) B 相对 A 的优点与不足;
3) 若 B 在关键维度明显弱于 A给出可操作的改进方向系统提示、访谈策略、模型或温度等
@@ -154,14 +185,22 @@ class EvalJudgeService:
logger.warning("conversation compare stream failed: {}", e)
yield f"\n\n[流式输出中断:{e}]"
async def judge_memoir(self, *, memoir_markdown: str) -> MemoirJudgeOutput | None:
async def judge_memoir(
self,
*,
memoir_markdown: str,
source_transcript: str = "",
reference_memoir_markdown: str = "",
evidence_notes: str = "",
) -> MemoirJudgeOutput | None:
if not self._llm:
return None
prompt = f"""{MEMOIR_JUDGE_INSTRUCTIONS}
【回忆录正文】
{memoir_markdown[:_MEMOIR_MAX]}
"""
prompt = _build_memoir_judge_prompt(
memoir_markdown=memoir_markdown,
source_transcript=source_transcript,
reference_memoir_markdown=reference_memoir_markdown,
evidence_notes=evidence_notes,
)
try:
return await allm_json_call(
self._llm,

View File

@@ -1,31 +1,60 @@
"""对话评审 rubric 文本v1"""
"""对话评审 rubric 文本v1 · 访谈「情绪强化版」100 分)。"""
TURN_JUDGE_INSTRUCTIONS = """你是「岁月留书」访谈对话质量评审。根据下面维度给本轮 AI 回复打分0-100 为 total_score各子分上限已注明子分之和应与 total_score 大体一致)。
_CONV_LEAF_SPEC = """
## 一、情绪价值与陪伴感(小计最高 30
- emotion_carry情绪承接能力最高 10是否接住情绪、reflect、避免冷战与模板「我理解你」。
- empathy_depth共情深度最高 8情绪类型是否准、语境贴合、非空洞安慰。
- emotion_safety情绪安全感最高 6非评判、尊重、敏感话题语气、可跳过。
- emotion_guidance情绪引导能力最高 6引导感受、关键节点追问情绪、表达更具体。
维度(参考):
- 情绪承接与共情emotion_score,最高 30
- 信息获取与追问information_score,最高 20
- 结构化访谈推进structure_score最高 10
- 提问质量question_score最高 10
- 人物理解与一致性persona_score最高 10
- 重复抑制repetition_score最高 10是否重复了上 12 轮已问过的问题或同一资料槽;高度重复则低分
- 自然流畅naturalness_score最高 10是否像朋友聊天有无不必要采访腔、总结腔、流程感
## 二、信息获取能力(小计最高 25
- fact_mining关键事实挖掘,最高 8
- info_completeness_guide信息完整性引导,最高 8
- info_depth_mining信息深度挖掘最高 9为何、动机与影响。
输出 JSON**json** 字段名如下:
total_score, emotion_score, information_score, structure_score, question_score, persona_score, repetition_score, naturalness_score, rationale
## 三、人物建模能力(小计最高 15
- persona_understanding人物理解最高 7
- persona_consistency_verify人物一致性验证最高 4矛盾澄清。
- persona_expression_guide人物表达引导最高 4「你是谁」层面。
只输出 JSON。"""
## 四、结构化引导(小计最高 15
- interview_structure访谈结构最高 6阶段与逻辑。
- context_memory上下文记忆最高 5关联前文**重复盘问、同一槽位反复**在此项扣分。
- rhythm_control节奏控制最高 4自然**采访腔、总结腔、流程感**在此项与情绪项综合体现。
## 五、提问质量(小计最高 15
- question_quality问题质量最高 7开放、具体。
- follow_up_depth追问能力最高 5
- non_leading非引导性最高 3
输出 JSON 字段(仅这些键;务必含 rationale
emotion_carry, empathy_depth, emotion_safety, emotion_guidance,
fact_mining, info_completeness_guide, info_depth_mining,
persona_understanding, persona_consistency_verify, persona_expression_guide,
interview_structure, context_memory, rhythm_control,
question_quality, follow_up_depth, non_leading,
total_score, rationale
`total_score` 必须等于上述 15 个细项之和(满分 100
聚合分 emotion_score、information_score、persona_score、structure_score、question_score 可不填(服务端会重算)。
只输出 JSON。
"""
CONV_JUDGE_INSTRUCTIONS = """你是访谈整段对话评审。给定完整 transcript用户与 AI 多轮),打一个综合 total_score0-100
TURN_JUDGE_INSTRUCTIONS = (
"你是「岁月留书」访谈对话质量评审,按下列 **情绪强化版** rubric 为本轮 AI 回复打分。\n"
+ _CONV_LEAF_SPEC
)
dimension_scores 建议至少包含emotion, information, structure, repetition, naturalness各 0-100 相对分量即可),用于反映整段是否重复盘问、是否自然;另可有 rationale。
只输出 JSONtotal_score, dimension_scores, rationale。"""
CONV_JUDGE_INSTRUCTIONS = (
"你是访谈整段对话评审。给定完整 transcript用户与 AI 多轮),按与单轮**相同**的 15 项细项与满分上限,"
"对**整段对话表现**打一次分;`total_score` 为细项之和100\n" + _CONV_LEAF_SPEC
)
COMPARE_CONV_STREAM_HINT = """你是访谈对话评测专家。下面给出一份「回放/新测」完整对话 transcript 及其整体评分JSON。请用中文直接写正文不要用 JSON
1) 对这段对话的整体评价与风险点;
1) 对这段对话的整体评价与风险点(对照情绪承接、信息挖掘、人物、结构、提问质量等)
2) 可操作的改进建议(提示词、流程、模型参数等)。
笔调简洁、可执行。"""

View File

@@ -1,11 +1,73 @@
"""回忆录成稿评审 rubric 文本v1)。"""
"""回忆录成稿评审 rubric 文本v1 · 子项上限合计 100 分制)。
MEMOIR_JUDGE_INSTRUCTIONS = """你是「岁月留书」回忆录成稿评审。根据真实性与覆盖、信息质量、叙事结构、语言文笔、情感、人物、连贯性、表达丰富度、出版就绪等,给出分项分(上限与 total_score 满分 100 一致)。
说明:原产品表九个大类上限之和为 110本 rubric 将各细项上限整档收紧,使九类小计之和为 100
便于与 `total_score` 直接一致,无需再折算。
"""
输出 JSON 字段:
total_score,
authenticity_score, information_score, narrative_score, language_score,
emotion_score, character_score, coherence_score, richness_score, publish_ready_score,
rationale
_MEMOIR_RUBRIC_BODY = """
你必须按下列一级维度与子项及其**满分上限**打分;**全部细项得分之和须等于 `total_score`,且满分合计为 100**。
## 一、真实性与覆盖(小计最高 23
1. mem_fidelity记忆忠实度最高 9hallucination、夸大/弱化/改写、因果关系、未证实推测、AI 补全编造。
2. mem_factual_accuracy事实准确性最高 5时间、人物关系、顺序、内部矛盾、数值细节。
3. mem_factual_coverage事实覆盖率最高 5关键/高情感事件、关键人物与细节是否遗漏。
4. mem_traceability记忆可追溯性最高 4与原始对话映射、来源模糊、证据与语义保持。
## 二、信息质量(小计最高 14
5. info_slot_coverage槽位覆盖度最高 6
6. info_sufficiency信息充分性最高 4
7. info_density信息密度最高 4
## 三、叙事结构(小计最高 14
8. narr_structure故事结构最高 6
9. narr_paragraphs段落组织最高 5
10. narr_pacing节奏控制最高 3
## 四、语言与文笔(小计最高 18
11. lang_fluency语言流畅度最高 3
12. lang_conciseness表达精炼度最高 3
13. lang_literary文笔质量最高 4
14. lang_controlled_expansion控制性扩写最高 4
15. lang_detail细节还原与强化最高 2
16. lang_style风格一致性最高 2
## 五、情感表达(小计最高 9
17. emo_authenticity情感真实度最高 5
18. emo_depth情感深度最高 4
## 六、人物建模(小计最高 9
19. char_understanding人物理解最高 4
20. char_consistency人物一致性最高 3
21. char_integration人物融入度最高 2
## 七、连贯性(小计最高 4
22. coh_timeline时间线一致性最高 2
23. coh_cross_chapter跨章节关联最高 2
## 八、表达丰富度(小计最高 5
24. rich_analogy类比与引用最高 3
25. rich_diversity表达多样性最高 2
## 九、出版就绪度(小计最高 4
26. pub_editorial_cost编辑成本最高 2
27. pub_completeness完整度最高 2
输出 JSON 字段(仅这些键;分值浮点;务必含 rationale 中文简述):
mem_fidelity, mem_factual_accuracy, mem_factual_coverage, mem_traceability,
info_slot_coverage, info_sufficiency, info_density,
narr_structure, narr_paragraphs, narr_pacing,
lang_fluency, lang_conciseness, lang_literary, lang_controlled_expansion, lang_detail, lang_style,
emo_authenticity, emo_depth,
char_understanding, char_consistency, char_integration,
coh_timeline, coh_cross_chapter,
rich_analogy, rich_diversity,
pub_editorial_cost, pub_completeness,
total_score, rationale
一级聚合分 authenticity_score、information_score、narrative_score、language_score、emotion_score、character_score、coherence_score、richness_score、publish_ready_score 可不填(服务端会按细项重算)。
只输出 JSON。"""
MEMOIR_JUDGE_INSTRUCTIONS = (
"你是「岁月留书」回忆录成稿评审,必须严格按照下列 rubric 打分。\n"
+ _MEMOIR_RUBRIC_BODY
)