1. 建立问题库大纲,对应每个人生阶段槽位
2. 鼓励使用更生活化的交流语言共情与总结
3. 降低评审模型可能发生截断的概率
4. 成稿质量维度强化情感表达和上下文连贯性
This commit is contained in:
yangshilin
2026-04-09 15:32:35 +08:00
parent 064ad2161d
commit e1341c6d18
49 changed files with 938 additions and 271 deletions

View File

@@ -141,9 +141,9 @@ def aggregate_lineage_from_segments(
"""
if not segments:
return None
conv0 = conversation_id_fallback or getattr(
segments[0], "conversation_id", None
) or ""
conv0 = (
conversation_id_fallback or getattr(segments[0], "conversation_id", None) or ""
)
if not conv0:
lj0 = getattr(segments[0], "lineage_json", None)
if isinstance(lj0, dict) and lj0.get("conversation_id"):

View File

@@ -62,7 +62,9 @@ class Segment(Base):
tts_audio_urls = Column(JSON, nullable=True)
# 用户轮次 durable message id与 lineage_json 同步;便于查询)
user_message_id = Column(
String, ForeignKey("conversation_messages.id", ondelete="SET NULL"), nullable=True
String,
ForeignKey("conversation_messages.id", ondelete="SET NULL"),
nullable=True,
)
# DialogueLineage JSONschema 见 conversation.lineage_schemas
lineage_json = Column(JSON, nullable=True)

View File

@@ -523,9 +523,7 @@ async def process_audio_segment(
if _is_transcribe_failure(transcript_text):
detail = (transcript_text or "").strip()
if not detail:
user_msg = (
f"分段 {segment_index} 未识别到语音内容,请重试或检查麦克风与网络"
)
user_msg = f"分段 {segment_index} 未识别到语音内容,请重试或检查麦克风与网络"
else:
user_msg = f"分段 {segment_index} 语音识别失败,请稍后再试"
await manager.send_message(
@@ -698,9 +696,7 @@ async def process_user_message(
audio_duration_seconds=audio_dur,
tts_audio_urls=None,
segment_id=segment.id,
memory_retrieval_trace=getattr(
turn, "memory_retrieval_trace", None
),
memory_retrieval_trace=getattr(turn, "memory_retrieval_trace", None),
)
if not turn_ids:
logger.warning(

View File

@@ -5,6 +5,7 @@ from __future__ import annotations
from typing import Any
from app.features.evaluation.judge_schemas import ConversationJudgeOutput
from app.features.evaluation.judge_service import trim_compare_transcript_pair
_GROUP_KEYS: tuple[tuple[str, str], ...] = (
("emotion_score", "情绪与陪伴"),
@@ -44,6 +45,32 @@ def _has_repeat_issue(judge: ConversationJudgeOutput | None) -> bool:
)
def _evidence_quality(truncation: dict[str, Any]) -> dict[str, Any]:
    """Summarise how much of the full dialogue the scores and A/B text cover.

    Args:
        truncation: Mapping holding the four truncation flags
            ``baseline_truncated_for_conversation``,
            ``replay_truncated_for_conversation``,
            ``baseline_truncated_for_compare`` and
            ``replay_truncated_for_compare``.

    Returns:
        A dict with ``scope`` (``"full"`` or ``"partial"``), one boolean per
        coverage aspect, and a human-readable note under ``note_zh``.
    """
    baseline_covers_all = not truncation["baseline_truncated_for_conversation"]
    replay_covers_all = not truncation["replay_truncated_for_conversation"]
    # The comparison counts as complete only when neither side was trimmed.
    compare_covers_all = (
        not truncation["baseline_truncated_for_compare"]
        and not truncation["replay_truncated_for_compare"]
    )

    everything_covered = (
        baseline_covers_all and replay_covers_all and compare_covers_all
    )
    scope = "full" if everything_covered else "partial"
    if everything_covered:
        note = "评分与 A/B 对比均基于当前注入的全文(在模型上下文内未再裁对话正文)。"
    else:
        note = (
            "存在整段或对比环节截断:分数与流式结论仅反映已提交片段;"
            "评审侧已注入截断边界说明,长程细项应保守。发布决策请结合逐轮分、人工抽查或更高上下文预算。"
        )

    report: dict[str, Any] = {"scope": scope}
    report["baseline_holistic_covers_full_text"] = baseline_covers_all
    report["replay_holistic_covers_full_text"] = replay_covers_all
    report["ab_compare_covers_full_transcripts"] = compare_covers_all
    report["note_zh"] = note
    return report
def build_conversation_compare_summary(
*,
baseline_judge: ConversationJudgeOutput | None,
@@ -51,29 +78,42 @@ def build_conversation_compare_summary(
baseline_transcript: str,
replay_transcript: str,
conv_cap: int,
compare_cap_each: int,
compare_cap_total: int,
compare_per_side_cap: int | None = None,
fixture_filename: str | None = None,
) -> dict[str, Any]:
_, _, baseline_cmp_trunc, replay_cmp_trunc = trim_compare_transcript_pair(
baseline_transcript,
replay_transcript,
total_max_chars=int(compare_cap_total),
per_side_max_chars=compare_per_side_cap,
)
if compare_per_side_cap and compare_per_side_cap > 0:
each_hint = int(compare_per_side_cap)
else:
each_hint = max(1, int(compare_cap_total) // 2)
truncation = {
"baseline_chars": len((baseline_transcript or "").strip()),
"replay_chars": len((replay_transcript or "").strip()),
"conversation_cap_chars": int(conv_cap),
"compare_cap_each_chars": int(compare_cap_each),
"compare_cap_total_chars": int(compare_cap_total),
"compare_cap_each_chars": each_hint,
"baseline_truncated_for_conversation": len((baseline_transcript or "").strip())
> int(conv_cap),
"replay_truncated_for_conversation": len((replay_transcript or "").strip())
> int(conv_cap),
"baseline_truncated_for_compare": len((baseline_transcript or "").strip())
> int(compare_cap_each),
"replay_truncated_for_compare": len((replay_transcript or "").strip())
> int(compare_cap_each),
"baseline_truncated_for_compare": baseline_cmp_trunc,
"replay_truncated_for_compare": replay_cmp_trunc,
}
evidence_quality = _evidence_quality(truncation)
if not replay_judge:
return {
"fixture_filename": fixture_filename,
"mode": "single",
"truncation": truncation,
"evidence_quality": evidence_quality,
"gate": {
"status": "insufficient_data",
"reasons": ["缺少回放整体评分,无法判断是否追平或超过 A。"],
@@ -86,9 +126,12 @@ def build_conversation_compare_summary(
"mode": "single",
"replay_total": _round(replay_judge.total_score),
"truncation": truncation,
"evidence_quality": evidence_quality,
"gate": {
"status": "single_side_only",
"reasons": ["当前只有新对话单侧评分,可用于优化,但不能判定是否超过 A。"],
"reasons": [
"当前只有新对话单侧评分,可用于优化,但不能判定是否超过 A。"
],
},
}
@@ -150,8 +193,20 @@ def build_conversation_compare_summary(
reasons.append(f"关键回落维度:{''.join(key_regressions[:4])}")
if key_gains:
reasons.append(f"关键提升维度:{''.join(key_gains[:4])}")
if truncation["baseline_truncated_for_compare"] or truncation["replay_truncated_for_compare"]:
reasons.append("A/B 对比稿使用了截断 transcript长对话结论需结合逐轮评分复核。")
if (
truncation["baseline_truncated_for_compare"]
or truncation["replay_truncated_for_compare"]
):
reasons.append(
"A/B 对比稿使用了截断 transcript长对话结论需结合逐轮评分复核。"
)
if (
truncation["baseline_truncated_for_conversation"]
or truncation["replay_truncated_for_conversation"]
):
reasons.append(
"整段评分可能仅见 transcript 前缀;长程维度已在评审边界下保守处理,请结合逐轮分或全文重跑交叉验证。"
)
return {
"fixture_filename": fixture_filename,
@@ -165,6 +220,7 @@ def build_conversation_compare_summary(
"key_gains": key_gains,
"repeat_issue_detected": has_repeat_regression,
"truncation": truncation,
"evidence_quality": evidence_quality,
"gate": {
"status": status,
"parity_passed": parity_passed,
@@ -173,4 +229,3 @@ def build_conversation_compare_summary(
"golden_set_note": "建议在固定黄金样本集上复跑该口径,再决定是否发布。",
},
}

View File

@@ -59,9 +59,8 @@ def build_segment_transcript(
user_txt = (seg.user_input_text or "").strip()
ai_txt = (ai_by_segment.get(uid) or seg.agent_response or "").strip()
id_extra = _segment_message_id_header(seg)
head = (
f"### Segment {i} · id={uid} · conversation={seg.conversation_id}"
+ (f" · {id_extra}" if id_extra else "")
head = f"### Segment {i} · id={uid} · conversation={seg.conversation_id}" + (
f" · {id_extra}" if id_extra else ""
)
body_u = f"用户: {user_txt}" if user_txt else "用户: (空)"
body_a = f"AI: {ai_txt}" if ai_txt else "AI: (无日志/无 agent_response"

View File

@@ -296,9 +296,9 @@ async def load_summaries_by_ids(
return list(result.scalars().all())
def story_link_ids_by_type(links: list[StoryEvidenceLink]) -> tuple[
list[str], list[str], list[str], list[str]
]:
def story_link_ids_by_type(
links: list[StoryEvidenceLink],
) -> tuple[list[str], list[str], list[str], list[str]]:
chunks: list[str] = []
facts: list[str] = []
timelines: list[str] = []

View File

@@ -121,7 +121,9 @@ class EvalTraceService:
return "partial"
return "fallback"
async def build_chapter_bundle(self, user_id: str, chapter: Chapter) -> ChapterEvidenceBundle:
async def build_chapter_bundle(
self, user_id: str, chapter: Chapter
) -> ChapterEvidenceBundle:
notes: list[str] = []
live_segment_ids = normalize_source_segment_ids(
getattr(chapter, "source_segments", None)
@@ -130,7 +132,15 @@ class EvalTraceService:
row = getattr(chapter, "current_evidence_snapshot", None)
row_has_closure = bool(
(row and (row.segment_ids or []))
or (row and (row.memory_chunk_ids or row.memory_fact_ids or row.timeline_event_ids or row.summary_ids))
or (
row
and (
row.memory_chunk_ids
or row.memory_fact_ids
or row.timeline_event_ids
or row.summary_ids
)
)
)
if (
row is not None
@@ -139,19 +149,13 @@ class EvalTraceService:
and int(row.schema_version or 0) == EVIDENCE_SNAPSHOT_SCHEMA_VERSION
and row_has_closure
):
segment_ids = [
str(x) for x in (row.segment_ids or []) if str(x).strip()
]
segment_ids = [str(x) for x in (row.segment_ids or []) if str(x).strip()]
conv_ids = sorted(
{str(x) for x in (row.conversation_ids or []) if str(x).strip()}
)
chunk_ids = [
str(x) for x in (row.memory_chunk_ids or []) if str(x).strip()
]
chunk_ids = [str(x) for x in (row.memory_chunk_ids or []) if str(x).strip()]
fact_ids = [str(x) for x in (row.memory_fact_ids or []) if str(x).strip()]
tl_ids = [
str(x) for x in (row.timeline_event_ids or []) if str(x).strip()
]
tl_ids = [str(x) for x in (row.timeline_event_ids or []) if str(x).strip()]
sum_ids = [str(x) for x in (row.summary_ids or []) if str(x).strip()]
notes.extend([str(x) for x in (row.notes or []) if x])
notes.append("evidence_from_chapter_evidence_snapshot_table")
@@ -163,7 +167,9 @@ class EvalTraceService:
sum_ids=sum_ids,
)
if live_segment_ids and set(live_segment_ids) != set(segment_ids):
notes.append("live_source_segments_differ_from_snapshot_reconcile_in_pipeline")
notes.append(
"live_source_segments_differ_from_snapshot_reconcile_in_pipeline"
)
dlg = getattr(row, "message_lineage_json", None)
return ChapterEvidenceBundle(
user_id=user_id,
@@ -202,14 +208,24 @@ class EvalTraceService:
)
if use_snap and isinstance(snap, dict):
segment_ids = [str(x) for x in (snap.get("segment_ids") or []) if str(x).strip()]
segment_ids = [
str(x) for x in (snap.get("segment_ids") or []) if str(x).strip()
]
conv_ids = sorted(
{str(x) for x in (snap.get("conversation_ids") or []) if str(x).strip()}
)
chunk_ids = [str(x) for x in (snap.get("memory_chunk_ids") or []) if str(x).strip()]
fact_ids = [str(x) for x in (snap.get("memory_fact_ids") or []) if str(x).strip()]
tl_ids = [str(x) for x in (snap.get("timeline_event_ids") or []) if str(x).strip()]
sum_ids = [str(x) for x in (snap.get("summary_ids") or []) if str(x).strip()]
chunk_ids = [
str(x) for x in (snap.get("memory_chunk_ids") or []) if str(x).strip()
]
fact_ids = [
str(x) for x in (snap.get("memory_fact_ids") or []) if str(x).strip()
]
tl_ids = [
str(x) for x in (snap.get("timeline_event_ids") or []) if str(x).strip()
]
sum_ids = [
str(x) for x in (snap.get("summary_ids") or []) if str(x).strip()
]
notes.extend([str(x) for x in (snap.get("notes") or []) if x])
notes.append("evidence_from_chapter_evidence_bundle_json_column")
tier = self._chapter_closure_tier(
@@ -220,8 +236,12 @@ class EvalTraceService:
sum_ids=sum_ids,
)
if live_segment_ids and set(live_segment_ids) != set(segment_ids):
notes.append("live_source_segments_differ_from_snapshot_reconcile_in_pipeline")
snap_dlg = snap.get("message_lineage_json") if isinstance(snap, dict) else None
notes.append(
"live_source_segments_differ_from_snapshot_reconcile_in_pipeline"
)
snap_dlg = (
snap.get("message_lineage_json") if isinstance(snap, dict) else None
)
return ChapterEvidenceBundle(
user_id=user_id,
chapter_id=str(chapter.id),
@@ -256,8 +276,15 @@ class EvalTraceService:
resolved_seg_ids = [s.id for s in segments] or segment_ids
if len(segments) < len(segment_ids):
notes.append("some_segments_missing_or_foreign_user")
conv_ids = sorted({str(s.conversation_id) for s in segments if s.conversation_id})
chunk_ids, fact_ids, tl_ids, sum_ids = await fetch_memory_closure_for_conversations(
conv_ids = sorted(
{str(s.conversation_id) for s in segments if s.conversation_id}
)
(
chunk_ids,
fact_ids,
tl_ids,
sum_ids,
) = await fetch_memory_closure_for_conversations(
self._db, user_id=user_id, conversation_ids=conv_ids
)
tier = self._chapter_closure_tier(
@@ -339,8 +366,12 @@ class EvalTraceService:
)
return formatted, bundle
async def build_story_bundle(self, user_id: str, story_id: str) -> StoryEvidenceBundle:
st = await get_story_for_eval_trace(self._db, user_id=user_id, story_id=story_id)
async def build_story_bundle(
self, user_id: str, story_id: str
) -> StoryEvidenceBundle:
st = await get_story_for_eval_trace(
self._db, user_id=user_id, story_id=story_id
)
if not st:
return StoryEvidenceBundle(
user_id=user_id,
@@ -378,7 +409,9 @@ class EvalTraceService:
segments = await fetch_segments_for_user(
self._db, user_id=user_id, segment_ids=dedup_seg
)
conv_ids = sorted({str(s.conversation_id) for s in segments if s.conversation_id})
conv_ids = sorted(
{str(s.conversation_id) for s in segments if s.conversation_id}
)
if dedup_seg and not segments:
notes.append("chapter_segment_ids_unresolved")
if conv_ids:
@@ -428,11 +461,16 @@ class EvalTraceService:
segments = await fetch_segments_for_user(
self._db, user_id=user_id, segment_ids=dedup_seg
)
conv_ids = sorted({str(s.conversation_id) for s in segments if s.conversation_id})
chunk_ids, fact_ids, tl_ids, sum_ids = (
await fetch_memory_closure_for_conversations(
self._db, user_id=user_id, conversation_ids=conv_ids
)
conv_ids = sorted(
{str(s.conversation_id) for s in segments if s.conversation_id}
)
(
chunk_ids,
fact_ids,
tl_ids,
sum_ids,
) = await fetch_memory_closure_for_conversations(
self._db, user_id=user_id, conversation_ids=conv_ids
)
notes.append("fallback_lineage_no_story_evidence_links")
notes.append("augmented_with_chapter_context")

View File

@@ -27,7 +27,7 @@ from app.features.evaluation.eval_trace_service import EvalTraceService
from app.features.evaluation.judge_schemas import ConversationJudgeOutput
from app.features.evaluation.judge_service import (
EvalJudgeService,
eval_judge_compare_transcript_each_max_chars_for_context,
eval_judge_compare_bundle_caps,
eval_judge_conversation_transcript_max_chars_for_context,
)
from app.features.evaluation.schemas import MemoirSectionBaselineOut
@@ -234,6 +234,7 @@ class EvalJudgeManualService:
f"replay_glm5_failed: {replay_result.error or 'unknown error'}"
)
_cmp_total, _cmp_per_side = eval_judge_compare_bundle_caps(judge._ctx_tokens)
bundle: dict[str, Any] = {
"version": 1,
"judged_at": datetime.now(timezone.utc).isoformat(),
@@ -250,9 +251,8 @@ class EvalJudgeManualService:
conv_cap=eval_judge_conversation_transcript_max_chars_for_context(
judge._ctx_tokens
),
compare_cap_each=eval_judge_compare_transcript_each_max_chars_for_context(
judge._ctx_tokens
),
compare_cap_total=_cmp_total,
compare_per_side_cap=_cmp_per_side,
fixture_filename=fn,
),
"compare_markdown": "",
@@ -363,6 +363,7 @@ class EvalJudgeManualService:
acc["options"]["judge_model"] = resolved_model
acc["fixture_filename"] = fn
_sse_cmp_total, _sse_cmp_per = eval_judge_compare_bundle_caps(judge._ctx_tokens)
persist = True
try:
yield {
@@ -435,9 +436,7 @@ class EvalJudgeManualService:
full_transcript=replay_transcript
)
replay_judge = replay_result.output
acc["replay_judge"] = (
replay_judge.model_dump() if replay_judge else None
)
acc["replay_judge"] = replay_judge.model_dump() if replay_judge else None
acc["compare_summary"] = build_conversation_compare_summary(
baseline_judge=baseline_judge,
replay_judge=replay_judge,
@@ -446,9 +445,8 @@ class EvalJudgeManualService:
conv_cap=eval_judge_conversation_transcript_max_chars_for_context(
judge._ctx_tokens
),
compare_cap_each=eval_judge_compare_transcript_each_max_chars_for_context(
judge._ctx_tokens
),
compare_cap_total=_sse_cmp_total,
compare_per_side_cap=_sse_cmp_per,
fixture_filename=fn,
)
yield {
@@ -532,7 +530,9 @@ class EvalJudgeManualService:
fn = (fixture_filename or "").strip() or None
if not fn:
raise EvaluationBadRequestError("请选择基线 MDfixture_filename后再重试基准分")
raise EvaluationBadRequestError(
"请选择基线 MDfixture_filename后再重试基准分"
)
try:
turns, _ = read_user_export_fixture(fn)
@@ -568,6 +568,7 @@ class EvalJudgeManualService:
judge, resolved_model = _make_eval_judge(judge_provider, judge_model)
if not judge:
raise EvaluationBadRequestError(_JUDGE_CONFIG_HINT)
_rt_cmp_total, _rt_cmp_per = eval_judge_compare_bundle_caps(judge._ctx_tokens)
baseline_result = await judge.judge_conversation_result(
full_transcript=baseline_transcript
)
@@ -590,9 +591,8 @@ class EvalJudgeManualService:
conv_cap=eval_judge_conversation_transcript_max_chars_for_context(
judge._ctx_tokens
),
compare_cap_each=eval_judge_compare_transcript_each_max_chars_for_context(
judge._ctx_tokens
),
compare_cap_total=_rt_cmp_total,
compare_per_side_cap=_rt_cmp_per,
fixture_filename=fn,
),
"compare_markdown": "",
@@ -619,10 +619,7 @@ class EvalJudgeManualService:
sse_event="baseline_turn_judge",
):
idx = row.get("turn_index")
if (
isinstance(idx, (int, float))
and row.get("judge") is not None
):
if isinstance(idx, (int, float)) and row.get("judge") is not None:
acc["baseline_turn_judges"][str(int(idx))] = row["judge"]
acc["compare_markdown"] = ""
@@ -634,9 +631,8 @@ class EvalJudgeManualService:
conv_cap=eval_judge_conversation_transcript_max_chars_for_context(
judge._ctx_tokens
),
compare_cap_each=eval_judge_compare_transcript_each_max_chars_for_context(
judge._ctx_tokens
),
compare_cap_total=_rt_cmp_total,
compare_per_side_cap=_rt_cmp_per,
fixture_filename=fn,
)
async for piece in judge.stream_conversation_compare(
@@ -682,7 +678,10 @@ class EvalJudgeManualService:
trace_svc = EvalTraceService(self._db)
def _chapter_evidence_notes(
lineage_tier: str, evidence_summary: str, truncated: bool, dropped: list[str]
lineage_tier: str,
evidence_summary: str,
truncated: bool,
dropped: list[str],
) -> str:
drops = ",".join(dropped[:12]) if dropped else ""
return (

View File

@@ -118,7 +118,9 @@ class TurnJudgeOutput(BaseModel):
@model_validator(mode="after")
def _cap_meta_fields_and_sync_totals(self) -> Self:
def _cap_str_list(xs: list[str], *, max_items: int, max_chars: int) -> list[str]:
def _cap_str_list(
xs: list[str], *, max_items: int, max_chars: int
) -> list[str]:
out: list[str] = []
for x in xs[:max_items]:
s = str(x).strip()
@@ -257,7 +259,9 @@ class MemoirJudgeOutput(BaseModel):
@model_validator(mode="after")
def _cap_meta_fields_and_sync_totals(self) -> Self:
def _cap_str_list(xs: list[str], *, max_items: int, max_chars: int) -> list[str]:
def _cap_str_list(
xs: list[str], *, max_items: int, max_chars: int
) -> list[str]:
out: list[str] = []
for x in xs[:max_items]:
s = str(x).strip()

View File

@@ -90,26 +90,127 @@ def eval_judge_turn_prior_transcript_max_chars_for_context(
def eval_judge_compare_transcript_each_max_chars() -> int:
"""A/B 两段 transcript 同 prompt 时,每条 transcript 的上限(默认 GLM 上下文)。"""
if settings.eval_judge_max_compare_transcript_chars_each > 0:
return settings.eval_judge_max_compare_transcript_chars_each
pool = (
_eval_judge_prompt_char_pool()
- settings.eval_judge_compare_prompt_overhead_chars
"""单侧对称参考上限(默认与 settings.eval_judge_context_window_tokens 一致)。"""
return eval_judge_compare_transcript_each_max_chars_for_context(
settings.eval_judge_context_window_tokens
)
return max(1, pool // 2)
def eval_judge_compare_transcript_pair_total_budget_for_context(
    context_window_tokens: int,
) -> int:
    """Return the combined character budget for both A/B transcripts in one prompt.

    When ``settings.eval_judge_max_compare_transcript_chars_each`` is positive
    the budget is twice that per-side value. Otherwise it is the prompt
    character pool for ``context_window_tokens`` minus
    ``settings.eval_judge_compare_prompt_overhead_chars``. The result is never
    below 1.
    """
    per_side_override = settings.eval_judge_max_compare_transcript_chars_each
    if per_side_override > 0:
        budget = 2 * int(per_side_override)
    else:
        char_pool = _eval_judge_prompt_char_pool_for_context(context_window_tokens)
        budget = char_pool - int(settings.eval_judge_compare_prompt_overhead_chars)
    return max(1, budget)
def eval_judge_compare_transcript_each_max_chars_for_context(
context_window_tokens: int,
) -> int:
"""单侧对称上限的参考值auto 模式下约为合计预算的一半;供兼容与展示)。"""
if settings.eval_judge_max_compare_transcript_chars_each > 0:
return settings.eval_judge_max_compare_transcript_chars_each
pool = (
_eval_judge_prompt_char_pool_for_context(context_window_tokens)
- settings.eval_judge_compare_prompt_overhead_chars
return int(settings.eval_judge_max_compare_transcript_chars_each)
total = eval_judge_compare_transcript_pair_total_budget_for_context(
context_window_tokens
)
return max(1, pool // 2)
return max(1, total // 2)
def eval_judge_compare_bundle_caps(
    context_window_tokens: int,
) -> tuple[int, int | None]:
    """Return ``(compare_cap_total, per_side_cap_or_None)`` for A/B comparison.

    A positive ``settings.eval_judge_max_compare_transcript_chars_each`` yields
    a hard per-side cap and a total of twice that cap. Otherwise the per-side
    cap is ``None`` and the total comes from
    ``eval_judge_compare_transcript_pair_total_budget_for_context``.
    """
    configured_per_side = int(
        settings.eval_judge_max_compare_transcript_chars_each or 0
    )
    if configured_per_side <= 0:
        # No explicit per-side limit: only the combined budget applies.
        shared_total = eval_judge_compare_transcript_pair_total_budget_for_context(
            context_window_tokens
        )
        return shared_total, None
    return max(1, 2 * configured_per_side), configured_per_side
def trim_compare_transcript_pair(
baseline: str,
replay: str,
*,
total_max_chars: int,
per_side_max_chars: int | None = None,
) -> tuple[str, str, bool, bool]:
"""A/B 对比 prompt 用:在合计预算内尽量保留全文;仅超长时优先从较长的一侧裁尾部。
若配置了 eval_judge_max_compare_transcript_chars_each则仍按单侧硬顶与旧行为一致
"""
b = (baseline or "").strip()
r = (replay or "").strip()
if per_side_max_chars is not None and int(per_side_max_chars) > 0:
cap = int(per_side_max_chars)
return b[:cap], r[:cap], len(b) > cap, len(r) > cap
cap_total = max(1, int(total_max_chars))
if len(b) + len(r) <= cap_total:
return b, r, False, False
need_drop = len(b) + len(r) - cap_total
b2, r2 = b, r
while need_drop > 0 and (b2 or r2):
if len(b2) >= len(r2):
if b2:
b2 = b2[:-1]
need_drop -= 1
elif r2:
r2 = r2[:-1]
need_drop -= 1
else:
break
else:
if r2:
r2 = r2[:-1]
need_drop -= 1
elif b2:
b2 = b2[:-1]
need_drop -= 1
else:
break
return b2, r2, len(b) > len(b2), len(r) > len(r2)
# Notice appended after a whole-conversation transcript that was cut to its
# first ``n`` characters; ``{n}`` is filled in with ``str.format``. It tells the
# judge model to score long-range items conservatively and to record the
# missing evidence instead of assuming unseen turns were fine.
# NOTE(review): the full-width "(" opened before "尤其" is never closed in the
# text as shown here — confirm against the original file before relying on it.
_CONV_JUDGE_TRANSCRIPT_TRUNCATION_TAIL = (
"\n\n【评审边界——输入已为截断稿】\n"
"以上仅为全文前 {n} 个字符,其后未提供给模型。"
"对依赖长程多轮轨迹的细项(尤其 context_memory、interview_structure、跨轮重复盘问"
"必须保守给分(倾向区间中低),并在 insufficient_evidence 写明「输入为截断稿,长程证据不足」;"
"不得臆断未展示轮次中的行为confidence 须显著降低;禁止因未见问题而默认高分或推断后半段无缺陷。\n"
)
# Notice appended after a truncated "conversation up to the previous turn"
# excerpt in per-turn judging; ``{n}`` is the number of characters kept.
_TURN_PRIOR_TRUNCATION_TAIL = (
"\n\n【评审边界——上文节选已截断】\n"
"「截至上一轮」节选可能仅为更长对话的前 {n} 字;跨轮重复、长程结构若无法从节选核实,"
"须在 insufficient_evidence 说明,并对相关细项保守给分。\n"
)
# Notice inserted into the streamed A/B comparison prompt when at least one of
# the two transcripts was truncated. It has no format placeholders.
_COMPARE_STREAM_PAIR_TRUNCATION_NOTE = (
"\n【评审边界】以下 A/B transcript 至少一侧为截断稿,请仅就**已展示片段**比较;"
"不得断言未展示轮次的优劣;涉及跨轮重复盘问等须明确证据范围或说不足以判断。\n"
)
def conversation_judge_transcript_excerpt(full_transcript: str, cap: int) -> str:
    """Return the transcript for whole-conversation judging, capped at ``cap`` chars.

    The input is stripped first. If it fits within ``cap`` (negative caps count
    as 0) it is returned unchanged. Otherwise the first ``cap`` characters are
    kept and the truncation-boundary notice is appended, so the judge model
    knows it saw only a prefix.
    """
    text = (full_transcript or "").strip()
    limit = max(0, int(cap))
    if len(text) > limit:
        boundary_note = _CONV_JUDGE_TRANSCRIPT_TRUNCATION_TAIL.format(n=limit)
        return text[:limit] + boundary_note
    return text
def turn_judge_prior_excerpt(prior_transcript: str, cap: int) -> str:
    """Return the "up to the previous turn" excerpt for per-turn judging.

    The input is stripped first. Text within ``cap`` characters (negative caps
    count as 0) is returned unchanged. Longer text is cut to its first ``cap``
    characters and followed by the prior-excerpt truncation notice.
    """
    text = (prior_transcript or "").strip()
    limit = max(0, int(cap))
    if len(text) > limit:
        boundary_note = _TURN_PRIOR_TRUNCATION_TAIL.format(n=limit)
        return text[:limit] + boundary_note
    return text
@dataclass(slots=True)
@@ -163,9 +264,7 @@ def _build_memoir_judge_prompt(
]
)
if struct:
sections.extend(
["【结构化记忆证据】", struct[:_MEMOIR_EVIDENCE_MAX], ""]
)
sections.extend(["【结构化记忆证据】", struct[:_MEMOIR_EVIDENCE_MAX], ""])
else:
sections.extend(
[
@@ -198,14 +297,7 @@ class EvalJudgeService:
)
def _turn_prior_cap(self) -> int:
return eval_judge_turn_prior_transcript_max_chars_for_context(
self._ctx_tokens
)
def _compare_each_cap(self) -> int:
return eval_judge_compare_transcript_each_max_chars_for_context(
self._ctx_tokens
)
return eval_judge_turn_prior_transcript_max_chars_for_context(self._ctx_tokens)
async def judge_turn(
self,
@@ -223,7 +315,7 @@ class EvalJudgeService:
【本轮位置】完整对话中当前轮次为 Turn {t + 1}(与下方节选及全量 transcript 的 `[Turn ...]` 编号一致。evidence_refs.turn_index 请使用该编号。
【截至上一轮的对话节选】(含 `[Turn k]` 标签)
{prior_transcript[: self._turn_prior_cap()]}
{turn_judge_prior_excerpt(prior_transcript, self._turn_prior_cap())}
【本轮用户】
{user_utterance[:4000]}
@@ -254,7 +346,7 @@ class EvalJudgeService:
prompt = f"""{CONV_JUDGE_INSTRUCTIONS}
【完整对话】(每轮以 `[Turn k]` 开头)
{full_transcript[: self._conv_transcript_cap()]}
{conversation_judge_transcript_excerpt(full_transcript, self._conv_transcript_cap())}
"""
try:
out = await allm_json_call(
@@ -288,10 +380,15 @@ class EvalJudgeService:
if not self._llm:
yield "[错误] 未配置评审模型 API Key智谱eval_judge_api_key / zhipu_api_keyDeepSeekdeepseek_api_key"
return
cap_each = self._compare_each_cap()
cap_total, per_side = eval_judge_compare_bundle_caps(self._ctx_tokens)
cap_single = self._conv_transcript_cap()
b_tr = (baseline_transcript or "").strip()[:cap_each]
r_tr = (replay_transcript or "").strip()[:cap_each]
b_tr, r_tr, b_cmp_trunc, r_cmp_trunc = trim_compare_transcript_pair(
baseline_transcript or "",
replay_transcript or "",
total_max_chars=cap_total,
per_side_max_chars=per_side,
)
compare_pair_truncated = b_cmp_trunc or r_cmp_trunc
b_json = (
baseline_judge.model_dump_json(ensure_ascii=False)
if baseline_judge
@@ -301,14 +398,17 @@ class EvalJudgeService:
replay_judge.model_dump_json(ensure_ascii=False) if replay_judge else "null"
)
if baseline_judge and replay_judge:
prompt = f"""你是访谈对话评测专家。下面给出两份完整对话 transcript 及各自的整体打分JSON。请用中文直接写正文不要用 JSON、不要用 Markdown 代码块):
trunc_line = (
_COMPARE_STREAM_PAIR_TRUNCATION_NOTE if compare_pair_truncated else ""
)
prompt = f"""你是访谈对话评测专家。下面给出两份对话 transcript 及各自的整体打分JSON。请用中文直接写正文不要用 JSON、不要用 Markdown 代码块):
【A导出基准对话】历史快照用户与当时导出的线上 AI多轮合并为一篇
{b_tr}
【B本次回放/新测对话】用户句与基准对齐AI 为当前后端重新生成)
{r_tr}
{trunc_line}
【A 的整体评分 JSON】
{b_json}
@@ -322,7 +422,9 @@ class EvalJudgeService:
笔调简洁、偏执行清单。"""
elif replay_judge:
r_one = (replay_transcript or "").strip()[:cap_single]
r_one = conversation_judge_transcript_excerpt(
replay_transcript or "", cap_single
)
prompt = f"""{COMPARE_CONV_STREAM_HINT}
【回放/新测 transcript】

View File

@@ -45,7 +45,8 @@ class MemoirReadinessService:
missing = [i for i in ids if i not in found_ids]
if missing:
raise EvaluationBadRequestError(
"segment not in conversation: " + ", ".join(missing[:5])
"segment not in conversation: "
+ ", ".join(missing[:5])
+ ("" if len(missing) > 5 else "")
)

View File

@@ -165,11 +165,7 @@ class ReplayConversationService:
)
count += 1
if (
flush_memoir_after
and conv.user_id
and (not skip_memoir)
):
if flush_memoir_after and conv.user_id and (not skip_memoir):
await background_runner.flush_pending(conv.user_id)
logger.info(

View File

@@ -173,9 +173,7 @@ async def get_playground_conversation_judge(
async def memoir_phase1_ready(
conversation_id: str,
_auth: InternalEvalAuth,
svc: Annotated[
MemoirReadinessService, Depends(get_memoir_readiness_service)
],
svc: Annotated[MemoirReadinessService, Depends(get_memoir_readiness_service)],
segment_ids: Annotated[
list[str],
Query(
@@ -202,9 +200,7 @@ async def memoir_phase1_ready(
async def memoir_submit_phase1(
conversation_id: str,
_auth: InternalEvalAuth,
svc: Annotated[
MemoirReadinessService, Depends(get_memoir_readiness_service)
],
svc: Annotated[MemoirReadinessService, Depends(get_memoir_readiness_service)],
):
try:
return await svc.submit_memoir_phase1_for_conversation(

View File

@@ -21,8 +21,9 @@ _TURN_SCOPE = """
_CONV_SCOPE = """
## 整段对话评审范围
- 在**完整 transcript**上,对 AI **多轮轨迹**做一次 holistic 评分(仍为同一 15 细项)。
- **聚合规则**:以「整段中**典型表现** + **最严重且反复出现的缺陷**」综合定档;若某维度在多轮中明显滑落,该维不得按最好一轮给满分。
- 在输入所给的 transcript(含 `[Turn k]`上,对 AI **多轮轨迹**做一次 holistic 评分(仍为同一 15 细项)。
- 若正文后出现系统注入的「【评审边界——输入已为截断稿】」说明,则**只据此片段**评分:长程细项须保守,`confidence` 降低,并在 `insufficient_evidence` 声明证据范围;**禁止**臆断未展示轮次、**禁止**因未见缺陷而默认高分。
- **聚合规则**:以**已展示轮次中**典型表现 + 最严重且反复的缺陷综合定档;若某维度在多轮中明显滑落,该维不得按最好一轮给满分。
- 维度边界:`context_memory` 负责**重复盘问、前后矛盾追问、忽略已答信息**`emotion_carry` 负责**情绪是否被接住**(不与采访腔混扣);`rhythm_control` 负责**采访腔、总结腔、机械流程感**(本轮已承接情绪但仍像审讯,在此项体现)。
"""

View File

@@ -31,7 +31,9 @@ def format_export_turns_with_labels(turns: list[tuple[str, str]]) -> str:
return "\n\n".join(parts)
def pair_session_messages_to_turns(messages: list[_MessageLike] | list[Any]) -> list[tuple[str, str]]:
def pair_session_messages_to_turns(
messages: list[_MessageLike] | list[Any],
) -> list[tuple[str, str]]:
"""将对话消息序列为 (user, assistant) 轮次列表,语义与 `format_session_messages_with_turn_labels` 一致。
末尾仅有 human、无紧随 assistant 时,补一轮 (user, "") 供 UI 与评审对齐。
@@ -56,7 +58,9 @@ def pair_session_messages_to_turns(messages: list[_MessageLike] | list[Any]) ->
return out
def format_session_messages_with_turn_labels(messages: list[_MessageLike] | list[Any]) -> str:
def format_session_messages_with_turn_labels(
messages: list[_MessageLike] | list[Any],
) -> str:
"""会话消息序列:按出现顺序将相邻 human→assistant 合并为一轮。"""
blocks: list[str] = []
turn_idx = 0

View File

@@ -195,7 +195,9 @@ def refresh_chapter_evidence_snapshot_sync(session: Session, chapter_id: str) ->
chapter_id=str(ch.id),
user_id=str(ch.user_id),
version_no=next_v,
schema_version=int(payload.get("schema_version") or EVIDENCE_SNAPSHOT_SCHEMA_VERSION),
schema_version=int(
payload.get("schema_version") or EVIDENCE_SNAPSHOT_SCHEMA_VERSION
),
segment_ids=list(payload.get("segment_ids") or []),
conversation_ids=list(payload.get("conversation_ids") or []),
story_ids=list(payload.get("story_ids") or []),
@@ -209,7 +211,9 @@ def refresh_chapter_evidence_snapshot_sync(session: Session, chapter_id: str) ->
)
session.add(snap)
session.flush()
_replace_chapter_evidence_links_sync(session, chapter_id=str(ch.id), payload=payload)
_replace_chapter_evidence_links_sync(
session, chapter_id=str(ch.id), payload=payload
)
ch.current_evidence_snapshot_id = snap.id
ch.evidence_bundle_json = payload
if payload.get("message_lineage_json") is not None:

View File

@@ -98,7 +98,9 @@ def _dialogue_lineage_dict_for_segment_ids(
)
def _evidence_link_ids(evidence: dict) -> tuple[list[str], list[str], list[str], list[str]]:
def _evidence_link_ids(
evidence: dict,
) -> tuple[list[str], list[str], list[str], list[str]]:
"""从 retrieve_evidence_sync 结果提取稳定 ID 列表。"""
chunks: list[str] = []
for c in evidence.get("relevant_chunks") or []:
@@ -661,9 +663,7 @@ def _resolve_append_target(
and len(oral_norm)
<= int(settings.memoir_story_route_append_guardrail_oral_chars)
):
tid_g = default_append_target_story_id(
candidate_stories, story_meta, settings
)
tid_g = default_append_target_story_id(candidate_stories, story_meta, settings)
if tid_g:
st = session.get(Story, tid_g)
if st and st.user_id == user_id:
@@ -880,17 +880,19 @@ def _run_batch_plan_writes(
unit_text = _ordered_text_for_segment_ids(category_segments, unit.segment_ids)
oral_unit = normalize_oral_for_memoir(unit_text, llm=llm)
target_story_id, existing_for_narrative, decision_source = _resolve_append_target(
session,
route_decision=unit.decision,
route_target_story_id=unit.target_story_id,
user_id=user_id,
chapter_category=chapter_category,
oral_norm=(oral_unit or "").strip(),
candidate_stories=candidate_stories,
story_meta=story_meta,
decision_source="batch_plan",
memoir_correlation_id=memoir_correlation_id,
target_story_id, existing_for_narrative, decision_source = (
_resolve_append_target(
session,
route_decision=unit.decision,
route_target_story_id=unit.target_story_id,
user_id=user_id,
chapter_category=chapter_category,
oral_norm=(oral_unit or "").strip(),
candidate_stories=candidate_stories,
story_meta=story_meta,
decision_source="batch_plan",
memoir_correlation_id=memoir_correlation_id,
)
)
sid, _ = _execute_narrative_unit(
@@ -1104,17 +1106,19 @@ def run_story_pipeline_for_category_batch(
)
decision_source = "fallback_no_llm" if not llm else "single_decide"
target_story_id, existing_for_narrative, decision_source = _resolve_append_target(
session,
route_decision=route.decision,
route_target_story_id=route.target_story_id,
user_id=user_id,
chapter_category=chapter_category,
oral_norm=om_norm,
candidate_stories=candidates,
story_meta=story_meta,
decision_source=decision_source,
memoir_correlation_id=memoir_correlation_id,
target_story_id, existing_for_narrative, decision_source = (
_resolve_append_target(
session,
route_decision=route.decision,
route_target_story_id=route.target_story_id,
user_id=user_id,
chapter_category=chapter_category,
oral_norm=om_norm,
candidate_stories=candidates,
story_meta=story_meta,
decision_source=decision_source,
memoir_correlation_id=memoir_correlation_id,
)
)
sid, _ = _execute_narrative_unit(

View File

@@ -1,7 +1,6 @@
"""Transcript chunker — split raw text into retrieval-ready chunks."""
def chunk_transcript(
text: str, *, max_chars: int = 800, overlap_chars: int = 100
) -> list[str]:

View File

@@ -21,7 +21,9 @@ from app.features.memory.repo import (
set_memory_fact_status,
update_chunk_embedding,
)
from app.features.conversation.lineage_schemas import primary_user_message_id_from_lineage
from app.features.conversation.lineage_schemas import (
primary_user_message_id_from_lineage,
)
from app.features.memory.schemas import EvidenceBundle
from app.ports.embedding import EmbeddingProvider
@@ -55,9 +57,7 @@ class MemoryService:
raise ValueError("transcript cannot be empty")
primary_mid = (
primary_user_message_id_from_lineage(lineage_json)
if lineage_json
else None
primary_user_message_id_from_lineage(lineage_json) if lineage_json else None
)
source = await create_source(
self._db,

View File

@@ -1,6 +1,5 @@
"""Tasks feature 依赖:提供 get_tasks_service。"""
from app.features.tasks.service import TasksService