Chat 访谈 - 新增 persona 系统(default / warm_listener / curious_guide)与 background_voice 语气层 - 回复长度由 compute_reply_plan 统一决策(brief / standard / expanded),融合信息密度启发式 - 输入净稿(input_normalize):编排层可选 rules/llm 归一用户口语后再喂模型与记忆检索 - 记忆证据注入:按用户话检索 memory evidence 并注入 prompt Memoir 回忆录 - 口述归一(oral_normalize):segment 原文保留,story 管线取派生净稿作叙事输入 - segment 入队批次门闸:累计字数 + 最长等待秒数,减少零碎提交 - fidelity_check / prompts / narrative_agent 微调 - Alembic 0005:清理跨章节 story 外键 Infra - Dockerfile 加入 ffmpeg - pyproject.toml 新增依赖并同步 uv.lock - .env.example / .env.production 补全新配置项 Tests - 新增 test_background_voice、test_chat_input_normalize、test_experience_regressions - 扩展 test_interview_prompts、test_interview_reply_length、test_story_route_oral_invariant Made-with: Cursor
184 lines
6.5 KiB
Python
184 lines
6.5 KiB
Python
"""
|
||
ClassificationAgent:将内容分类到 8 个章节类别之一。
|
||
|
||
原「LLM 返回 none / 零散档案启发式」不再跳过 Story:统一映射为 ``summary`` 章节,
|
||
仍走叙事流水线落库;与 StoryRoute 仍兼容(批次内 new/append 规划不变)。
|
||
Memory ingest 由 Celery 任务在批次级先行完成,与分类结果独立。
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import json
|
||
import re
|
||
from dataclasses import dataclass
|
||
from typing import Any
|
||
|
||
from app.agents.memoir.prompts import (
|
||
CHAPTER_CATEGORIES,
|
||
get_chapter_classification_json_prompt,
|
||
)
|
||
from app.core.langchain_llm import invoke_json_object
|
||
from app.core.logging import get_logger
|
||
from app.features.memoir.memoir_images.json_payload import extract_json_payload
|
||
|
||
logger = get_logger(__name__)
|
||
|
||
# 模型判定 none 或启发式命中零散档案时,仍写入回忆录正文所用的兜底章节
|
||
_SUMMARY_FALLBACK_CATEGORY = "summary"
|
||
|
||
# 与「仅档案句式」组合使用;过短但明显为叙事句的仍交 LLM 判断
|
||
_FRAGMENT_SHORT_MAX_LEN = 48
|
||
|
||
# 整段仅为出生年份/年份声明(零散档案,不成故事)
|
||
_BIRTH_YEAR_LINE = re.compile(
|
||
r"^[\s\u200b]*(?:我)?\d{4}\s*年\s*(出生|生的|生)?\s*[。.!!]?[\s\u200b]*$",
|
||
re.UNICODE,
|
||
)
|
||
|
||
# 极短且为「我是某地人」式籍贯标签,无过程描写
|
||
_SHORT_HUKOU_STYLE = re.compile(
|
||
r"^[\s\u200b]*(?:我)?是[\u4e00-\u9fff]{1,10}(人|籍)\s*[。.!!]?[\s\u200b]*$",
|
||
re.UNICODE,
|
||
)
|
||
|
||
# 5-stage 关键词(用于 LLM 失败时的兜底);注意勿含易与「仅年份句」共现的泛词,以免误推类别
|
||
STAGE_KEYWORDS = {
|
||
"childhood": ["童年", "小时候", "家乡", "小镇"],
|
||
"education": ["上学", "学校", "老师", "同学", "教育", "大学"],
|
||
"career": ["工作", "职业", "事业", "公司", "同事", "创业"],
|
||
"family": ["伴侣", "孩子", "家庭", "家人", "结婚", "父母"],
|
||
"belief": ["信念", "价值观", "座右铭", "坚持", "原则"],
|
||
}
|
||
|
||
# 5-stage → 默认 8-category 映射(LLM 分类失败时的兜底)
|
||
_STAGE_TO_DEFAULT_CATEGORY = {
|
||
"childhood": "childhood",
|
||
"education": "education",
|
||
"career": "career_early",
|
||
"family": "family",
|
||
"belief": "beliefs",
|
||
}
|
||
|
||
|
||
def _detect_stage(text: str, fallback_stage: str) -> str:
    """Pick the 5-stage label whose keywords occur in ``text``.

    Stages are tried in the declaration order of ``STAGE_KEYWORDS``; the first
    stage with any keyword appearing as a substring of the lower-cased text
    wins.

    Args:
        text: Message to inspect; ``None`` or empty is treated as "".
        fallback_stage: Value returned unchanged when no keyword matches.

    Returns:
        The matching stage key, or ``fallback_stage``.
    """
    haystack = (text or "").lower()
    hits = (
        stage_name
        for stage_name, stage_words in STAGE_KEYWORDS.items()
        if any(stage_word in haystack for stage_word in stage_words)
    )
    # next() with a default stops at the first hit, like the early return of a loop.
    return next(hits, fallback_stage)
|
||
|
||
|
||
def _looks_like_fragment_only(text: str) -> bool:
|
||
"""
|
||
保守启发式:明显为档案点/标签句。
|
||
命中时仍进回忆录正文,章节映射为 ``summary``(与 LLM 返回 none 一致)。
|
||
"""
|
||
s = (text or "").strip()
|
||
if not s:
|
||
return True
|
||
if _BIRTH_YEAR_LINE.match(s):
|
||
return True
|
||
if len(s) <= _FRAGMENT_SHORT_MAX_LEN and _SHORT_HUKOU_STYLE.match(s):
|
||
return True
|
||
return False
|
||
|
||
|
||
def _normalize_llm_category(raw: str) -> str:
|
||
"""去掉模型偶发的引号、反引号包裹。"""
|
||
s = (raw or "").strip().lower()
|
||
if s.startswith("`"):
|
||
s = s.strip("`").strip()
|
||
if (s.startswith('"') and s.endswith('"')) or (
|
||
s.startswith("'") and s.endswith("'")
|
||
):
|
||
s = s[1:-1].strip()
|
||
return s
|
||
|
||
|
||
@dataclass(frozen=True)
class ChapterClassifyResult:
    """Immutable outcome of a chapter classification.

    ``llm_said_none`` is True only when the LLM path ran and its answer parsed
    to ``none``; the fragment heuristic never sets it.
    """

    # Chapter category key assigned to the content.
    category: str
    # Whether the LLM explicitly answered "none" (defaults to False).
    llm_said_none: bool = False
|
||
|
||
|
||
def _parse_category_from_llm_response(raw: str) -> str:
|
||
"""优先解析 JSON ``{"category": "..."}``,失败则按纯文本 key 处理。"""
|
||
s = (raw or "").strip()
|
||
if not s:
|
||
return ""
|
||
try:
|
||
data = json.loads(extract_json_payload(s))
|
||
if isinstance(data, dict) and "category" in data:
|
||
return _normalize_llm_category(str(data["category"]))
|
||
except (json.JSONDecodeError, TypeError, ValueError):
|
||
pass
|
||
return _normalize_llm_category(s)
|
||
|
||
|
||
class ClassificationAgent:
    """Assign memoir content to one of the chapter categories.

    Content that the LLM labels ``none``, or that the fragment heuristic
    flags, is mapped to the ``summary`` chapter instead of being dropped, so
    it still continues into the Story pipeline.
    """

    def classify(
        self,
        text: str,
        fallback_stage: str,
        llm: Any,
        *,
        segment_id: str | None = None,
    ) -> ChapterClassifyResult:
        """Classify ``text`` into a chapter category.

        Resolution order:

        1. Fragment heuristic hit -> ``summary`` with ``llm_said_none=False``.
        2. LLM (when ``llm`` is truthy): ``none`` -> ``summary`` with
           ``llm_said_none=True``; a value in ``CHAPTER_CATEGORIES`` -> that
           value.
        3. Otherwise (no LLM, LLM error, or unrecognised LLM answer) -> keyword
           based stage detection mapped to a default category.

        Args:
            text: Content to classify.
            fallback_stage: 5-stage label used when no keyword matches.
            llm: Model handle passed to ``invoke_json_object``; a falsy value
                skips the LLM step. The original contract states it must
                support a synchronous ``.invoke(prompt)`` call.
            segment_id: Optional identifier, used only in log lines.

        Returns:
            A ``ChapterClassifyResult``. ``llm_said_none`` is True only when
            the LLM explicitly answered ``none``.
        """
        if _looks_like_fragment_only(text):
            logger.info(
                "event=chapter_classification_summary_fallback reason=fragment_heuristic "
                "segment_id={} text_len={} category={}",
                segment_id or "",
                len(text or ""),
                _SUMMARY_FALLBACK_CATEGORY,
            )
            return ChapterClassifyResult(
                category=_SUMMARY_FALLBACK_CATEGORY,
                llm_said_none=False,
            )

        if llm:
            try:
                llm_result = self._classify_with_llm(text, llm, segment_id)
            except Exception as e:
                # Deliberate best-effort: any LLM/parsing failure degrades to
                # the keyword fallback below instead of failing the caller.
                logger.warning("ClassificationAgent LLM 章节分类失败: {}", e)
            else:
                if llm_result is not None:
                    return llm_result

        return self._classify_by_keywords(text, fallback_stage)

    def _classify_with_llm(
        self,
        text: str,
        llm: Any,
        segment_id: str | None,
    ) -> ChapterClassifyResult | None:
        """Ask the LLM for a category; return None when its answer is unusable."""
        prompt = get_chapter_classification_json_prompt(text)
        raw = invoke_json_object(
            llm,
            prompt,
            max_tokens=256,
            agent="ClassificationAgent.classify",
        )
        category = _parse_category_from_llm_response(raw)

        if category == "none":
            logger.info(
                "event=chapter_classification_summary_fallback reason=llm_none "
                "segment_id={} text_len={} category={}",
                segment_id or "",
                len(text or ""),
                _SUMMARY_FALLBACK_CATEGORY,
            )
            return ChapterClassifyResult(
                category=_SUMMARY_FALLBACK_CATEGORY,
                llm_said_none=True,
            )

        if category in CHAPTER_CATEGORIES:
            return ChapterClassifyResult(category=category, llm_said_none=False)

        # Previously this case fell through without a trace; log it so prompt
        # or model drift is visible. The value is truncated to keep logs small.
        logger.warning(
            "event=chapter_classification_unrecognized_category "
            "segment_id={} text_len={} category={}",
            segment_id or "",
            len(text or ""),
            category[:64],
        )
        return None

    def _classify_by_keywords(
        self,
        text: str,
        fallback_stage: str,
    ) -> ChapterClassifyResult:
        """Map the keyword-detected 5-stage label to its default category."""
        stage = _detect_stage(text, fallback_stage)
        cat = _STAGE_TO_DEFAULT_CATEGORY.get(
            stage,
            _STAGE_TO_DEFAULT_CATEGORY.get(fallback_stage, "childhood"),
        )
        return ChapterClassifyResult(category=cat, llm_said_none=False)
|