2026-03-19 10:38:11 +08:00
|
|
|
|
"""
|
2026-03-27 16:01:28 +08:00
|
|
|
|
ClassificationAgent:将内容分类到 8 个章节类别之一。
|
2026-03-23 13:54:41 +08:00
|
|
|
|
|
2026-03-27 16:01:28 +08:00
|
|
|
|
原「LLM 返回 none / 零散档案启发式」不再跳过 Story:统一映射为 ``summary`` 章节,
|
|
|
|
|
|
仍走叙事流水线落库;与 StoryRoute 仍兼容(批次内 new/append 规划不变)。
|
|
|
|
|
|
Memory ingest 由 Celery 任务在批次级先行完成,与分类结果独立。
|
2026-03-19 10:38:11 +08:00
|
|
|
|
"""
|
2026-03-19 14:36:14 +08:00
|
|
|
|
|
2026-03-19 10:38:11 +08:00
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
2026-03-26 12:13:36 +08:00
|
|
|
|
import json
|
2026-03-23 13:54:41 +08:00
|
|
|
|
import re
|
2026-03-31 23:55:26 +08:00
|
|
|
|
from dataclasses import dataclass
|
2026-03-27 16:01:28 +08:00
|
|
|
|
from typing import Any
|
2026-03-19 10:38:11 +08:00
|
|
|
|
|
2026-04-08 15:37:09 +08:00
|
|
|
|
from pydantic import ValidationError
|
|
|
|
|
|
|
2026-04-02 12:00:00 +08:00
|
|
|
|
from app.agents.memoir.prompts import get_chapter_classification_json_prompt
|
2026-04-03 13:34:27 +08:00
|
|
|
|
from app.agents.memoir.schemas import ClassificationOutput
|
2026-04-02 16:37:14 +08:00
|
|
|
|
from app.agents.stage_constants import (
|
|
|
|
|
|
CHAPTER_CATEGORIES,
|
|
|
|
|
|
STAGE_KEYWORD_WEIGHTS,
|
|
|
|
|
|
STAGE_TO_DEFAULT_CATEGORY,
|
|
|
|
|
|
)
|
2026-04-03 13:34:27 +08:00
|
|
|
|
from app.core.config import settings
|
2026-04-02 12:00:00 +08:00
|
|
|
|
from app.core.json_utils import extract_json_payload
|
2026-04-08 15:37:09 +08:00
|
|
|
|
from app.core.llm_call import LLMCallError, llm_json_call
|
2026-03-22 16:45:57 +08:00
|
|
|
|
from app.core.logging import get_logger
|
2026-05-22 13:44:50 +08:00
|
|
|
|
from app.features.memoir.constants import memoir
|
2026-03-19 10:38:11 +08:00
|
|
|
|
|
|
|
|
|
|
# Module-level logger, obtained from the project's get_logger helper and named after this module.
logger = get_logger(__name__)
|
|
|
|
|
|
|
2026-03-27 16:01:28 +08:00
|
|
|
|
# Fallback chapter used when the model answers "none" or the loose-profile
# heuristic fires; such content is still written into the memoir body.
_SUMMARY_FALLBACK_CATEGORY = "summary"

# Length cap used together with the "profile-only sentence" pattern below;
# text that is short but clearly narrative is still left to the LLM to judge.
_FRAGMENT_SHORT_MAX_LEN = 48

# The whole text is nothing but a birth-year / year statement
# (a loose profile fact, not a story).
_BIRTH_YEAR_LINE = re.compile(
    r"^[\s\u200b]*(?:我)?\d{4}\s*年\s*(出生|生的|生)?\s*[。.!!]?[\s\u200b]*$",
    flags=re.UNICODE,
)

# Very short "I am from <place>" style origin label with no narrative detail.
_SHORT_HUKOU_STYLE = re.compile(
    r"^[\s\u200b]*(?:我)?是[\u4e00-\u9fff]{1,10}(人|籍)\s*[。.!!]?[\s\u200b]*$",
    flags=re.UNICODE,
)
|
|
|
|
|
|
|
2026-03-19 10:38:11 +08:00
|
|
|
|
|
|
|
|
|
|
def _detect_stage(text: str, fallback_stage: str) -> str:
    """Return the first stage whose keyword list hits ``text``.

    Keywords are read from ``STAGE_KEYWORD_WEIGHTS``. Each entry is unpacked as
    a pair, but only the first element (the keyword) is used: matching is a
    plain substring test against the lower-cased text, not a weighted score.
    Stages are tried in the mapping's iteration order and the first hit wins.

    Args:
        text: Message to inspect; ``None`` or "" is treated as empty text.
        fallback_stage: Stage returned when no keyword matches.

    Returns:
        The matching stage key, or ``fallback_stage``.
    """
    lowered = text.lower() if text else ""
    # NOTE(review): the text is lower-cased but the keywords are not, so a
    # keyword containing upper-case letters can never match — confirm the
    # constants are stored lower-case.
    hits = (
        stage_name
        for stage_name, keyword_pairs in STAGE_KEYWORD_WEIGHTS.items()
        for keyword, _weight in keyword_pairs
        if keyword in lowered
    )
    return next(hits, fallback_stage)
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-03-23 13:54:41 +08:00
|
|
|
|
def _looks_like_fragment_only(text: str) -> bool:
    """Conservative heuristic: is ``text`` an obvious profile fact or label?

    Returns ``True`` when the stripped text is empty, when it is solely a
    birth-year / year statement (``_BIRTH_YEAR_LINE``), or when it is no longer
    than ``_FRAGMENT_SHORT_MAX_LEN`` and matches the "I am from <place>" label
    pattern (``_SHORT_HUKOU_STYLE``). A hit does not drop the text:
    ``ClassificationAgent.classify`` maps it to the ``summary`` chapter.
    """
    stripped = text.strip() if text else ""
    if stripped == "":
        return True
    return bool(
        _BIRTH_YEAR_LINE.match(stripped)
        or (
            len(stripped) <= _FRAGMENT_SHORT_MAX_LEN
            and _SHORT_HUKOU_STYLE.match(stripped)
        )
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _normalize_llm_category(raw: str) -> str:
    """Normalise a category key returned by the model.

    The value is stripped and lower-cased, then unwrapped from the quoting a
    model occasionally adds: backticks (including code-fence style runs) and a
    matching pair of single or double quotes. Unwrapping repeats until the
    value stops changing, so nested wrappers are removed whichever one is
    outermost (quotes around backticks as well as backticks around quotes).

    Args:
        raw: Raw category text; ``None`` or "" is treated as empty.

    Returns:
        The bare lower-case key, or "" when nothing is left.
    """
    value = (raw or "").strip().lower()
    while True:
        unwrapped = value
        if unwrapped.startswith("`"):
            unwrapped = unwrapped.strip("`").strip()
        # Only a matching pair of quotes counts as a wrapper; a lone leading
        # or trailing quote is left untouched.
        if unwrapped[:1] in ('"', "'") and unwrapped.endswith(unwrapped[:1]):
            unwrapped = unwrapped[1:-1].strip()
        # Every successful unwrap shortens the value, so this terminates.
        if unwrapped == value:
            return value
        value = unwrapped
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-03-31 23:55:26 +08:00
|
|
|
|
@dataclass(frozen=True)
class ChapterClassifyResult:
    """Immutable outcome of a chapter classification.

    ``llm_said_none`` is ``True`` only when the LLM path was taken and the
    parsed answer was ``none``; the fragment heuristic leaves it ``False``.
    """

    # Chapter key the content was assigned to.
    category: str
    # True only when the LLM explicitly answered "none".
    llm_said_none: bool = False
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-03-26 12:13:36 +08:00
|
|
|
|
def _parse_category_from_llm_response(raw: str) -> str:
    """Extract the category key from a raw LLM reply.

    A JSON object of the form ``{"category": "..."}`` is preferred. When the
    reply cannot be decoded as JSON, or decodes to something without a
    ``category`` key, the whole reply is treated as a plain-text key.

    Args:
        raw: Raw reply text; ``None`` or "" is treated as empty.

    Returns:
        The normalised category key, or "" for empty / whitespace-only input.
    """
    reply = raw.strip() if raw else ""
    if reply == "":
        return ""
    try:
        decoded = json.loads(extract_json_payload(reply))
    except (json.JSONDecodeError, TypeError, ValueError):
        # Not JSON: fall through and treat the reply as a bare key.
        decoded = None
    if isinstance(decoded, dict) and "category" in decoded:
        # NOTE(review): str() turns a JSON null into "None", which the
        # normaliser lower-cases to "none" — confirm this is intended.
        return _normalize_llm_category(str(decoded["category"]))
    return _normalize_llm_category(reply)
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-03-19 10:38:11 +08:00
|
|
|
|
class ClassificationAgent:
    """Classify content into one of the chapter categories.

    ``none`` answers and loose profile fragments are mapped to the ``summary``
    chapter so that they still enter the Story pipeline.
    """

    def classify(
        self,
        text: str,
        fallback_stage: str,
        llm: Any,
        *,
        segment_id: str | None = None,
        language: str = "zh",
    ) -> ChapterClassifyResult:
        """Classify ``text`` into a chapter category.

        Resolution order:

        1. If the fragment heuristic fires, return the ``summary`` chapter
           without calling the LLM.
        2. If ``llm`` is truthy, ask it for a category. ``none`` maps to the
           ``summary`` chapter with ``llm_said_none=True``; a key found in
           ``CHAPTER_CATEGORIES`` is returned as-is.
        3. Otherwise (no LLM, LLM failure, or an unrecognised key) fall back to
           keyword stage detection and the stage's default category.

        Args:
            text: Content to classify.
            fallback_stage: Stage used when keyword detection finds nothing.
            llm: Model handle passed to ``llm_json_call``; a falsy value skips
                the LLM step. The original note says it must support a
                synchronous ``.invoke(prompt)`` call — TODO confirm against
                ``llm_json_call``.
            segment_id: Optional identifier, used only in log lines.
            language: Language code forwarded to the prompt builder.

        Returns:
            A ``ChapterClassifyResult``. ``llm_said_none`` is ``True`` only
            when the LLM explicitly answered ``none``.
        """
        if _looks_like_fragment_only(text):
            logger.info(
                "event=chapter_classification_summary_fallback reason=fragment_heuristic "
                "segment_id={} text_len={} category={}",
                segment_id or "",
                len(text or ""),
                _SUMMARY_FALLBACK_CATEGORY,
            )
            return ChapterClassifyResult(
                category=_SUMMARY_FALLBACK_CATEGORY,
                llm_said_none=False,
            )

        if llm:
            try:
                prompt = get_chapter_classification_json_prompt(text, language=language)
                out = llm_json_call(
                    llm,
                    prompt,
                    ClassificationOutput,
                    max_tokens=memoir.classification_max_tokens,
                    agent="ClassificationAgent.classify",
                )
                category = _normalize_llm_category(out.category)
                if category == "none":
                    logger.info(
                        "event=chapter_classification_summary_fallback reason=llm_none "
                        "segment_id={} text_len={} category={}",
                        segment_id or "",
                        len(text or ""),
                        _SUMMARY_FALLBACK_CATEGORY,
                    )
                    return ChapterClassifyResult(
                        category=_SUMMARY_FALLBACK_CATEGORY,
                        llm_said_none=True,
                    )
                if category in CHAPTER_CATEGORIES:
                    return ChapterClassifyResult(category=category, llm_said_none=False)
                # The model answered with a key we do not know. Record it
                # before dropping to the keyword fallback; otherwise this path
                # is indistinguishable from a normal keyword classification.
                logger.warning(
                    "event=chapter_classification_unknown_category "
                    "segment_id={} text_len={} category={}",
                    segment_id or "",
                    len(text or ""),
                    category,
                )
            except (LLMCallError, ValidationError, ValueError, KeyError) as e:
                # Best effort: an LLM failure must not block classification.
                logger.warning("ClassificationAgent LLM 章节分类失败: {}", e)

        # Keyword fallback: detected stage -> its default category, then the
        # fallback stage's default, then "childhood" as the last resort.
        stage = _detect_stage(text, fallback_stage)
        cat = STAGE_TO_DEFAULT_CATEGORY.get(
            stage,
            STAGE_TO_DEFAULT_CATEGORY.get(fallback_stage, "childhood"),
        )
        return ChapterClassifyResult(category=cat, llm_said_none=False)
|