Files
life-echo/api/app/features/memoir/reading_segment_materialize.py
Kevin 53d9e003af feat(api): 叙事 prompt、职业上下文、读路径章节、WS 解耦与错误脱敏
- 回忆录:事实边界补充允许清单;传记文体示例与 JSON 叙事要求对齐
- default 职业提示 occupation_context;cadre/military 退休语境
- GET 章节读路径零写入,prepare_chapter_read_view + markdown_for_response
- 文本归一抽到 core/text_normalize;移除弃用 reply 策略与 recompose_chapters_for_story
- ConversationService:WS 连接/用户段落/结束对话;对外错误固定文案
- 测试:HTTP 脱敏契约、章节读视图、occupation 与 background_voice
2026-04-01 11:55:52 +08:00

213 lines
8.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""章节阅读片段物化与「可读字数」门槛(单一事实源)。
**字数阈值** ``MIN_STORY_CHARS_IN_CHAPTER``(当前 300):对 Markdown 去图片/链接噪声后
用 ``story_plain_text_char_count`` 估算字符数,用于:
- **单篇故事**:是否写入 ``reading_segments_json``、是否参与 ``chapter_markdown_compose`` 拼接;
- **章节**:``chapter_meets_minimum_display`` / ``chapter_body_meets_minimum_for_display`` 是否对
用户展示(列表/详情/PDF 见 ``MemoirService``)。
**物化**:``build_reading_segments_snapshot`` 与 canonical 同路径写入 ``reading_segments_json``(无签名 URL)。
**API**:``hydrate_reading_segments_from_snapshot`` 解析快照(含签名 URL);旧快照亦按当前阈值过滤。
``resolve_reading_segments_for_chapter_detail`` 仅读已物化快照。
其它引用:``repo.append_chapter_compose``、``helpers.chapter_to_dict``(经 ``resolve_reading_segments…``)、
``chapter_markdown_compose``(故事拼接)。
"""
from __future__ import annotations
import re
from typing import Any
from app.features.memoir.asset_resolver import (
collect_asset_ids_from_markdown,
resolve_asset_refs_in_markdown,
strip_asset_image_refs_from_markdown,
strip_image_placeholders,
)
from app.features.memoir.markdown_sanitize import sanitize_story_for_chapter_compose
from app.features.memoir.models import Chapter
# Minimum readable character count shared by two gates: whether a story is
# included in a chapter, and whether a chapter is shown to readers. The count
# is the one produced by story_plain_text_char_count.
MIN_STORY_CHARS_IN_CHAPTER = 300
# Matches any run of whitespace; story_plain_text_char_count uses it to drop
# whitespace before counting characters.
_WS_COLLAPSE = re.compile(r"\s+")
def story_plain_text_char_count(markdown: str) -> int:
    """Estimate the number of readable characters in a Markdown body.

    The text is passed through ``strip_image_placeholders`` and
    ``strip_asset_image_refs_from_markdown`` (project helpers; presumably they
    remove image placeholders and asset image references -- confirm in
    ``asset_resolver``), then generic Markdown noise is removed before
    counting: images are dropped, links are reduced to their visible text,
    inline-code backticks and heading markers are removed, leftover emphasis
    characters are deleted, and all whitespace is discarded.

    Args:
        markdown: Markdown source of a story or chapter.

    Returns:
        The length of the remaining text; ``0`` for empty or
        whitespace-only input.
    """
    if not markdown or not str(markdown).strip():
        return 0
    text = strip_image_placeholders(markdown)
    text = strip_asset_image_refs_from_markdown(text)
    # Images must be removed BEFORE links: the link pattern also matches the
    # "[alt](url)" tail of "![alt](url)", which would turn an image into
    # "!alt" and wrongly count its alt text as readable content.
    text = re.sub(r"!\[([^\]]*)\]\([^)]+\)", "", text)
    # Keep only the visible link text. Empty link text is accepted so that a
    # linked image "[![alt](img)](href)" (now "[](href)") disappears entirely.
    text = re.sub(r"\[([^\]]*)\]\([^)]+\)", r"\1", text)
    # Unwrap inline code spans, keeping their content.
    text = re.sub(r"`+([^`]+)`+", r"\1", text)
    # Drop ATX heading markers at the start of a line.
    text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)
    # Remaining emphasis/heading/code punctuation does not count as text.
    text = re.sub(r"[*_#`]", "", text)
    # Whitespace is not counted either.
    return len(_WS_COLLAPSE.sub("", text))
def story_meets_minimum_chapter_length(markdown: str) -> bool:
    """Tell whether a single story body is long enough to be included in a chapter.

    The readable character count from ``story_plain_text_char_count`` is
    compared against ``MIN_STORY_CHARS_IN_CHAPTER``. Both snapshot
    materialization and snapshot hydration in this module use this check.
    """
    readable_chars = story_plain_text_char_count(markdown)
    return readable_chars >= MIN_STORY_CHARS_IN_CHAPTER
def chapter_body_meets_minimum_for_display(canonical_markdown: str) -> bool:
    """Tell whether a chapter's canonical Markdown is long enough to show to readers.

    Uses the same threshold as the per-story check
    (``MIN_STORY_CHARS_IN_CHAPTER``). A falsy value is treated as an empty body.
    """
    body = canonical_markdown or ""
    readable_chars = story_plain_text_char_count(body)
    return readable_chars >= MIN_STORY_CHARS_IN_CHAPTER
def chapter_meets_minimum_display(
    ch: Any, *, canonical_markdown_override: str | None = None
) -> bool:
    """Decide whether a chapter may be shown to readers.

    Args:
        ch: Object whose ``canonical_markdown`` attribute (if any) holds the
            chapter body.
        canonical_markdown_override: When not ``None``, this text is judged
            instead of the attribute on ``ch`` (e.g. a body assembled on the
            read path).

    Returns:
        ``True`` when the selected body reaches the minimum readable length.
    """
    if canonical_markdown_override is None:
        # Fall back to the stored body; a missing or empty value counts as "".
        stored = getattr(ch, "canonical_markdown", None) or ""
        candidate = str(stored)
    else:
        candidate = canonical_markdown_override
    return chapter_body_meets_minimum_for_display(candidate)
def _primary_story_intent_asset_id(story: Any) -> str | None:
    """Return the asset id (as ``str``) of the story's first primary image intent.

    Only the first intent whose ``intent_role`` equals ``"primary"`` is
    considered; if it has no asset id, or there is no such intent, the result
    is ``None``.
    """
    intents = getattr(story, "image_intents", None) or []
    first_primary = next(
        (
            intent
            for intent in intents
            if getattr(intent, "intent_role", None) == "primary"
        ),
        None,
    )
    if first_primary is None:
        return None
    asset_id = getattr(first_primary, "asset_id", None)
    if not asset_id:
        return None
    return str(asset_id)
def _cover_intent_snapshot_from_story(story: Any) -> dict | None:
    """Build the JSON-persistable metadata of the story's primary image intent.

    The result carries no URL. ``None`` is returned when the story has no
    primary intent or that intent has no asset id.
    """
    candidates = getattr(story, "image_intents", None) or []
    chosen = next(
        (c for c in candidates if getattr(c, "intent_role", None) == "primary"),
        None,
    )
    if not chosen:
        return None

    asset_id = getattr(chosen, "asset_id", None)
    if not asset_id:
        return None

    def _iso_or_none(moment: Any) -> Any:
        # Timestamps are stored as ISO-8601 strings; unset ones stay None.
        return moment.isoformat() if moment else None

    snapshot: dict = {"asset_id": str(asset_id)}
    snapshot["status"] = getattr(chosen, "status", None) or "pending"
    snapshot["description"] = getattr(chosen, "caption", None) or "故事配图"
    snapshot["prompt"] = getattr(chosen, "prompt_brief", None)
    snapshot["style"] = getattr(chosen, "style_profile", None)
    snapshot["error"] = getattr(chosen, "error", None)
    snapshot["created_at"] = _iso_or_none(chosen.created_at)
    snapshot["updated_at"] = _iso_or_none(chosen.updated_at)
    return snapshot
def _cover_dict_from_snapshot_row(
    snap: dict[str, Any], asset_url_map: dict[str, str]
) -> dict:
    """Expand a persisted cover snapshot into the full image dict.

    Args:
        snap: Cover metadata as stored in the snapshot.
        asset_url_map: Mapping from asset id (string) to URL; used to fill
            ``url`` when the snapshot has an asset id.

    Returns:
        A dict with the fixed key set below; fields the snapshot does not
        carry are ``None`` and ``url`` is ``None`` when no mapping exists.
    """
    asset_id = snap.get("asset_id")
    resolved_url = None
    if asset_id:
        resolved_url = asset_url_map.get(str(asset_id))

    description = snap.get("description") or "故事配图"
    status = snap.get("status") or "pending"

    # Keyword order is kept identical to the original key order.
    return dict(
        placeholder="",
        description=description,
        index=0,
        status=status,
        prompt=snap.get("prompt"),
        url=resolved_url,
        storage_key=None,
        provider=None,
        style=snap.get("style"),
        size=None,
        error=snap.get("error"),
        retryable=None,
        created_at=snap.get("created_at"),
        updated_at=snap.get("updated_at"),
    )
def build_reading_segments_snapshot(ch: Chapter) -> list[dict[str, Any]]:
    """Materialize the reading-segment snapshot for a chapter.

    Each linked story (ordered by ``order_index``) is sanitized with
    ``sanitize_story_for_chapter_compose``; stories whose sanitized body is
    empty or below the minimum readable length are skipped. The stored body
    is the sanitized Markdown as-is. ``cover_asset`` holds only the primary
    intent metadata, and is omitted (``None``) when the body already
    references that same asset inline.

    Returns:
        A list of ``{"story_id", "body_markdown", "cover_asset"}`` dicts.
    """
    ordered_links = sorted(
        getattr(ch, "story_links", None) or [],
        key=lambda link: getattr(link, "order_index", 0),
    )
    segments: list[dict[str, Any]] = []
    for link in ordered_links:
        story = getattr(link, "story", None)
        if story is None:
            continue

        heading = (getattr(story, "title", None) or "").strip()
        source_md = (getattr(story, "canonical_markdown", None) or "").strip()
        body = sanitize_story_for_chapter_compose(source_md, heading)
        if not (body and story_meets_minimum_chapter_length(body)):
            continue

        primary_id = _primary_story_intent_asset_id(story)
        referenced_ids = set(collect_asset_ids_from_markdown(body))
        # A separate cover is only kept when the body does not already embed it.
        needs_cover = bool(primary_id) and primary_id not in referenced_ids
        cover: dict | None = (
            _cover_intent_snapshot_from_story(story) if needs_cover else None
        )

        segments.append(
            {
                "story_id": story.id,
                "body_markdown": body,
                "cover_asset": cover,
            }
        )
    return segments
def hydrate_reading_segments_from_snapshot(
    ch: Chapter,
    asset_url_map: dict[str, str] | None = None,
) -> list[dict[str, Any]]:
    """Turn the persisted snapshot on ``ch`` into its API shape.

    Asset references in each body are resolved through ``asset_url_map`` and
    the cover metadata is expanded and normalized. Rows whose stored body is
    below the current minimum length are dropped, so older snapshots are
    filtered by today's threshold as well.

    Args:
        ch: Chapter whose ``reading_segments_json`` holds the snapshot rows.
        asset_url_map: Mapping from asset id to URL; defaults to empty.

    Returns:
        A list of ``{"story_id", "body_markdown", "cover_asset"}`` dicts.
    """
    # Imported inside the function, as in the original (reason not visible here).
    from app.features.memoir import helpers as memoir_helpers

    url_by_asset = asset_url_map or {}

    def _url_for(aid):
        return url_by_asset.get(aid)

    hydrated: list[dict[str, Any]] = []
    for entry in getattr(ch, "reading_segments_json", None) or []:
        stored_body = entry.get("body_markdown") or ""
        # Same gate as at materialization time.
        if not story_meets_minimum_chapter_length(stored_body):
            continue

        resolved_body = resolve_asset_refs_in_markdown(stored_body, _url_for)
        cover_meta = entry.get("cover_asset")
        cover = (
            memoir_helpers.first_normalized_image_for_api(
                _cover_dict_from_snapshot_row(cover_meta, url_by_asset)
            )
            if cover_meta
            else None
        )
        hydrated.append(
            {
                "story_id": entry["story_id"],
                "body_markdown": resolved_body,
                "cover_asset": cover,
            }
        )
    return hydrated
def resolve_reading_segments_for_chapter_detail(
    ch: Chapter,
    asset_url_map: dict[str, str] | None = None,
) -> list[dict[str, Any]]:
    """Return reading segments for the chapter detail view.

    Only an already materialized snapshot is read: when
    ``reading_segments_json`` is ``None`` the result is an empty list,
    otherwise the snapshot is hydrated with ``asset_url_map``.
    """
    url_map = asset_url_map or {}
    snapshot = getattr(ch, "reading_segments_json", None)
    if snapshot is not None:
        return hydrate_reading_segments_from_snapshot(ch, asset_url_map=url_map)
    return []