"""章节阅读片段物化与「可读字数」门槛(单一事实源)。 **字数阈值** ``MIN_STORY_CHARS_IN_CHAPTER``(当前 300):对 Markdown 去图片/链接噪声后 用 ``story_plain_text_char_count`` 估算字符数,用于: - **单篇故事**:是否写入 ``reading_segments_json``、是否参与 ``chapter_markdown_compose`` 拼接; - **章节**:``chapter_meets_minimum_display`` / ``chapter_body_meets_minimum_for_display`` 是否对 用户展示(列表/详情/PDF 见 ``MemoirService``)。 **物化**:``build_reading_segments_snapshot`` 与 canonical 同路径写入 ``reading_segments_json``(无签名 URL)。 **API**:``hydrate_reading_segments_from_snapshot`` 解析快照(含签名 URL);旧快照亦按当前阈值过滤。 ``resolve_reading_segments_for_chapter_detail`` 仅读已物化快照。 其它引用:``repo.append_chapter_compose``、``helpers.chapter_to_dict``(经 ``resolve_reading_segments…``)、 ``chapter_markdown_compose``(故事拼接)。 """ from __future__ import annotations import re from typing import Any from app.features.memoir.asset_resolver import ( collect_asset_ids_from_markdown, resolve_asset_refs_in_markdown, strip_asset_image_refs_from_markdown, strip_image_placeholders, ) from app.features.memoir.markdown_sanitize import sanitize_story_for_chapter_compose from app.features.memoir.models import Chapter # 故事收录章节、章节对读者展示:共用最小可读字数(与 story_plain_text_char_count 一致) MIN_STORY_CHARS_IN_CHAPTER = 300 _WS_COLLAPSE = re.compile(r"\s+") def story_plain_text_char_count(markdown: str) -> int: """估算 Markdown 正文可读字符数(中英按字计),用于故事/章节字数门槛。""" if not markdown or not str(markdown).strip(): return 0 t = strip_image_placeholders(markdown) t = strip_asset_image_refs_from_markdown(t) t = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", t) t = re.sub(r"!\[([^\]]*)\]\([^)]+\)", "", t) t = re.sub(r"`+([^`]+)`+", r"\1", t) t = re.sub(r"^#{1,6}\s+", "", t, flags=re.MULTILINE) # 剩余强调符等不计入「字数」 t = re.sub(r"[*_#`]", "", t) t = _WS_COLLAPSE.sub("", t) return len(t) def story_meets_minimum_chapter_length(markdown: str) -> bool: """单篇故事正文是否达到收录章节的阈值(物化快照、hydrate 过滤)。""" return story_plain_text_char_count(markdown) >= MIN_STORY_CHARS_IN_CHAPTER def chapter_body_meets_minimum_for_display(canonical_markdown: str) -> bool: """章节 canonical 是否达到对读者展示的最小可读字数(与单篇故事阈值一致)。""" return ( story_plain_text_char_count(canonical_markdown or "") >= MIN_STORY_CHARS_IN_CHAPTER ) def chapter_meets_minimum_display( ch: Any, *, canonical_markdown_override: str | None = None ) -> bool: """基于章节 canonical(或读路径临时物化串)判断是否可对读者展示。""" if canonical_markdown_override is not None: return chapter_body_meets_minimum_for_display(canonical_markdown_override) md = getattr(ch, "canonical_markdown", None) or "" return chapter_body_meets_minimum_for_display(str(md)) def _primary_story_intent_asset_id(story: Any) -> str | None: for it in getattr(story, "image_intents", None) or []: if getattr(it, "intent_role", None) == "primary": aid = getattr(it, "asset_id", None) return str(aid) if aid else None return None def _cover_intent_snapshot_from_story(story: Any) -> dict | None: """primary intent 元数据(无 url),供 JSON 持久化。""" intents = getattr(story, "image_intents", None) or [] primary = None for it in intents: if getattr(it, "intent_role", None) == "primary": primary = it break if not primary: return None aid = getattr(primary, "asset_id", None) if not aid: return None status = getattr(primary, "status", None) or "pending" return { "asset_id": str(aid), "status": status, "description": getattr(primary, "caption", None) or "故事配图", "prompt": getattr(primary, "prompt_brief", None), "style": getattr(primary, "style_profile", None), "error": getattr(primary, "error", None), "created_at": primary.created_at.isoformat() if primary.created_at else None, "updated_at": primary.updated_at.isoformat() if primary.updated_at else None, } def _cover_dict_from_snapshot_row( snap: dict[str, Any], asset_url_map: dict[str, str] ) -> dict: aid = snap.get("asset_id") url = asset_url_map.get(str(aid)) if aid else None return { "placeholder": "", "description": snap.get("description") or "故事配图", "index": 0, "status": snap.get("status") or "pending", "prompt": snap.get("prompt"), "url": url, "storage_key": None, "provider": None, "style": snap.get("style"), "size": None, "error": snap.get("error"), "retryable": None, "created_at": snap.get("created_at"), "updated_at": snap.get("updated_at"), } def build_reading_segments_snapshot(ch: Chapter) -> list[dict[str, Any]]: """ 物化阅读片段快照:body 保留 asset://;cover 仅存 intent 元数据(正文已含同 asset 则省略)。 与 append_chapter_compose_version 同路径写入。 """ links = sorted( list(getattr(ch, "story_links", None) or []), key=lambda x: getattr(x, "order_index", 0), ) out: list[dict[str, Any]] = [] for link in links: st = getattr(link, "story", None) if st is None: continue title = (getattr(st, "title", None) or "").strip() raw = (getattr(st, "canonical_markdown", None) or "").strip() body = sanitize_story_for_chapter_compose(raw, title) if not body: continue if not story_meets_minimum_chapter_length(body): continue primary_aid = _primary_story_intent_asset_id(st) inline_ids = set(collect_asset_ids_from_markdown(body)) cover: dict | None = None if primary_aid and primary_aid not in inline_ids: cover = _cover_intent_snapshot_from_story(st) out.append( { "story_id": st.id, "body_markdown": body, "cover_asset": cover, } ) return out def hydrate_reading_segments_from_snapshot( ch: Chapter, asset_url_map: dict[str, str] | None = None, ) -> list[dict[str, Any]]: """将持久化快照解析为 API 形态(签名 URL)。""" from app.features.memoir import helpers as h asset_url_map = asset_url_map or {} resolve = lambda aid: asset_url_map.get(aid) # noqa: E731 rows = getattr(ch, "reading_segments_json", None) or [] out: list[dict[str, Any]] = [] for row in rows: raw_body = row.get("body_markdown") or "" # 与物化时一致;旧库快照亦按当前阈值过滤 if not story_meets_minimum_chapter_length(raw_body): continue body = resolve_asset_refs_in_markdown(raw_body, resolve) ci = row.get("cover_asset") if ci: img_raw = _cover_dict_from_snapshot_row(ci, asset_url_map) img_norm = h.first_normalized_image_for_api(img_raw) else: img_norm = None out.append( { "story_id": row["story_id"], "body_markdown": body, "cover_asset": img_norm, } ) return out def resolve_reading_segments_for_chapter_detail( ch: Chapter, asset_url_map: dict[str, str] | None = None, ) -> list[dict[str, Any]]: """章节详情:仅读取已物化快照。""" asset_url_map = asset_url_map or {} if getattr(ch, "reading_segments_json", None) is None: return [] return hydrate_reading_segments_from_snapshot(ch, asset_url_map=asset_url_map)