""" asset:// 与正文占位符清理。 从正文移除 {{IMAGE:...}} / {{{{IMAGE:...}}}}(历史正文可能仍含此类标记)。 """ import re from typing import Callable _PLACEHOLDER_RE = re.compile( r"\{\{\{\{IMAGE:(.*?)\}\}\}\}|\{\{IMAGE:(.*?)\}\}", re.DOTALL, ) _ASSET_REF_RE = re.compile(r"!\[([^\]]*)\]\(asset://([a-zA-Z0-9_-]+)\)") _BLANK_RUN_RE = re.compile(r"\n{3,}") def strip_image_placeholders(text: str | None) -> str: """移除正文中的 IMAGE 占位符,保留其余 markdown。""" if not text: return "" return _PLACEHOLDER_RE.sub("", text).strip() def parse_asset_refs(markdown: str) -> list[tuple[int, int, str, str]]: refs = [] for m in _ASSET_REF_RE.finditer(markdown or ""): refs.append((m.start(), m.end(), m.group(1) or "", m.group(2) or "")) return refs def collect_asset_ids_from_markdown(markdown: str) -> list[str]: return [m.group(2) for m in _ASSET_REF_RE.finditer(markdown or "") if m.group(2)] def strip_asset_image_refs_from_markdown(markdown: str | None) -> str: """Remove all `![...](asset://...)` references; collapse blank lines. Used for story single-primary policy: new versions / backfill must not accumulate multiple inline asset images. """ if not markdown or not str(markdown).strip(): return "" text = _ASSET_REF_RE.sub("", markdown or "") text = _BLANK_RUN_RE.sub("\n\n", text) return text.strip() def collect_asset_ids_for_chapter(chapter) -> set[str]: """章节正文 canonical、收录的各 story 正文、cover_asset_id 中的 asset id。""" ids: set[str] = set() md = getattr(chapter, "canonical_markdown", None) or "" ids.update(collect_asset_ids_from_markdown(md)) cid = getattr(chapter, "cover_asset_id", None) if cid: ids.add(str(cid)) for link in getattr(chapter, "story_links", None) or []: st = getattr(link, "story", None) if st is None: continue smd = getattr(st, "canonical_markdown", None) or "" ids.update(collect_asset_ids_from_markdown(smd)) for intent in getattr(st, "image_intents", None) or []: if getattr(intent, "intent_role", None) == "primary": aid = getattr(intent, "asset_id", None) if aid: ids.add(str(aid)) return ids def collect_asset_ids_for_chapters(chapters: list) -> set[str]: combined: set[str] = set() for ch in chapters or []: combined |= collect_asset_ids_for_chapter(ch) return combined def split_markdown_by_asset_refs( markdown: str, resolve_asset: Callable[[str], str | None], ) -> list[dict]: blocks: list[dict] = [] refs = parse_asset_refs(markdown or "") if not refs: text = (markdown or "").strip() if text: blocks.append({"type": "text", "value": text}) return blocks pos = 0 for start, end, caption, asset_id in refs: if start > pos: text = markdown[pos:start].strip() if text: blocks.append({"type": "text", "value": text}) url = resolve_asset(asset_id) if asset_id else None if url: blocks.append({"type": "image", "url": url, "caption": caption}) pos = end if pos < len(markdown or ""): text = markdown[pos:].strip() if text: blocks.append({"type": "text", "value": text}) return blocks def resolve_asset_refs_in_markdown( markdown: str, resolve_asset: Callable[[str], str | None], ) -> str: if not markdown or not resolve_asset: return markdown or "" def repl(m): caption, asset_id = m.group(1) or "", m.group(2) or "" url = resolve_asset(asset_id) if asset_id else None if url: return f"![{caption}]({url})" return m.group(0) return _ASSET_REF_RE.sub(repl, markdown)