"""章节物化前对 story 正文的受限清洗:禁止表格、可选剥离与标题元数据重复的首行 heading。""" from __future__ import annotations import re def _is_table_row(line: str) -> bool: s = line.strip() if not s.startswith("|"): return False return s.count("|") >= 2 def strip_markdown_tables(text: str) -> str: """移除 GFM 管道表格块(连续以 | 开头的行)。""" if not text or not str(text).strip(): return "" lines = str(text).splitlines() out: list[str] = [] i = 0 while i < len(lines): if _is_table_row(lines[i]): while i < len(lines) and _is_table_row(lines[i]): i += 1 continue out.append(lines[i]) i += 1 return "\n".join(out).strip() _HEADING_LINE_RE = re.compile(r"^#{1,6}\s+(.+?)\s*$") def _normalize_title_key(s: str) -> str: return "".join((s or "").split()).casefold() def strip_leading_heading_if_matches_title(body: str, story_title: str) -> str: """若首行为 markdown 标题且与 story 标题(规范化后)一致,则移除该行。""" if not body or not str(body).strip(): return body or "" st_key = _normalize_title_key(story_title or "") if not st_key: return body lines = str(body).splitlines() if not lines: return body m = _HEADING_LINE_RE.match(lines[0].strip()) if not m: return body heading_key = _normalize_title_key(m.group(1)) if heading_key != st_key: return body rest = "\n".join(lines[1:]) return rest.lstrip("\n") def sanitize_story_for_chapter_compose(body: str, story_title: str) -> str: """物化章节前:去表格、去与元数据重复的首行标题。""" t = strip_markdown_tables(body or "") t = strip_leading_heading_if_matches_title(t, story_title) return (t or "").strip()