64 lines
1.9 KiB
Python
64 lines
1.9 KiB
Python
|
|
"""章节物化前对 story 正文的受限清洗:禁止表格、可选剥离与标题元数据重复的首行 heading。"""
|
||
|
|
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import re
|
||
|
|
|
||
|
|
|
||
|
|
def _is_table_row(line: str) -> bool:
|
||
|
|
s = line.strip()
|
||
|
|
if not s.startswith("|"):
|
||
|
|
return False
|
||
|
|
return s.count("|") >= 2
|
||
|
|
|
||
|
|
|
||
|
|
def strip_markdown_tables(text: str) -> str:
    """Remove GFM pipe-table blocks (runs of lines that start with ``|``).

    Every line recognised by ``_is_table_row`` is dropped; the remaining
    lines are re-joined with ``\\n`` and the result is stripped of leading
    and trailing whitespace. Empty or whitespace-only input yields ``""``.
    """
    source = str(text) if text else ""
    if not source.strip():
        return ""
    # Skipping each run of table rows is the same as filtering rows out one by one.
    kept = [row for row in source.splitlines() if not _is_table_row(row)]
    return "\n".join(kept).strip()
|
||
|
|
|
||
|
|
|
||
|
|
# Matches an ATX-style Markdown heading line: one to six "#", at least one
# whitespace character, then the heading text. Group 1 captures the text
# lazily, so trailing whitespace is left out of the capture.
_HEADING_LINE_RE = re.compile(r"^#{1,6}\s+(.+?)\s*$")
|
||
|
|
|
||
|
|
|
||
|
|
def _normalize_title_key(s: str) -> str:
|
||
|
|
return "".join((s or "").split()).casefold()
|
||
|
|
|
||
|
|
|
||
|
|
def strip_leading_heading_if_matches_title(body: str, story_title: str) -> str:
    """Drop the body's leading Markdown heading when it repeats the story title.

    The first non-blank line of ``body`` is inspected. If it is an ATX heading
    (``#`` to ``######``) whose text equals ``story_title`` once both are
    reduced by ``_normalize_title_key``, then that line, any blank lines before
    it, and the newlines directly after it are removed. A heading written with
    a closing ``#`` sequence (``# Title #``) is also recognised.

    Args:
        body: Story text to clean.
        story_title: Title taken from the story metadata.

    Returns:
        The body without the duplicated heading. When nothing matches, ``body``
        is returned unchanged (``""`` when ``body`` is falsy).
    """
    if not body or not str(body).strip():
        return body or ""
    title_key = _normalize_title_key(story_title or "")
    if not title_key:
        return body

    lines = str(body).splitlines()
    # Blank lines ahead of the first content line must not hide the heading.
    first = next((i for i, line in enumerate(lines) if line.strip()), None)
    if first is None:
        return body
    m = _HEADING_LINE_RE.match(lines[first].strip())
    if not m:
        return body

    heading = m.group(1)
    # Accept the literal heading text (the previous behaviour) as well as the
    # text with an optional closing "#" run removed; the closing run only
    # counts when whitespace precedes it, so a title such as "C#" stays intact.
    candidates = {
        _normalize_title_key(heading),
        _normalize_title_key(re.sub(r"\s+#+$", "", heading)),
    }
    if title_key not in candidates:
        return body

    return "\n".join(lines[first + 1:]).lstrip("\n")
|
||
|
|
|
||
|
|
|
||
|
|
def sanitize_story_for_chapter_compose(body: str, story_title: str) -> str:
    """Clean a story body before it is materialized into a chapter.

    Runs ``strip_markdown_tables`` first, then
    ``strip_leading_heading_if_matches_title``, and returns the result with
    surrounding whitespace stripped (``""`` when nothing is left).
    """
    without_tables = strip_markdown_tables(body if body else "")
    cleaned = strip_leading_heading_if_matches_title(without_tables, story_title)
    if not cleaned:
        return ""
    return cleaned.strip()
|