Files
life-echo/api/app/features/memoir/markdown_sanitize.py
Kevin 8af37e5e8e 修复:CI 部署环境与 ref 错配、迁移碎片化、图片意图 source_span、章节物化脏版式、会话历史与本地语音不一致
新增:TTS 上传 COS 与分片、章节 reading_segments 物化与快照、markdown 清洗、会话消息 repository、语音 store 重构与相关测试
2026-03-20 16:43:02 +08:00

64 lines
1.9 KiB
Python

"""章节物化前对 story 正文的受限清洗:禁止表格、可选剥离与标题元数据重复的首行 heading。"""
from __future__ import annotations
import re
def _is_table_row(line: str) -> bool:
s = line.strip()
if not s.startswith("|"):
return False
return s.count("|") >= 2
def strip_markdown_tables(text: str) -> str:
    """Remove GFM pipe-table blocks (lines that start with ``|``).

    Empty or whitespace-only input yields an empty string. Otherwise every
    line accepted by ``_is_table_row`` is dropped, the remaining lines are
    re-joined with newlines, and the result is stripped of surrounding
    whitespace.
    """
    raw = str(text) if text else ""
    if not raw.strip():
        return ""
    kept = [row for row in raw.splitlines() if not _is_table_row(row)]
    return "\n".join(kept).strip()
_HEADING_LINE_RE = re.compile(r"^#{1,6}\s+(.+?)\s*$")
def _normalize_title_key(s: str) -> str:
return "".join((s or "").split()).casefold()
def strip_leading_heading_if_matches_title(body: str, story_title: str) -> str:
    """Drop the leading Markdown heading when it merely repeats the story title.

    The first non-blank line of ``body`` is examined. If it is a Markdown
    heading whose text equals ``story_title`` after normalisation (whitespace
    removed, case-folded), that line is removed together with the blank lines
    before it and directly after it. In every other case ``body`` is returned
    unchanged.

    Args:
        body: Story text in Markdown.
        story_title: Title the leading heading is compared against.

    Returns:
        The body without the duplicated heading, or ``body`` itself when
        nothing is removed (an empty string when ``body`` is falsy).
    """
    if not body or not str(body).strip():
        return body or ""
    title_key = _normalize_title_key(story_title or "")
    if not title_key:
        return body
    lines = str(body).splitlines()
    # Leading blank lines carry no meaning in Markdown, so look for the
    # heading on the first line with content instead of insisting on lines[0].
    first = next((i for i, ln in enumerate(lines) if ln.strip()), None)
    if first is None:
        return body
    found = _HEADING_LINE_RE.match(lines[first].strip())
    if found is None or _normalize_title_key(found.group(1)) != title_key:
        return body
    # Skip the heading plus any blank (empty or whitespace-only) lines that
    # follow it; indentation of the first content line is left untouched.
    start = first + 1
    while start < len(lines) and not lines[start].strip():
        start += 1
    return "\n".join(lines[start:])
def sanitize_story_for_chapter_compose(body: str, story_title: str) -> str:
    """Clean a story body before it is composed into a chapter.

    Tables are removed first, then a leading heading that duplicates the
    story title is dropped; the result is returned with surrounding
    whitespace stripped. A falsy ``body`` is treated as an empty string.
    """
    without_tables = strip_markdown_tables(body if body else "")
    without_heading = strip_leading_heading_if_matches_title(
        without_tables, story_title
    )
    if not without_heading:
        return ""
    return without_heading.strip()