修复:CI 部署环境与 ref 错配、迁移碎片化、图片意图 source_span、章节物化脏版式、会话历史与本地语音不一致
新增:TTS 上传 COS 与分片、章节 reading_segments 物化与快照、markdown 清洗、会话消息 repository、语音 store 重构与相关测试
This commit is contained in:
63
api/app/features/memoir/markdown_sanitize.py
Normal file
63
api/app/features/memoir/markdown_sanitize.py
Normal file
@@ -0,0 +1,63 @@
|
||||
"""章节物化前对 story 正文的受限清洗:禁止表格、可选剥离与标题元数据重复的首行 heading。"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
|
||||
def _is_table_row(line: str) -> bool:
    """Return True when *line* looks like a pipe-table row.

    A row is any line whose stripped text begins with ``|`` and contains at
    least two ``|`` characters in total.
    """
    stripped = line.strip()
    return stripped.startswith("|") and stripped.count("|") >= 2
|
||||
|
||||
|
||||
def strip_markdown_tables(text: str) -> str:
    """Drop every pipe-table row from *text*.

    Each line accepted by ``_is_table_row`` is discarded, which removes whole
    runs of consecutive ``|``-prefixed lines (GFM pipe tables). Empty or
    whitespace-only input yields ``""``; the joined result is stripped of
    leading and trailing whitespace.
    """
    source = str(text) if text else ""
    if not source.strip():
        return ""
    kept = [row for row in source.splitlines() if not _is_table_row(row)]
    return "\n".join(kept).strip()
|
||||
|
||||
|
||||
# ATX heading: 1-6 ``#`` markers, whitespace, the heading text, then an
# optional closing run of ``#`` that must be separated from the text by
# whitespace (so "# C#" keeps its trailing "#", while "# Title ##" captures
# just "Title").
_HEADING_LINE_RE = re.compile(r"^#{1,6}\s+(.+?)(?:\s+#+)?\s*$")


def _normalize_title_key(s: str) -> str:
    """Return *s* with all whitespace removed and case-folded, for comparison."""
    return "".join((s or "").split()).casefold()


def strip_leading_heading_if_matches_title(body: str, story_title: str) -> str:
    """Remove the leading markdown heading when it repeats the story title.

    The first non-blank line of *body* is examined. If it is an ATX heading
    whose text equals *story_title* after normalization (whitespace removed,
    case-folded), that line and any blank lines before it are dropped, and
    newlines immediately following it are stripped.

    Args:
        body: Story text, possibly starting with a heading.
        story_title: Title stored in the story metadata.

    Returns:
        *body* without the duplicate heading, or *body* unchanged when there
        is no matching heading or the title is empty. An empty/None body
        yields ``""``; a whitespace-only body is returned as-is.
    """
    if not body or not str(body).strip():
        return body or ""
    title_key = _normalize_title_key(story_title or "")
    if not title_key:
        return body
    lines = str(body).splitlines()
    # Skip leading blank lines so a heading preceded by empty lines is still
    # recognized as the first line of content.
    first = next((idx for idx, ln in enumerate(lines) if ln.strip()), None)
    if first is None:
        return body
    match = _HEADING_LINE_RE.match(lines[first].strip())
    if match is None or _normalize_title_key(match.group(1)) != title_key:
        return body
    return "\n".join(lines[first + 1:]).lstrip("\n")
|
||||
|
||||
|
||||
def sanitize_story_for_chapter_compose(body: str, story_title: str) -> str:
    """Clean a story body before it is materialized into a chapter.

    Pipe-table rows are removed first, then a leading heading that duplicates
    *story_title* is dropped; the result is returned stripped of surrounding
    whitespace.
    """
    without_tables = strip_markdown_tables(body or "")
    without_title = strip_leading_heading_if_matches_title(without_tables, story_title)
    if not without_title:
        return ""
    return without_title.strip()
|
||||
Reference in New Issue
Block a user