"""Transcript chunker — split raw text into retrieval-ready chunks."""
import re
def chunk_transcript(
    text: str, *, max_chars: int = 800, overlap_chars: int = 100
) -> list[str]:
    """Split transcript text into overlapping, sentence-aligned chunks.

    Sizes are measured in characters, used as a rough proxy for token
    count (the original note estimates ~4 characters per token for
    Chinese; that ratio is not verified here).

    Each chunk is at most ``max_chars`` characters long. When a chunk is
    not the last one, it is shortened to end just after the last
    sentence separator found in its second half, if there is one.
    Consecutive chunks share up to ``overlap_chars`` characters.

    Args:
        text: Raw transcript text. Surrounding whitespace is ignored.
        max_chars: Maximum length of a chunk, in characters.
        overlap_chars: Number of trailing characters of a chunk that are
            repeated at the start of the next one. Negative values are
            treated as 0. If the overlap is not smaller than the chunk
            just emitted, that step falls back to no overlap so the scan
            always moves forward.

    Returns:
        The chunks in document order, each stripped of surrounding
        whitespace. Empty or whitespace-only input yields ``[]``.

    Raises:
        ValueError: If ``text`` is non-blank and ``max_chars`` is not
            positive.
    """
    if not text or not text.strip():
        return []
    if max_chars <= 0:
        # A non-positive window can never make progress through the text.
        raise ValueError("max_chars must be a positive integer")

    text = text.strip()
    if len(text) <= max_chars:
        return [text]

    # Tried in priority order: the first separator that occurs in the
    # second half of the window wins. Both full-width and ASCII forms of
    # the sentence-ending punctuation are accepted.
    separators = ("。", "!", "?", "!", "?", "\n", ";", ";", ".")
    overlap = max(overlap_chars, 0)
    min_cut = max_chars // 2
    total = len(text)

    chunks: list[str] = []
    start = 0
    while start < total:
        end = min(start + max_chars, total)

        # Prefer to cut at the end of a sentence, but only if that keeps
        # the chunk longer than half the window.
        if end < total:
            window = text[start:end]
            for sep in separators:
                cut = window.rfind(sep)
                if cut > min_cut:
                    end = start + cut + 1
                    break

        piece = text[start:end].strip()
        if piece:
            chunks.append(piece)

        if end >= total:
            # The tail has been emitted; stepping back for overlap here
            # would only re-emit a suffix of it.
            break

        # Step back by the overlap, unless that would fail to advance.
        resume = end - overlap
        start = resume if resume > start else end
    return chunks