2026-03-20 10:30:07 +08:00
|
|
|
|
"""Transcript chunker — split raw text into retrieval-ready chunks."""
|
|
|
|
|
|
|
2026-03-18 17:18:23 +08:00
|
|
|
|
|
2026-03-19 14:36:14 +08:00
|
|
|
|
def chunk_transcript(
    text: str, *, max_chars: int = 800, overlap_chars: int = 100
) -> list[str]:
    """Split transcript text into overlapping, whitespace-trimmed chunks.

    Length is measured in characters, used as a rough proxy for token count.
    Each chunk spans at most ``max_chars`` characters. When more text follows
    a window, the chunk is shortened to end right after a sentence separator,
    provided one is found past the midpoint of the window. Consecutive chunks
    share up to ``overlap_chars`` characters.

    Args:
        text: Raw transcript text; leading/trailing whitespace is ignored.
        max_chars: Maximum chunk length in characters.
        overlap_chars: Number of characters repeated from the end of one
            chunk at the start of the next; capped at ``max_chars - 1``.

    Returns:
        The chunks in document order. Empty or whitespace-only text yields
        an empty list; text that fits in ``max_chars`` yields one chunk.

    Raises:
        ValueError: If ``max_chars`` is not positive (for non-blank text), or
            if ``overlap_chars`` is negative and the text is long enough to
            need splitting.
    """
    if not text or not text.strip():
        return []

    text = text.strip()
    if len(text) <= max_chars:
        return [text]
    if max_chars <= 0:
        raise ValueError("max_chars must be positive")
    if overlap_chars < 0:
        raise ValueError("overlap_chars cannot be negative")
    # Cap the overlap so every step consumes at least one new character.
    overlap = min(overlap_chars, max_chars - 1)

    # Sentence separators in priority order: the first one found past the
    # window midpoint wins. Full-width CJK variants are last-resort fallbacks.
    separators = ("。", "!", "?", "\n", ";", ".", "！", "？", "；")
    total = len(text)
    min_cut = max_chars // 2

    chunks: list[str] = []
    start = 0

    while start < total:
        end = min(start + max_chars, total)
        chunk = text[start:end]

        # Prefer to cut at the end of a sentence, but only when more text
        # follows and the cut keeps more than half of the window.
        if end < total:
            for sep in separators:
                last_sep = chunk.rfind(sep)
                if last_sep > min_cut:
                    chunk = chunk[: last_sep + 1]
                    end = start + len(chunk)
                    break

        stripped = chunk.strip()
        if stripped:
            chunks.append(stripped)

        # The window reached the end of the text: stop here, otherwise the
        # overlap step would emit a tail already contained in this chunk.
        if end >= total:
            break

        next_start = end - overlap
        if next_start <= start:
            # Overlap would stall or move backwards; continue without it.
            next_start = end
        start = next_start

    return chunks
|