"""Transcript chunker — split raw text into retrieval-ready chunks.""" def chunk_transcript( text: str, *, max_chars: int = 800, overlap_chars: int = 100 ) -> list[str]: """ Split transcript text into overlapping chunks. Uses character count as proxy for tokens (~4 chars/token for Chinese). """ if not text or not text.strip(): return [] text = text.strip() if len(text) <= max_chars: return [text] if text else [] if max_chars <= 0: raise ValueError("max_chars must be positive") if overlap_chars < 0: raise ValueError("overlap_chars cannot be negative") overlap = min(overlap_chars, max_chars - 1) chunks: list[str] = [] start = 0 while start < len(text): end = start + max_chars chunk = text[start:end] # 尽量在句末切分 if end < len(text): for sep in ["。", "!", "?", "\n", ";", ".", "!", "?"]: last_sep = chunk.rfind(sep) if last_sep > max_chars // 2: chunk = chunk[: last_sep + 1] end = start + len(chunk) break if chunk.strip(): chunks.append(chunk.strip()) if not chunk: start += max_chars - overlap continue next_start = end - overlap if next_start <= start: next_start = start + len(chunk) start = next_start return chunks