feat(api): 访谈人格/回复长度策略、口述归一、背景语气与输入净稿全链路

Chat 访谈
- 新增 persona 系统（default / warm_listener / curious_guide）与 background_voice 语气层
- 回复长度由 compute_reply_plan 统一决策（brief / standard / expanded），融合信息密度启发式
- 输入净稿（input_normalize）：编排层可选 rules/llm 归一用户口语后再喂模型与记忆检索
- 记忆证据注入：按用户话检索 memory evidence 并注入 prompt

Memoir 回忆录
- 口述归一（oral_normalize）：segment 原文保留，story 管线取派生净稿作叙事输入
- segment 入队批次门闸：累计字数 + 最长等待秒数，减少零碎提交
- fidelity_check / prompts / narrative_agent 微调
- Alembic 0005：清理跨章节 story 外键

Infra
- Dockerfile 加入 ffmpeg
- pyproject.toml 新增依赖并同步 uv.lock
- .env.example / .env.production 补全新配置项

Tests
- 新增 test_background_voice、test_chat_input_normalize、test_experience_regressions
- 扩展 test_interview_prompts、test_interview_reply_length、test_story_route_oral_invariant

Made-with: Cursor
2026-03-31 23:55:26 +08:00
parent 42ae2a5e91
commit 69a673e6c6
44 changed files with 2998 additions and 259 deletions
--- a/api/app/features/conversation/ws/pipeline.py
+++ b/api/app/features/conversation/ws/pipeline.py
@@ -2,6 +2,7 @@

 import asyncio
 import base64
+import io
 import time
 import uuid
 from dataclasses import dataclass, field
@@ -358,6 +359,58 @@ async def _delayed_listening_feedback(
    await _send_segment_transition_feedback(conversation_id, 0)


+# ── Long-audio chunked transcription ──────────────────────────
+
+MAX_ASR_CHUNK_MS = 55_000  # upper bound on one ASR slice, in milliseconds (55 s)
+
+
+def _split_audio_bytes(audio_bytes: bytes, fmt: str) -> list[bytes]:
+    """Split long audio into ≤55 s slices with pydub, each exported as 16 kHz mono WAV (to stay within Tencent ASR's 3 MB limit)."""
+    from pydub import AudioSegment as PydubSegment
+
+    audio = PydubSegment.from_file(io.BytesIO(audio_bytes), format=fmt)
+    duration_ms = len(audio)  # compared against MAX_ASR_CHUNK_MS below, i.e. treated as milliseconds
+
+    if duration_ms <= MAX_ASR_CHUNK_MS:
+        return [audio_bytes]  # short clip: hand back the original bytes untouched (not re-encoded)
+
+    mono_16k = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)  # 16 kHz, 1 channel, 2-byte samples
+    chunks: list[bytes] = []
+    for start in range(0, duration_ms, MAX_ASR_CHUNK_MS):
+        chunk = mono_16k[start : start + MAX_ASR_CHUNK_MS]  # the final slice may be shorter than the limit
+        buf = io.BytesIO()
+        chunk.export(buf, format="wav")
+        chunks.append(buf.getvalue())
+    return chunks
+
+
+async def _transcribe_long_audio(audio_bytes: bytes, fmt: str = "m4a") -> str:
+    """Transcribe audio: clips over 55 s are sliced and sent to ASR concurrently; shorter clips are transcribed directly."""
+    asr = get_asr_provider()
+    try:
+        chunks = await asyncio.to_thread(_split_audio_bytes, audio_bytes, fmt)  # run the splitter in a worker thread
+    except Exception as exc:
+        logger.warning("pydub 切片失败 ({}), 回退到直接转写", exc)
+        return await asr.transcribe(audio_bytes, format=fmt)  # best-effort fallback: transcribe the original bytes
+
+    if len(chunks) <= 1:
+        return await asr.transcribe(audio_bytes, format=fmt)  # not split: send the original bytes in their original format
+
+    logger.info("长音频切片: {} 段", len(chunks))
+    results = await asyncio.gather(
+        *[asr.transcribe(c, format="wav") for c in chunks],
+        return_exceptions=True,  # collect per-slice exceptions instead of aborting the whole batch
+    )
+    texts: list[str] = []
+    for i, r in enumerate(results):
+        if isinstance(r, BaseException):
+            logger.warning("切片 {} 转写异常: {}", i, r)
+            continue  # skip the failed slice; text from the other slices is still returned
+        if r and not _is_transcribe_failure(r):  # keep only non-empty results not flagged by _is_transcribe_failure
+            texts.append(r)
+    return "".join(texts)  # joined in slice order, no separator. NOTE(review): "" if every slice failed — confirm callers treat that as failure
+
+
 # ── 分段语音异步处理 ────────────────────────────────────────────


@@ -439,9 +492,7 @@ async def process_audio_segment(
                    conversation_id,
                    segment_index,
                )
-            transcript_text = await get_asr_provider().transcribe(
-                audio_bytes, format="m4a"
-            )
+            transcript_text = await _transcribe_long_audio(audio_bytes, fmt="m4a")
            await manager.send_message(
                conversation_id,
                {
@@ -513,7 +564,11 @@ async def process_audio_segment(
                user_message_timestamp = _mark_conversation_active(conversation)
                await db.commit()
                await db.refresh(segment)
-                await background_runner.queue_message(conversation.user_id, segment.id)
+                await background_runner.queue_message(
+                    conversation.user_id,
+                    segment.id,
+                    text_char_count=len((transcript_text or "").strip()),
+                )

            ready_segments: List[Tuple[int, str, Segment]] = []
            async with state.lock:
--- a/api/app/features/conversation/ws/router.py
+++ b/api/app/features/conversation/ws/router.py
@@ -11,6 +11,7 @@ from datetime import datetime, timezone
 from fastapi import WebSocket, WebSocketDisconnect, status
 from starlette.websockets import WebSocketState

+from app.agents.chat.background_voice import infer_background_voice
 from app.agents.chat.prompts_profile import format_user_profile_context
 from app.core.db import AsyncSessionLocal
 from app.core.dependencies import get_asr_provider
@@ -201,6 +202,9 @@ async def websocket_endpoint(
                                conversation_id=conversation_id,
                                memoir_state=state,
                                user_profile_context=user_profile_context,
+                                background_voice=infer_background_voice(
+                                    user.occupation
+                                ),
                            )
                        )
                        ai_msg_id = await ConversationHistoryStore(
@@ -300,7 +304,9 @@ async def websocket_endpoint(
                            await db.commit()
                            await db.refresh(segment)
                            await background_runner.queue_message(
-                                conversation.user_id, segment.id
+                                conversation.user_id,
+                                segment.id,
+                                text_char_count=len(text_message.strip()),
                            )

                            await process_user_message(
@@ -563,7 +569,9 @@ async def websocket_endpoint(
                                await db.commit()
                                await db.refresh(segment)
                                await background_runner.queue_message(
-                                    conversation.user_id, segment.id
+                                    conversation.user_id,
+                                    segment.id,
+                                    text_char_count=len((asr_text or "").strip()),
                                )

                                if asr_text and not asr_text.startswith("转写失败"):