feat(api): use Tencent 16k_zh_large ASR and remove local Whisper

Standardize ASR on Tencent's dialect-capable 16k_zh_large engine across all environments, and drop faster-whisper from dependencies and deployment images. Also add an expo-sqlite iOS vendor sync plus a pod install step to prebuild, to prevent native build failures after npm install.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-25 10:21:41 +08:00
parent 4f0a314656
commit 22d282dc01
23 changed files with 91 additions and 561 deletions
--- a/api/tests/test_default_toml_legacy_parity.py
+++ b/api/tests/test_default_toml_legacy_parity.py
@@ -16,8 +16,7 @@ def test_default_toml_matches_legacy_settings_defaults() -> None:

    assert cfg.story.image_min_body_chars == 400

-    assert cfg.asr.provider == "whisper"
-    assert cfg.asr.device == "auto"
-    assert cfg.asr.compute_type == "auto"
+    assert cfg.asr.provider == "tencent"
+    assert cfg.asr.engine_type == "16k_zh_large"

    assert cfg.misc.tencent_sms_template_param_count == 2
--- a/api/tests/test_infra_regressions.py
+++ b/api/tests/test_infra_regressions.py
@@ -86,7 +86,7 @@ async def test_tencent_asr_transcribe_uses_to_thread(
    monkeypatch.setitem(sys.modules, "tencentcloud.asr.v20190614", package_module)
    monkeypatch.setattr(asyncio, "to_thread", fake_to_thread)

-    provider = TencentASRProvider("sid", "skey")
+    provider = TencentASRProvider("sid", "skey", engine_type="16k_zh_large")
    client = FakeClient()
    monkeypatch.setattr(provider, "_get_client", lambda: client)

@@ -98,5 +98,6 @@ async def test_tencent_asr_transcribe_uses_to_thread(
    assert getattr(fn, "__self__", None) is client
    assert getattr(fn, "__name__", "") == "SentenceRecognition"
    request = args[0]
+    assert request.EngSerViceType == "16k_zh_large"
    assert request.VoiceFormat == "m4a"
    assert request.DataLen == len(b"fake-audio")
--- a/api/tests/test_whisper_local.py
+++ b/api/tests/test_whisper_local.py
@@ -1,61 +0,0 @@
-import asyncio
-import sys
-from types import SimpleNamespace
-
-import pytest
-
-from app.adapters.asr.whisper_local import (
-    WhisperASRProvider,
-    _looks_like_subtitle_hallucination,
-)
-
-
-def test_subtitle_watermark_detection() -> None:
-    assert _looks_like_subtitle_hallucination("字幕by索兰娅") is True
-    assert _looks_like_subtitle_hallucination("今天想聊聊童年往事") is False
-
-
-@pytest.mark.asyncio
-async def test_transcribe_retries_decode_audio_after_discarded_pass2(
-    monkeypatch: pytest.MonkeyPatch,
-) -> None:
-    class DummyModel:
-        def __init__(self) -> None:
-            self.calls: list[object] = []
-
-        def transcribe(self, audio: object, **_: object):
-            self.calls.append(audio)
-            n = len(self.calls)
-            if n == 1:
-                return iter([]), SimpleNamespace()
-            if n == 2:
-                return iter([SimpleNamespace(text="字幕by索兰娅")]), SimpleNamespace()
-            if n == 3:
-                assert audio == "decoded-audio"
-                return (
-                    iter([SimpleNamespace(text="你好，今天想聊聊童年。")]),
-                    SimpleNamespace(),
-                )
-            raise AssertionError(f"unexpected transcribe call #{n}")
-
-    async def fake_to_thread(fn):
-        return fn()
-
-    def fake_decode_audio(_: str, sampling_rate: int = 16000):
-        assert sampling_rate == 16000
-        return "decoded-audio"
-
-    monkeypatch.setattr(asyncio, "to_thread", fake_to_thread)
-    monkeypatch.setitem(
-        sys.modules,
-        "faster_whisper",
-        SimpleNamespace(decode_audio=fake_decode_audio),
-    )
-
-    provider = WhisperASRProvider()
-    provider._model = DummyModel()
-
-    text = await provider.transcribe(b"fake-audio", format="m4a")
-
-    assert text == "你好，今天想聊聊童年。"
-    assert len(provider._model.calls) == 3