feat(api): use Tencent 16k_zh_large ASR and remove local Whisper
Standardize ASR on Tencent's dialect-capable engine across all environments, drop faster-whisper from dependencies and deployment images, and add an expo-sqlite iOS vendor sync plus a pod install step in prebuild to prevent native build failures after npm install.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -16,8 +16,7 @@ def test_default_toml_matches_legacy_settings_defaults() -> None:
|
||||
|
||||
assert cfg.story.image_min_body_chars == 400
|
||||
|
||||
assert cfg.asr.provider == "whisper"
|
||||
assert cfg.asr.device == "auto"
|
||||
assert cfg.asr.compute_type == "auto"
|
||||
assert cfg.asr.provider == "tencent"
|
||||
assert cfg.asr.engine_type == "16k_zh_large"
|
||||
|
||||
assert cfg.misc.tencent_sms_template_param_count == 2
|
||||
|
||||
@@ -86,7 +86,7 @@ async def test_tencent_asr_transcribe_uses_to_thread(
|
||||
monkeypatch.setitem(sys.modules, "tencentcloud.asr.v20190614", package_module)
|
||||
monkeypatch.setattr(asyncio, "to_thread", fake_to_thread)
|
||||
|
||||
provider = TencentASRProvider("sid", "skey")
|
||||
provider = TencentASRProvider("sid", "skey", engine_type="16k_zh_large")
|
||||
client = FakeClient()
|
||||
monkeypatch.setattr(provider, "_get_client", lambda: client)
|
||||
|
||||
@@ -98,5 +98,6 @@ async def test_tencent_asr_transcribe_uses_to_thread(
|
||||
assert getattr(fn, "__self__", None) is client
|
||||
assert getattr(fn, "__name__", "") == "SentenceRecognition"
|
||||
request = args[0]
|
||||
assert request.EngSerViceType == "16k_zh_large"
|
||||
assert request.VoiceFormat == "m4a"
|
||||
assert request.DataLen == len(b"fake-audio")
|
||||
|
||||
@@ -1,61 +0,0 @@
|
||||
import asyncio
|
||||
import sys
|
||||
from types import SimpleNamespace
|
||||
|
||||
import pytest
|
||||
|
||||
from app.adapters.asr.whisper_local import (
|
||||
WhisperASRProvider,
|
||||
_looks_like_subtitle_hallucination,
|
||||
)
|
||||
|
||||
|
||||
def test_subtitle_watermark_detection() -> None:
    """Check the detector's verdict for one watermark-style string and one ordinary sentence."""
    # (input text, expected verdict) pairs; the verdict must be the exact
    # bool singleton, so the comparison below uses identity.
    expectations = (
        ("字幕by索兰娅", True),
        ("今天想聊聊童年往事", False),
    )
    for sample, expected in expectations:
        assert _looks_like_subtitle_hallucination(sample) is expected
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_transcribe_retries_decode_audio_after_discarded_pass2(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Drive ``WhisperASRProvider.transcribe`` through three scripted model calls.

    The stub model yields no segments on the first call, a subtitle-credit
    string on the second, and real text on the third. The third call must
    receive the value returned by the patched ``faster_whisper.decode_audio``.
    The test expects the final text to be returned after exactly three calls.
    """

    class DummyModel:
        """Scripted model stub that records every audio argument it is given."""

        def __init__(self) -> None:
            self.calls: list[object] = []

        def transcribe(self, audio: object, **_: object):
            self.calls.append(audio)
            attempt = len(self.calls)
            if attempt > 3:
                raise AssertionError(f"unexpected transcribe call #{attempt}")
            if attempt == 1:
                # First pass: nothing recognised.
                segments = []
            elif attempt == 2:
                # Second pass: only a subtitle-credit style string.
                segments = [SimpleNamespace(text="字幕by索兰娅")]
            else:
                # Third pass must be fed the output of the patched decode_audio.
                assert audio == "decoded-audio"
                segments = [SimpleNamespace(text="你好,今天想聊聊童年。")]
            return iter(segments), SimpleNamespace()

    async def run_inline(fn):
        # Replacement for asyncio.to_thread: call the function directly.
        return fn()

    def fake_decode_audio(_: str, sampling_rate: int = 16000):
        assert sampling_rate == 16000
        return "decoded-audio"

    # Install a fake faster_whisper module exposing only decode_audio.
    whisper_stub = SimpleNamespace(decode_audio=fake_decode_audio)
    monkeypatch.setitem(sys.modules, "faster_whisper", whisper_stub)
    monkeypatch.setattr(asyncio, "to_thread", run_inline)

    model = DummyModel()
    provider = WhisperASRProvider()
    provider._model = model

    text = await provider.transcribe(b"fake-audio", format="m4a")

    assert text == "你好,今天想聊聊童年。"
    assert len(provider._model.calls) == 3
|
||||
Reference in New Issue
Block a user