This commit is contained in:
Kevin
2026-04-28 10:41:48 +08:00
parent 482b016872
commit 15884bd68e
60 changed files with 2092 additions and 1994 deletions

View File

@@ -2,6 +2,7 @@
from __future__ import annotations
import array
import io
import shutil
import subprocess
@@ -9,12 +10,57 @@ import wave
from typing import Final
# Default sample rate (Hz) used when wrapping PCM into a WAV container.
_BAIDU_RATE: Final[int] = 16000
# Clinic-room microphones are often too quiet; Baidu short-speech error 3301
# ("speech quality error") is mostly related to a too-low effective amplitude.
# Peak amplitude (s16 sample units) that quiet audio is boosted towards; audio
# whose peak already reaches this value is left untouched.
_NORM_TARGET_PEAK: Final[int] = 12000
# Upper bound on the gain factor applied when boosting quiet audio.
_NORM_MAX_GAIN: Final[float] = 80.0
class WavDecodeError(ValueError):
    """Raised when uploaded bytes are not a valid WAV or cannot be converted."""
def pcm_s16le_to_wav_bytes(pcm: bytes, *, sample_rate: int = _BAIDU_RATE) -> bytes:
    """Wrap raw mono s16le PCM in a standard WAV container.

    Intended for scenarios such as retrying a Baidu request with
    ``format=wav``.

    Args:
        pcm: Raw 16-bit mono PCM payload.
        sample_rate: Frame rate written into the WAV header.

    Returns:
        The complete WAV file as bytes.

    Raises:
        WavDecodeError: If ``pcm`` is empty.
    """
    if not pcm:
        raise WavDecodeError("Empty PCM")

    sink = io.BytesIO()
    with wave.open(sink, "wb") as writer:
        # (nchannels, sampwidth, framerate, nframes, comptype, compname):
        # mono, 2 bytes per sample, uncompressed; the frame count is patched
        # by the wave module when the writer is closed.
        writer.setparams((1, 2, sample_rate, 0, "NONE", "not compressed"))
        writer.writeframes(pcm)
    wav_payload = sink.getvalue()
    return wav_payload
def normalize_pcm_s16le_for_baidu(
    pcm: bytes,
    *,
    target_peak: int | None = None,
    max_gain: float | None = None,
) -> bytes:
    """Boost overly quiet s16le PCM to lower the odds of Baidu err_no=3301.

    Audio that is already loud enough is returned unchanged, as is input
    that is silent, shorter than one sample, or of odd byte length.

    Args:
        pcm: Raw signed 16-bit little-endian PCM bytes.
        target_peak: Peak amplitude to scale towards; defaults to
            ``_NORM_TARGET_PEAK``.
        max_gain: Maximum gain factor; defaults to ``_NORM_MAX_GAIN``.

    Returns:
        The (possibly amplified) PCM bytes, same length as the input.
    """
    import sys  # local import: only needed for the byte-order check below

    if len(pcm) < 2 or len(pcm) % 2 != 0:
        return pcm

    samples = array.array("h")
    samples.frombytes(pcm)
    # array("h") uses the host byte order, but the payload is little-endian;
    # swap on big-endian hosts so sample values are interpreted correctly.
    swap_needed = sys.byteorder == "big"
    if swap_needed:
        samples.byteswap()

    # -min() covers -32768, whose magnitude exceeds the positive s16 range.
    peak = max(max(samples), -min(samples))
    if peak == 0:
        return pcm
    if target_peak is None:
        target_peak = _NORM_TARGET_PEAK
    if peak >= target_peak:
        return pcm
    if max_gain is None:
        max_gain = _NORM_MAX_GAIN

    scale = min(max_gain, target_peak / peak)
    if scale <= 1.0:
        return pcm

    # Clamp defensively so custom target/gain values can never overflow s16.
    boosted = array.array(
        "h",
        (max(-32768, min(32767, round(s * scale))) for s in samples),
    )
    if swap_needed:
        boosted.byteswap()
    return boosted.tobytes()
def wav_bytes_to_pcm16k_mono_s16le(wav_bytes: bytes) -> bytes:
"""
Prefer ffmpeg for arbitrary channel count / sample rate.
@@ -57,7 +103,7 @@ def _ffmpeg_to_pcm16k(wav_bytes: bytes, ffmpeg: str) -> bytes:
raise WavDecodeError(f"ffmpeg wav decode failed: {err or proc.returncode}")
if not proc.stdout:
raise WavDecodeError("ffmpeg produced empty PCM")
return proc.stdout
return normalize_pcm_s16le_for_baidu(proc.stdout)
def _stdlib_wave_to_pcm16k(wav_bytes: bytes) -> bytes:
@@ -96,6 +142,6 @@ def _stdlib_wave_to_pcm16k(wav_bytes: bytes) -> bytes:
l_s, r_s = struct.unpack("<hh", chunk)
m = max(min((l_s + r_s) // 2, 32767), -32768)
out.extend(struct.pack("<h", m))
return bytes(out)
return normalize_pcm_s16le_for_baidu(bytes(out))
return raw
return normalize_pcm_s16le_for_baidu(raw)