""" CosyVoice TTS 封装,返回二进制音频数据。 """ import base64 from typing import Optional, Tuple import dashscope from dashscope.audio.tts_v2 import AudioFormat, SpeechSynthesizer from fastapi import HTTPException from .config import settings def synthesize( text: str, *, model: str, voice: str, audio_format: AudioFormat = AudioFormat.MP3_22050HZ_MONO_256KBPS, ) -> Tuple[bytes, str]: """ 同步调用 cosyvoice,返回 (音频二进制, 格式标识)。 """ api_key = settings.DASHSCOPE_API_KEY if not api_key: raise HTTPException(status_code=500, detail="未配置 TTS API Key") dashscope.api_key = api_key resp_obj: Optional[object] = None try: synthesizer = SpeechSynthesizer( model=model, voice=voice, format=audio_format, ) resp_obj = synthesizer.call(text) except HTTPException: raise except Exception as exc: # SDK/网络错误 raise HTTPException(status_code=502, detail=f"TTS 调用失败: {exc}") from exc # 官方非流式调用直接返回音频二进制;兜底处理 base64/字典返回 audio_bytes: bytes = b"" if isinstance(resp_obj, (bytes, bytearray)): audio_bytes = bytes(resp_obj) elif isinstance(resp_obj, str): try: audio_bytes = base64.b64decode(resp_obj) except Exception: audio_bytes = b"" else: output = getattr(resp_obj, "output", None) if isinstance(output, dict): audio_raw = output.get("audio") or output.get("audio_data") or output.get("audio_url") if isinstance(audio_raw, (bytes, bytearray)): audio_bytes = bytes(audio_raw) elif isinstance(audio_raw, str): try: audio_bytes = base64.b64decode(audio_raw) except Exception: audio_bytes = b"" if not audio_bytes: raise HTTPException(status_code=502, detail="TTS 未返回音频数据") return audio_bytes, audio_format.name if hasattr(audio_format, "name") else str(audio_format)