Ai_GirlFriend/lover/tts.py

66 lines
2.1 KiB
Python
Raw Normal View History

2026-01-31 19:15:41 +08:00
"""
CosyVoice TTS 封装返回二进制音频数据
"""
import base64
from typing import Optional, Tuple
import dashscope
from dashscope.audio.tts_v2 import AudioFormat, SpeechSynthesizer
from fastapi import HTTPException
from .config import settings
def synthesize(
    text: str,
    *,
    model: str,
    voice: str,
    audio_format: AudioFormat = AudioFormat.MP3_22050HZ_MONO_256KBPS,
) -> Tuple[bytes, str]:
    """Synthesize ``text`` with CosyVoice synchronously and return the audio.

    Args:
        text: Text handed to ``SpeechSynthesizer.call``.
        model: Model name forwarded to ``SpeechSynthesizer``.
        voice: Voice name forwarded to ``SpeechSynthesizer``.
        audio_format: Output format forwarded to ``SpeechSynthesizer``.

    Returns:
        A ``(audio_bytes, format_label)`` tuple. ``format_label`` is
        ``audio_format.name`` when that attribute exists, otherwise
        ``str(audio_format)``.

    Raises:
        HTTPException: 500 when ``settings.DASHSCOPE_API_KEY`` is empty;
            502 when the SDK call raises or when no usable audio data can
            be extracted from the response.
    """
    api_key = settings.DASHSCOPE_API_KEY
    if not api_key:
        raise HTTPException(status_code=500, detail="未配置 TTS API Key")
    # The key is assigned on the dashscope module itself, so it is shared
    # process-wide rather than scoped to this call.
    dashscope.api_key = api_key

    resp_obj: Optional[object] = None
    try:
        synthesizer = SpeechSynthesizer(
            model=model,
            voice=voice,
            format=audio_format,
        )
        resp_obj = synthesizer.call(text)
    except HTTPException:
        raise
    except Exception as exc:  # SDK / network errors
        raise HTTPException(status_code=502, detail=f"TTS 调用失败: {exc}") from exc

    # Per the original author's note, the official non-streaming call returns
    # raw audio bytes directly; the other branches are fallbacks for base64
    # strings and dict-shaped responses.
    if isinstance(resp_obj, (bytes, bytearray, str)):
        audio_bytes = _decode_audio_payload(resp_obj)
    else:
        audio_bytes = b""
        output = getattr(resp_obj, "output", None)
        if isinstance(output, dict):
            audio_raw = (
                output.get("audio")
                or output.get("audio_data")
                or output.get("audio_url")
            )
            # A URL value is never valid base64, so strict decoding rejects
            # it and the request ends in the 502 below instead of returning
            # garbage bytes as audio.
            audio_bytes = _decode_audio_payload(audio_raw)

    if not audio_bytes:
        raise HTTPException(status_code=502, detail="TTS 未返回音频数据")

    if hasattr(audio_format, "name"):
        format_label = audio_format.name
    else:
        format_label = str(audio_format)
    return audio_bytes, format_label


def _decode_audio_payload(payload: object) -> bytes:
    """Convert a raw payload to audio bytes, or ``b""`` when it is unusable.

    ``bytes``/``bytearray`` are returned as ``bytes``. ``str`` is treated as
    base64 and decoded strictly. Any other type yields ``b""``.
    """
    if isinstance(payload, (bytes, bytearray)):
        return bytes(payload)
    if isinstance(payload, str):
        # Remove whitespace first so line-wrapped base64 still decodes, then
        # validate strictly: the non-strict mode silently drops characters
        # outside the alphabet and can turn arbitrary text (e.g. a URL) into
        # garbage bytes.
        compact = "".join(payload.split())
        try:
            return base64.b64decode(compact, validate=True)
        except ValueError:  # binascii.Error is a ValueError subclass
            return b""
    return b""