# 2026-01-31 19:15:41 +08:00
|
|
|
|
import asyncio
|
|
|
|
|
|
import json
|
|
|
|
|
|
import logging
|
|
|
|
|
|
import re
|
|
|
|
|
|
import time
|
|
|
|
|
|
from typing import List, Optional
|
|
|
|
|
|
|
|
|
|
|
|
import requests
|
|
|
|
|
|
import dashscope
|
# 2026-03-03 19:06:01 +08:00
|
|
|
|
from fastapi import APIRouter, Depends, HTTPException, WebSocket, WebSocketDisconnect, status, UploadFile, File
|
# 2026-01-31 19:15:41 +08:00
|
|
|
|
from fastapi.websockets import WebSocketState
|
# 2026-03-03 19:06:01 +08:00
|
|
|
|
from fastapi.responses import JSONResponse
|
# 2026-01-31 19:15:41 +08:00
|
|
|
|
|
|
|
|
|
|
from ..config import settings
|
# 2026-02-02 20:08:28 +08:00
|
|
|
|
from ..deps import AuthedUser, get_current_user, _fetch_user_from_php
|
# 2026-01-31 19:15:41 +08:00
|
|
|
|
from ..llm import chat_completion_stream
|
|
|
|
|
|
from ..tts import synthesize
|
|
|
|
|
|
from ..db import SessionLocal
|
# 2026-02-02 20:08:28 +08:00
|
|
|
|
from ..models import Lover, VoiceLibrary, User
|
# 2026-01-31 19:15:41 +08:00
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
|
from dashscope.audio.asr import Recognition, RecognitionCallback, RecognitionResult
|
|
|
|
|
|
except Exception: # dashscope 未安装时提供兜底
|
|
|
|
|
|
Recognition = None
|
|
|
|
|
|
RecognitionCallback = object # type: ignore
|
|
|
|
|
|
RecognitionResult = object # type: ignore
|
|
|
|
|
|
try:
|
|
|
|
|
|
from dashscope.audio.tts_v2 import AudioFormat, SpeechSynthesizer, ResultCallback
|
|
|
|
|
|
except Exception:
|
|
|
|
|
|
AudioFormat = None # type: ignore
|
|
|
|
|
|
SpeechSynthesizer = None # type: ignore
|
|
|
|
|
|
ResultCallback = object # type: ignore
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# FastAPI router for all voice-call endpoints, mounted under /voice.
router = APIRouter(prefix="/voice", tags=["voice"])

# Module-level logger for the voice-call subsystem.
logger = logging.getLogger("voice_call")
logger.setLevel(logging.INFO)

# Sentinel pushed onto the LLM->TTS queue to mark the end of one assistant reply.
END_OF_TTS = "<<VOICE_CALL_TTS_END>>"
|
|
|
|
|
|
|
|
|
|
|
|
|
# 2026-02-02 20:08:28 +08:00
|
|
|
|
@router.get("/call/duration")
async def get_call_duration(user: AuthedUser = Depends(get_current_user)):
    """Return the voice-call duration quota for the current user.

    Response payload:
        duration: allowed call time in milliseconds; 0 means unlimited (VIP).
        is_vip:   whether the user currently holds an active VIP subscription.

    Raises:
        HTTPException(404): when the user row cannot be found.
    """
    # Local imports match the file's existing style for endpoint-only deps.
    from datetime import datetime, timezone

    from ..db import SessionLocal
    from ..models import User
    from ..response import success_response

    db = SessionLocal()
    try:
        user_row = db.query(User).filter(User.id == user.id).first()
        if not user_row:
            raise HTTPException(status_code=404, detail="用户不存在")

        # vip_endtime is a Unix timestamp; compare against "now" in UTC.
        # datetime.now(timezone.utc) replaces the deprecated datetime.utcnow().
        current_timestamp = int(datetime.now(timezone.utc).timestamp())
        # bool(...) so the JSON field is always true/false, never None or an int
        # (the raw `and` expression previously leaked None/ints to the client).
        is_vip = bool(user_row.vip_endtime and user_row.vip_endtime > current_timestamp)

        # VIP: unlimited (0). Regular users: 5 minutes, expressed in ms.
        duration = 0 if is_vip else 300000

        return success_response({
            "duration": duration,
            "is_vip": is_vip,
        })
    finally:
        db.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
# 2026-01-31 19:15:41 +08:00
|
|
|
|
class WSRecognitionCallback(RecognitionCallback): # type: ignore[misc]
    """ASR callback that pushes sentence-level results into the session queue.

    NOTE: dashscope invokes these callbacks on its own worker thread, so all
    interaction with the asyncio session goes through ``session._schedule``.
    """

    def __init__(self, session: "VoiceCallSession"):
        super().__init__()
        # Owning session; used to schedule coroutines on its event loop.
        self.session = session
        # Most recent partial (not yet sentence-final) transcript, if any.
        self._last_text: Optional[str] = None

    def on_open(self) -> None:
        logger.info("ASR connection opened")

    def on_complete(self) -> None:
        logger.info("ASR complete")
        if self._last_text:
            # Flush the trailing partial as a finished sentence so it is not
            # lost when the stream ends without an end-of-sentence marker.
            self.session._schedule(self.session.handle_sentence(self._last_text))
            logger.info("ASR flush last text on complete: %s", self._last_text)
            self._last_text = None

    def on_error(self, result: RecognitionResult) -> None:
        logger.error("ASR error: %s", getattr(result, "message", None) or result)
        if self._last_text:
            # Same flush-on-teardown behaviour as on_complete().
            self.session._schedule(self.session.handle_sentence(self._last_text))
            logger.info("ASR flush last text on error: %s", self._last_text)
            self._last_text = None

    def on_close(self) -> None:
        logger.info("ASR closed")
        if self._last_text:
            # Flush any remaining partial before the connection disappears.
            self.session._schedule(self.session.handle_sentence(self._last_text))
            logger.info("ASR flush last text on close: %s", self._last_text)
            self._last_text = None

    def on_event(self, result: RecognitionResult) -> None:
        # A result may carry one sentence dict or a list of them.
        sentence = result.get_sentence()
        if not sentence:
            return

        sentences = sentence if isinstance(sentence, list) else [sentence]
        for sent in sentences:
            text = sent.get("text") if isinstance(sent, dict) else None
            if not text:
                continue
            is_end = False
            if isinstance(sent, dict):
                # Different SDK versions mark sentence finality differently;
                # accept any of the known flags.
                is_end = (
                    bool(sent.get("is_sentence_end"))
                    or bool(sent.get("sentence_end"))
                    or RecognitionResult.is_sentence_end(sent)
                )
            if is_end:
                # Final sentence: hand it to the dialogue pipeline.
                self.session._schedule(self.session.handle_sentence(text))
                self._last_text = None
            else:
                # Interim hypothesis: forward to the client for live display.
                self.session._schedule(self.session.send_signal({"type": "partial_asr", "text": text}))
                self._last_text = text
            logger.info("ASR event end=%s sentence=%s", is_end, sent)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def authenticate_websocket(websocket: WebSocket) -> AuthedUser:
    """Authenticate a WebSocket client, mirroring the HTTP auth logic.

    Token sources, in priority order: ``Authorization: Bearer`` header,
    ``X-Token`` header, then the ``token`` query parameter. As a debug-only
    fallback an ``x_user_id`` query parameter is accepted.

    Raises:
        HTTPException(401): when no usable credentials are supplied.
    """
    headers = websocket.headers
    token = None
    auth_header = headers.get("authorization")
    if auth_header and auth_header.lower().startswith("bearer "):
        token = auth_header.split(" ", 1)[1].strip()
    if not token:
        token = headers.get("x-token")

    # Also accept the token via the query string (for clients that cannot set headers).
    if not token:
        token = websocket.query_params.get("token")
    x_user_id = websocket.query_params.get("x_user_id")

    if token:
        # Validate the token against the PHP user-center service.
        payload = _fetch_user_from_php(token)
        user_id = payload.get("id") or payload.get("user_id")
        reg_step = payload.get("reg_step") or payload.get("stage") or 1
        gender = payload.get("gender") or 0
        nickname = payload.get("nickname") or payload.get("username") or ""
        if not user_id:
            raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="用户中心缺少用户ID")
        return AuthedUser(
            id=user_id,
            reg_step=reg_step,
            gender=gender,
            nickname=nickname,
            token=token,
        )

    # Debug path: trust an explicit numeric user id from the query string.
    if x_user_id is not None:
        try:
            uid = int(x_user_id)
        except Exception:
            uid = None
        if uid is not None:
            return AuthedUser(id=uid, reg_step=2, gender=0, nickname="debug-user", token="")

    raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="用户不存在或未授权")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class VoiceCallSession:
    """Stateful server side of one voice-call WebSocket connection.

    Wires ASR input, LLM reply generation and TTS playback together via two
    asyncio queues (``asr_to_llm`` and ``llm_to_tts``) plus background tasks
    started in :meth:`start`.
    """

    def __init__(self, websocket: WebSocket, user: AuthedUser, require_ptt: bool = False):
        self.websocket = websocket
        self.user = user
        # Push-to-talk mode: audio is dropped unless the mic is explicitly on.
        self.require_ptt = require_ptt
        self.mic_enabled = not require_ptt
        # Captured in start(); used by _schedule() for thread-safe task creation.
        self.loop: Optional[asyncio.AbstractEventLoop] = None
        # Recognized sentences waiting for the LLM.
        self.asr_to_llm: asyncio.Queue[str] = asyncio.Queue()
        # LLM text chunks waiting for speech synthesis.
        self.llm_to_tts: asyncio.Queue[str] = asyncio.Queue()
        # True while an assistant reply is being generated/played.
        self.is_speaking = False
        # Lazily-loaded lover row (see _load_lover()).
        self.lover: Optional[Lover] = None
        self.db = SessionLocal()
        # Cached TTS voice code, resolved by _prepare_profile().
        self.voice_code: Optional[str] = None
        # Chat transcript, seeded with the system prompt.
        self.history: List[dict] = [
            {
                "role": "system",
                "content": self._compose_system_prompt(),
            }
        ]
        # Background task handles, cancelled in close().
        self.llm_task: Optional[asyncio.Task] = None
        self.tts_task: Optional[asyncio.Task] = None
        self.tts_stream_task: Optional[asyncio.Task] = None
        self.silence_task: Optional[asyncio.Task] = None
        # Set to abort the in-flight LLM/TTS pipeline (barge-in).
        self.cancel_event = asyncio.Event()
        # Realtime recognizer handle, if one was started.
        self.recognition: Optional[Recognition] = None
        self.idle_task: Optional[asyncio.Task] = None
        # Timestamps driving the idle/silence watchdogs.
        self.last_activity = time.time()
        self.last_voice_activity = time.time()
        self.has_voice_input = False
        self.last_interrupt_time = 0.0
        # The first TTS chunk uses a smaller buffer threshold for lower latency.
        self.tts_first_chunk = True
|
|
|
|
|
|
|
|
|
|
|
|
    async def start(self):
        """Accept the WebSocket and launch the background pipeline tasks."""
        await self.websocket.accept()
        self.loop = asyncio.get_running_loop()
        # Preload lover profile and voice so streaming never blocks the loop.
        self._prepare_profile()
        # Realtime ASR is intentionally NOT started here: the client uploads
        # MP3 chunks, which the realtime (PCM) recognizer cannot consume.
        # Recognition happens in batch inside finalize_asr() instead.
        logger.info("🔄 跳过实时ASR启动,将使用批量ASR处理MP3音频")
        # Launch the LLM / TTS / watchdog background tasks.
        self.llm_task = asyncio.create_task(self._process_llm_loop())
        self.tts_task = asyncio.create_task(self._process_tts_loop())
        self.idle_task = asyncio.create_task(self._idle_watchdog())
        self.silence_task = asyncio.create_task(self._silence_watchdog())
        await self.send_signal({"type": "ready"})
        if self.require_ptt:
            await self.send_signal({"type": "info", "msg": "ptt_enabled"})
|
|
|
|
|
|
|
|
|
|
|
|
    def _start_asr(self):
        """Start the realtime ASR session (best effort).

        NOTE(review): the client sends MP3 audio while this recognizer is
        configured for PCM, so realtime results may never arrive; the batch
        path in finalize_asr() is the primary recognizer.
        """
        logger.info("启动ASR会话(主要用于WebSocket连接,实际识别使用批量API)")

        if Recognition is None:
            # dashscope is not installed; the import fallback set this to None.
            logger.warning("未安装 dashscope,跳过实时ASR启动")
            return

        if not settings.DASHSCOPE_API_KEY:
            logger.warning("未配置 DASHSCOPE_API_KEY,跳过实时ASR启动")
            return

        try:
            dashscope.api_key = settings.DASHSCOPE_API_KEY
            callback = WSRecognitionCallback(self)

            # Keep the PCM configuration; the connection stays up even if the
            # format mismatch prevents actual recognition.
            self.recognition = Recognition(
                model=settings.VOICE_CALL_ASR_MODEL or "paraformer-realtime-v2",
                format="pcm",  # keep the PCM format configuration
                sample_rate=settings.VOICE_CALL_ASR_SAMPLE_RATE or 16000,
                api_key=settings.DASHSCOPE_API_KEY,
                callback=callback,
                max_sentence_silence=10000,  # max in-sentence silence: 10 s
            )
            logger.info(
                "实时ASR已启动 model=%s sample_rate=%s (注意:主要使用批量ASR处理MP3音频)",
                settings.VOICE_CALL_ASR_MODEL or "paraformer-realtime-v2",
                settings.VOICE_CALL_ASR_SAMPLE_RATE or 16000,
            )
            self.recognition.start()
        except Exception as e:
            # Non-fatal: the batch path still works without a realtime session.
            logger.warning(f"实时ASR启动失败,将完全依赖批量ASR: {e}")
            self.recognition = None
|
2026-01-31 19:15:41 +08:00
|
|
|
|
|
|
|
|
|
|
async def handle_sentence(self, text: str):
|
|
|
|
|
|
# 回合制:AI 说话时忽略用户语音,提示稍后再说
|
|
|
|
|
|
if self.is_speaking:
|
|
|
|
|
|
await self.send_signal({"type": "info", "msg": "请等待 AI 说完再讲话"})
|
|
|
|
|
|
return
|
|
|
|
|
|
logger.info("Handle sentence: %s", text)
|
|
|
|
|
|
await self.asr_to_llm.put(text)
|
|
|
|
|
|
|
|
|
|
|
|
async def _process_llm_loop(self):
|
|
|
|
|
|
while True:
|
|
|
|
|
|
text = await self.asr_to_llm.get()
|
|
|
|
|
|
self.cancel_event.clear()
|
|
|
|
|
|
try:
|
|
|
|
|
|
await self._stream_llm(text)
|
|
|
|
|
|
except asyncio.CancelledError:
|
|
|
|
|
|
break
|
|
|
|
|
|
except Exception as exc:
|
|
|
|
|
|
logger.exception("LLM error", exc_info=exc)
|
|
|
|
|
|
await self.send_signal({"type": "error", "msg": "LLM 生成失败"})
|
|
|
|
|
|
self.is_speaking = False
|
|
|
|
|
|
|
|
|
|
|
|
    async def _stream_llm(self, text: str):
        """Stream one LLM reply for *text*, forwarding chunks to the TTS queue."""
        self.history.append({"role": "user", "content": text})
        # Trim the transcript so the prompt stays bounded.
        if len(self.history) > settings.VOICE_CALL_MAX_HISTORY:
            self.history = self.history[-settings.VOICE_CALL_MAX_HISTORY :]
        stream = chat_completion_stream(self.history)
        self.is_speaking = True
        self.tts_first_chunk = True
        buffer = []
        for chunk in stream:
            # Barge-in: stop generating as soon as the turn is cancelled.
            if self.cancel_event.is_set():
                break
            buffer.append(chunk)
            await self.llm_to_tts.put(chunk)
        if not self.cancel_event.is_set():
            # Sentinel tells the TTS loop this reply is complete.
            await self.llm_to_tts.put(END_OF_TTS)
        full_reply = "".join(buffer)
        self.history.append({"role": "assistant", "content": full_reply})
        if full_reply:
            # Send the full reply text downstream for client display/debugging.
            await self.send_signal({"type": "reply_text", "text": full_reply})
        else:
            # Nothing to synthesize, so the speaking turn ends immediately.
            self.is_speaking = False
|
|
|
|
|
|
|
|
|
|
|
|
    async def _process_tts_loop(self):
        """Background task: batch LLM chunks into phrases and synthesize them.

        Chunks accumulate until a punctuation mark or a length threshold
        (smaller for the first phrase, for lower first-audio latency), then
        are cleaned, synthesized and streamed to the client as binary frames.
        """
        temp_buffer = []
        punctuations = set(",。?!,.?!;;")
        while True:
            token = await self.llm_to_tts.get()
            if self.cancel_event.is_set():
                # Turn was interrupted: drop buffered text, re-arm latency mode.
                temp_buffer = []
                self.tts_first_chunk = True
                continue

            if token == END_OF_TTS:
                # Flush whatever remains in the buffer for this reply.
                if temp_buffer:
                    text_chunk = "".join(temp_buffer)
                    temp_buffer = []
                    clean_text = self._clean_tts_text(text_chunk)
                    if clean_text:
                        try:
                            async for chunk in self._synthesize_stream(clean_text):
                                if self.cancel_event.is_set():
                                    break
                                await self.websocket.send_bytes(chunk)
                                self._touch()
                        except WebSocketDisconnect:
                            break
                        except Exception as exc:
                            logger.exception("TTS error", exc_info=exc)
                            await self.send_signal({"type": "error", "code": "tts_failed", "msg": "TTS 合成失败"})
                            self.is_speaking = False
                            continue
                # Reply finished: reset latency mode and notify the client.
                self.tts_first_chunk = True
                self.is_speaking = False
                await self.send_signal({"type": "reply_end"})
                continue

            temp_buffer.append(token)
            last_char = token[-1] if token else ""
            # First phrase flushes early (8 chars) to minimize time-to-audio.
            threshold = 8 if self.tts_first_chunk else 18
            if last_char in punctuations or len("".join(temp_buffer)) >= threshold:
                text_chunk = "".join(temp_buffer)
                temp_buffer = []
                self.tts_first_chunk = False
                clean_text = self._clean_tts_text(text_chunk)
                if not clean_text:
                    continue
                try:
                    async for chunk in self._synthesize_stream(clean_text):
                        if self.cancel_event.is_set():
                            break
                        await self.websocket.send_bytes(chunk)
                        self._touch()
                except WebSocketDisconnect:
                    break
                except Exception as exc:
                    logger.exception("TTS error", exc_info=exc)
                    await self.send_signal({"type": "error", "code": "tts_failed", "msg": "TTS 合成失败"})
        self.is_speaking = False
        # Unreachable in normal flow, but kept as a safeguard if the loop logic changes.
|
|
|
|
|
|
|
|
|
|
|
|
async def _synthesize_stream(self, text: str):
|
|
|
|
|
|
"""
|
|
|
|
|
|
调用 cosyvoice v2 流式合成,逐 chunk 返回。
|
|
|
|
|
|
如流式不可用则回落一次性合成。
|
|
|
|
|
|
"""
|
|
|
|
|
|
model = settings.VOICE_CALL_TTS_MODEL or "cosyvoice-v2"
|
|
|
|
|
|
voice = self._pick_voice_code() or settings.VOICE_CALL_TTS_VOICE or "longxiaochun_v2"
|
|
|
|
|
|
fmt = settings.VOICE_CALL_TTS_FORMAT.lower() if settings.VOICE_CALL_TTS_FORMAT else "mp3"
|
|
|
|
|
|
audio_format = AudioFormat.MP3_22050HZ_MONO_256KBPS if fmt == "mp3" else AudioFormat.PCM_16000HZ_MONO
|
|
|
|
|
|
|
|
|
|
|
|
# 直接同步合成,避免流式阻塞
|
|
|
|
|
|
audio_bytes, _fmt_name = synthesize(text, model=model, voice=voice, audio_format=audio_format) # type: ignore[arg-type]
|
|
|
|
|
|
yield audio_bytes
|
|
|
|
|
|
|
|
|
|
|
|
    async def feed_audio(self, data: bytes):
        """Accumulate one incoming audio chunk from the WebSocket.

        The client uploads MP3 file chunks, so nothing is fed to the realtime
        recognizer here; bytes are buffered until finalize_asr() runs batch ASR.
        """
        logger.info(f"📥 feed_audio 被调用,数据大小: {len(data)} 字节")
        if self.require_ptt and not self.mic_enabled:
            # Push-to-talk with the mic released: drop the audio.
            logger.warning("⚠️ PTT 模式下 mic 未启用,丢弃音频")
            self._touch()
            return

        # Lazily create the buffer that collects the client's MP3 file chunks.
        if not hasattr(self, '_audio_buffer'):
            self._audio_buffer = bytearray()

        self._audio_buffer.extend(data)
        logger.info(f"📦 累积音频数据,当前缓冲区大小: {len(self._audio_buffer)} 字节")

        # Realtime ASR is deliberately skipped (MP3 vs PCM format mismatch);
        # all recognition happens in batch inside finalize_asr().
        logger.info("🔄 跳过实时ASR启动,使用批量ASR处理MP3音频")

        logger.debug("recv audio chunk bytes=%s", len(data))
        # Crude voice-activity detection based purely on payload size.
        if len(data) > 100:
            self.last_voice_activity = time.time()
            self.has_voice_input = True
            logger.info(f"🎤 检测到音频数据块")
        self._touch()
|
|
|
|
|
|
|
# 2026-03-03 19:06:01 +08:00
# 2026-01-31 19:15:41 +08:00
|
|
|
|
    def finalize_asr(self):
        """Stop audio capture and run batch ASR over the buffered MP3 data.

        Uploads the accumulated audio to OSS, submits a DashScope batch
        transcription job, polls for up to 30 seconds and schedules the
        recognized text into the dialogue pipeline. Every failure path falls
        back to a friendly canned sentence. Runs synchronously; results are
        handed back to the event loop via _schedule().
        """
        try:
            # Process whatever audio feed_audio() has accumulated so far.
            if hasattr(self, '_audio_buffer') and len(self._audio_buffer) > 0:
                logger.info(f"🎵 处理累积的音频数据,大小: {len(self._audio_buffer)} 字节")

                # Use the batch ASR API directly on the MP3 bytes, avoiding
                # any format conversion issues.
                try:
                    logger.info("🔄 使用批量ASR API处理MP3音频...")

                    import tempfile
                    import os
                    from dashscope.audio.asr import Transcription
                    from ..oss_utils import upload_audio_file, delete_audio_file

                    # Batch ASR needs a URL, so upload the audio to OSS first.
                    file_url = upload_audio_file(bytes(self._audio_buffer), "mp3")
                    logger.info(f"📤 音频已上传到OSS: {file_url}")

                    # Submit the asynchronous transcription job.
                    task_response = Transcription.async_call(
                        model='paraformer-v2',
                        file_urls=[file_url],
                        parameters={
                            'format': 'mp3',
                            'sample_rate': 16000,
                            'enable_words': False
                        }
                    )

                    if task_response.status_code == 200:
                        task_id = task_response.output.task_id
                        logger.info(f"📋 批量ASR任务创建成功: {task_id}")

                        # Poll for the result, 30 seconds at most.
                        import time
                        max_wait = 30
                        start_time = time.time()

                        while time.time() - start_time < max_wait:
                            try:
                                result = Transcription.wait(task=task_id)
                                if result.status_code == 200:
                                    if result.output.task_status == "SUCCEEDED":
                                        logger.info("✅ 批量ASR识别成功")

                                        # Collect the transcript text; each result item
                                        # points at a downloadable transcription JSON.
                                        text_result = ""
                                        if result.output.results:
                                            for item in result.output.results:
                                                if isinstance(item, dict) and 'transcription_url' in item:
                                                    # Download the transcription payload.
                                                    import requests
                                                    resp = requests.get(item['transcription_url'], timeout=10)
                                                    if resp.status_code == 200:
                                                        transcription_data = resp.json()
                                                        if 'transcripts' in transcription_data:
                                                            for transcript in transcription_data['transcripts']:
                                                                if 'text' in transcript:
                                                                    text_result += transcript['text'].strip() + " "

                                        text_result = text_result.strip()
                                        if text_result:
                                            logger.info(f"🎯 批量ASR识别结果: {text_result}")
                                            # Feed the recognized sentence into the dialogue.
                                            self._schedule(self.handle_sentence(text_result))
                                        else:
                                            logger.warning("批量ASR未识别到文本内容")
                                            self._schedule(self.handle_sentence("我听到了你的声音,但没有识别到具体内容"))
                                        break

                                    elif result.output.task_status == "FAILED":
                                        error_code = getattr(result.output, 'code', 'Unknown')
                                        logger.error(f"批量ASR任务失败: {error_code}")

                                        if error_code == "SUCCESS_WITH_NO_VALID_FRAGMENT":
                                            self._schedule(self.handle_sentence("我没有听到清晰的语音,请再说一遍"))
                                        else:
                                            self._schedule(self.handle_sentence("语音识别遇到了问题,请重试"))
                                        break
                                    else:
                                        # Still running; poll again shortly.
                                        time.sleep(2)
                                        continue
                                else:
                                    logger.error(f"批量ASR查询失败: {result.status_code}")
                                    break
                            except Exception as wait_error:
                                logger.error(f"等待批量ASR结果失败: {wait_error}")
                                break

                        # Timed out without a terminal status: acknowledge the user anyway.
                        if time.time() - start_time >= max_wait:
                            logger.warning("批量ASR处理超时")
                            self._schedule(self.handle_sentence("语音处理时间较长,我听到了你的声音"))

                    else:
                        logger.error(f"批量ASR任务创建失败: {task_response.status_code}")
                        self._schedule(self.handle_sentence("语音识别服务暂时不可用"))

                    # Best-effort cleanup of the temporary OSS object.
                    try:
                        delete_audio_file(file_url)
                        logger.info("OSS临时文件已清理")
                    except:
                        pass

                except Exception as batch_error:
                    logger.error(f"❌ 批量ASR处理失败: {batch_error}")
                    # Last-resort fallback: a friendly acknowledgement.
                    self._schedule(self.handle_sentence("我听到了你的声音,语音识别功能正在优化中"))

                # Drop the consumed audio.
                self._audio_buffer = bytearray()

            # Stop the realtime recognizer if one happens to be running.
            if self.recognition:
                self.recognition.stop()
                logger.info("实时ASR已停止")

        except Exception as exc:
            logger.warning("ASR finalize failed: %s", exc)
            # Even on failure, make sure the user gets some feedback.
            try:
                self._schedule(self.handle_sentence("我听到了你的声音"))
            except:
                pass
|
# 2026-01-31 19:15:41 +08:00
|
|
|
|
|
|
|
|
|
|
async def set_mic_enabled(self, enabled: bool, flush: bool = False):
|
|
|
|
|
|
if not self.require_ptt:
|
|
|
|
|
|
return
|
|
|
|
|
|
self.mic_enabled = enabled
|
|
|
|
|
|
await self.send_signal({"type": "info", "msg": "mic_on" if enabled else "mic_off"})
|
|
|
|
|
|
if not enabled and flush:
|
|
|
|
|
|
self.finalize_asr()
|
|
|
|
|
|
|
|
|
|
|
|
def _schedule(self, coro):
|
|
|
|
|
|
if self.loop:
|
|
|
|
|
|
self.loop.call_soon_threadsafe(asyncio.create_task, coro)
|
|
|
|
|
|
|
|
|
|
|
|
def _pick_voice_code(self) -> Optional[str]:
|
|
|
|
|
|
"""根据恋人配置或默认音色选择 voice_code。"""
|
|
|
|
|
|
if self.voice_code:
|
|
|
|
|
|
return self.voice_code
|
|
|
|
|
|
self._prepare_profile()
|
|
|
|
|
|
return self.voice_code
|
|
|
|
|
|
|
|
|
|
|
|
async def _interrupt(self):
|
|
|
|
|
|
self.cancel_event.set()
|
|
|
|
|
|
# 清空队列
|
|
|
|
|
|
while not self.llm_to_tts.empty():
|
|
|
|
|
|
try:
|
|
|
|
|
|
self.llm_to_tts.get_nowait()
|
|
|
|
|
|
except Exception:
|
|
|
|
|
|
break
|
|
|
|
|
|
await self.send_signal({"type": "interrupt", "code": "interrupted", "msg": "AI 打断,停止播放"})
|
|
|
|
|
|
self.is_speaking = False
|
|
|
|
|
|
self.last_interrupt_time = time.time()
|
|
|
|
|
|
|
|
|
|
|
|
    async def close(self):
        """Tear down the session: DB, recognizer, tasks, then the socket."""
        if self.db:
            try:
                self.db.close()
            except Exception:
                pass
        if self.recognition:
            try:
                self.recognition.stop()
            except Exception:
                pass
        # Cancel all background tasks; each handles CancelledError itself.
        if self.llm_task:
            self.llm_task.cancel()
        if self.tts_task:
            self.tts_task.cancel()
        if self.tts_stream_task:
            self.tts_stream_task.cancel()
        if self.idle_task:
            self.idle_task.cancel()
        if self.silence_task:
            self.silence_task.cancel()
        # Close the socket only if it is still connected.
        if self.websocket.client_state == WebSocketState.CONNECTED:
            await self.websocket.close()
|
|
|
|
|
|
|
|
|
|
|
|
async def send_signal(self, payload: dict):
|
|
|
|
|
|
if self.websocket.client_state != WebSocketState.CONNECTED:
|
|
|
|
|
|
return
|
|
|
|
|
|
try:
|
|
|
|
|
|
await self.websocket.send_text(json.dumps(payload, ensure_ascii=False))
|
|
|
|
|
|
self._touch()
|
|
|
|
|
|
except WebSocketDisconnect:
|
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
|
|
def _load_lover(self) -> Optional[Lover]:
|
|
|
|
|
|
if self.lover is not None:
|
|
|
|
|
|
return self.lover
|
|
|
|
|
|
try:
|
|
|
|
|
|
self.lover = self.db.query(Lover).filter(Lover.user_id == self.user.id).first()
|
|
|
|
|
|
except Exception as exc:
|
|
|
|
|
|
logger.warning("Load lover failed: %s", exc)
|
|
|
|
|
|
self.lover = None
|
|
|
|
|
|
return self.lover
|
|
|
|
|
|
|
|
|
|
|
|
    def _compose_system_prompt(self) -> str:
        """Build the system prompt: persona rules plus any lover personality."""
        parts = [
            f"你是用户 {self.user.nickname or '用户'} 的虚拟恋人,请用亲密、温暖、口语化的短句聊天,不要使用 Markdown 符号,不要输出表情、波浪线、星号或动作描述。",
            "回复必须是对话内容,不要包含括号/星号/动作描写/舞台指令,不要用拟声词凑字数,保持简短自然的中文口语句子。",
            "禁止涉政、违法、暴力、未成年相关内容。",
        ]
        lover = self._load_lover()
        if lover and lover.personality_prompt:
            # Append the configured persona when the lover profile defines one.
            parts.append(f"人格设定:{lover.personality_prompt}")
        return "\n".join(parts)
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
|
def _clean_tts_text(text: str) -> str:
|
|
|
|
|
|
if not text:
|
|
|
|
|
|
return ""
|
|
|
|
|
|
# 去掉常见 Markdown/代码标记,保留文字内容
|
|
|
|
|
|
text = re.sub(r"\*\*(.*?)\*\*", r"\1", text)
|
|
|
|
|
|
text = re.sub(r"`([^`]*)`", r"\1", text)
|
|
|
|
|
|
text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)
|
|
|
|
|
|
text = re.sub(r"\*[^\*]{0,80}\*", "", text) # 去掉 *动作* 片段
|
|
|
|
|
|
text = re.sub(r"[~~]+", "", text) # 去掉波浪线
|
|
|
|
|
|
text = text.replace("*", "")
|
|
|
|
|
|
text = re.sub(r"\s+", " ", text)
|
|
|
|
|
|
return text.strip()
|
|
|
|
|
|
|
|
|
|
|
|
    def _prepare_profile(self) -> None:
        """Resolve and cache the TTS voice code ahead of streaming.

        Resolution order: the lover's explicitly configured voice, then the
        default voice for the target gender, then the first voice of that
        gender. Best effort: DB errors are logged and swallowed.
        """
        try:
            lover = self._load_lover()
            # 1) Voice explicitly configured on the lover profile.
            if lover and lover.voice_id:
                voice = self.db.query(VoiceLibrary).filter(VoiceLibrary.id == lover.voice_id).first()
                if voice and voice.voice_code:
                    self.voice_code = voice.voice_code
                    return

            # 2) Determine the target gender: the lover's own, otherwise derived
            #    from the user's gender (1 maps to a female voice — presumably
            #    the opposite of the user; confirm the mapping with the schema).
            gender = None
            if lover and lover.gender:
                gender = lover.gender
            if not gender:
                gender = "female" if (self.user.gender or 0) == 1 else "male"

            # 2a) The default voice for that gender.
            voice = (
                self.db.query(VoiceLibrary)
                .filter(VoiceLibrary.gender == gender, VoiceLibrary.is_default.is_(True))
                .first()
            )
            if voice and voice.voice_code:
                self.voice_code = voice.voice_code
                return
            # 3) Fall back to the first voice of that gender by id.
            voice = (
                self.db.query(VoiceLibrary)
                .filter(VoiceLibrary.gender == gender)
                .order_by(VoiceLibrary.id.asc())
                .first()
            )
            if voice and voice.voice_code:
                self.voice_code = voice.voice_code
        except Exception as exc:
            logger.warning("Prepare profile failed: %s", exc)
|
|
|
|
|
|
|
|
|
|
|
|
def _touch(self):
|
|
|
|
|
|
self.last_activity = time.time()
|
|
|
|
|
|
|
|
|
|
|
|
async def _idle_watchdog(self):
|
|
|
|
|
|
timeout = settings.VOICE_CALL_IDLE_TIMEOUT or 0
|
|
|
|
|
|
if timeout <= 0:
|
|
|
|
|
|
return
|
|
|
|
|
|
try:
|
|
|
|
|
|
while True:
|
|
|
|
|
|
await asyncio.sleep(5)
|
|
|
|
|
|
if time.time() - self.last_activity > timeout:
|
|
|
|
|
|
await self.send_signal({"type": "error", "msg": "idle timeout"})
|
|
|
|
|
|
await self.close()
|
|
|
|
|
|
break
|
|
|
|
|
|
except asyncio.CancelledError:
|
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
|
|
async def _silence_watchdog(self):
|
|
|
|
|
|
"""长时间静默时关闭会话,ASR 常驻不再因短静音 stop。"""
|
|
|
|
|
|
try:
|
|
|
|
|
|
while True:
|
|
|
|
|
|
await asyncio.sleep(1.0)
|
|
|
|
|
|
if time.time() - self.last_voice_activity > 60:
|
|
|
|
|
|
logger.info("Long silence, closing session")
|
|
|
|
|
|
await self.send_signal({"type": "error", "msg": "idle timeout"})
|
|
|
|
|
|
await self.close()
|
|
|
|
|
|
break
|
|
|
|
|
|
except asyncio.CancelledError:
|
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
|
def _peak_pcm16(data: bytes) -> int:
|
|
|
|
|
|
"""快速估算 PCM 16bit 峰值幅度。"""
|
|
|
|
|
|
if not data:
|
|
|
|
|
|
return 0
|
|
|
|
|
|
view = memoryview(data)
|
|
|
|
|
|
# 每 2 字节一采样,取绝对值最大
|
|
|
|
|
|
max_val = 0
|
|
|
|
|
|
for i in range(0, len(view) - 1, 2):
|
|
|
|
|
|
sample = int.from_bytes(view[i : i + 2], "little", signed=True)
|
|
|
|
|
|
if sample < 0:
|
|
|
|
|
|
sample = -sample
|
|
|
|
|
|
if sample > max_val:
|
|
|
|
|
|
max_val = sample
|
|
|
|
|
|
return max_val
|
|
|
|
|
|
|
|
|
|
|
|
|
# 2026-03-03 19:06:01 +08:00
|
|
|
|
@router.post("/call/asr")
|
# 2026-03-04 12:04:21 +08:00
|
|
|
|
async def json_asr(
|
|
|
|
|
|
request: dict,
|
|
|
|
|
|
user: AuthedUser = Depends(get_current_user)
|
|
|
|
|
|
):
|
|
|
|
|
|
"""JSON ASR:接收 base64 编码的音频数据并返回识别结果"""
|
|
|
|
|
|
try:
|
|
|
|
|
|
# 从请求中提取音频数据
|
|
|
|
|
|
if 'audio_data' not in request:
|
|
|
|
|
|
logger.error("请求中缺少 audio_data 字段")
|
|
|
|
|
|
raise HTTPException(status_code=400, detail="缺少 audio_data 字段")
|
|
|
|
|
|
|
|
|
|
|
|
audio_base64 = request['audio_data']
|
|
|
|
|
|
audio_format = request.get('format', 'mp3')
|
|
|
|
|
|
|
|
|
|
|
|
logger.info(f"收到 JSON ASR 请求,格式: {audio_format}")
|
|
|
|
|
|
|
|
|
|
|
|
# 解码 base64 音频数据
|
|
|
|
|
|
try:
|
|
|
|
|
|
import base64
|
|
|
|
|
|
audio_data = base64.b64decode(audio_base64)
|
|
|
|
|
|
logger.info(f"解码音频数据成功,大小: {len(audio_data)} 字节")
|
|
|
|
|
|
except Exception as decode_error:
|
|
|
|
|
|
logger.error(f"base64 解码失败: {decode_error}")
|
|
|
|
|
|
raise HTTPException(status_code=400, detail="音频数据解码失败")
|
|
|
|
|
|
|
|
|
|
|
|
# 检查音频数据是否为空
|
|
|
|
|
|
if not audio_data:
|
|
|
|
|
|
logger.error("解码后的音频数据为空")
|
|
|
|
|
|
raise HTTPException(status_code=400, detail="音频数据为空")
|
|
|
|
|
|
|
|
|
|
|
|
# 计算预期的音频时长
|
|
|
|
|
|
if audio_format.lower() == 'mp3':
|
|
|
|
|
|
# MP3 文件,粗略估算时长
|
|
|
|
|
|
expected_duration = len(audio_data) / 16000 # 粗略估算
|
|
|
|
|
|
logger.info(f"MP3 音频数据,预估时长: {expected_duration:.2f} 秒")
|
|
|
|
|
|
else:
|
|
|
|
|
|
# PCM 格式:16kHz 单声道 16bit,每秒需要 32000 字节
|
|
|
|
|
|
expected_duration = len(audio_data) / 32000
|
|
|
|
|
|
logger.info(f"PCM 音频数据,预期时长: {expected_duration:.2f} 秒")
|
|
|
|
|
|
|
|
|
|
|
|
if expected_duration < 0.1:
|
|
|
|
|
|
logger.warning("音频时长太短,可能无法识别")
|
|
|
|
|
|
test_text = f"音频时长太短({expected_duration:.2f}秒),请说话时间长一些"
|
|
|
|
|
|
from ..response import success_response
|
|
|
|
|
|
return success_response({"text": test_text})
|
|
|
|
|
|
|
|
|
|
|
|
# 检查 DashScope 配置
|
|
|
|
|
|
if not settings.DASHSCOPE_API_KEY:
|
|
|
|
|
|
logger.error("未配置 DASHSCOPE_API_KEY")
|
|
|
|
|
|
test_text = f"ASR 未配置,收到 {expected_duration:.1f}秒 音频"
|
|
|
|
|
|
from ..response import success_response
|
|
|
|
|
|
return success_response({"text": test_text})
|
|
|
|
|
|
|
|
|
|
|
|
# 设置 API Key
|
|
|
|
|
|
dashscope.api_key = settings.DASHSCOPE_API_KEY
|
|
|
|
|
|
|
|
|
|
|
|
# 使用 DashScope 进行批量 ASR
|
|
|
|
|
|
logger.info("开始调用 DashScope ASR...")
|
|
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
|
from dashscope.audio.asr import Transcription
|
|
|
|
|
|
from ..oss_utils import upload_audio_file, delete_audio_file, test_oss_connection
|
|
|
|
|
|
|
|
|
|
|
|
# 首先测试 OSS 连接
|
|
|
|
|
|
logger.info("测试 OSS 连接...")
|
|
|
|
|
|
if not test_oss_connection():
|
|
|
|
|
|
# OSS 连接失败,使用临时方案
|
|
|
|
|
|
logger.warning("OSS 连接失败,使用临时测试方案")
|
|
|
|
|
|
test_text = f"OSS 暂不可用,但成功接收到 {expected_duration:.1f}秒 {audio_format.upper()} 音频数据({len(audio_data)} 字节)"
|
|
|
|
|
|
from ..response import success_response
|
|
|
|
|
|
return success_response({"text": test_text})
|
|
|
|
|
|
|
|
|
|
|
|
logger.info("OSS 连接测试通过")
|
|
|
|
|
|
|
|
|
|
|
|
# 上传音频文件到 OSS
|
|
|
|
|
|
logger.info(f"上传 {audio_format.upper()} 音频到 OSS...")
|
|
|
|
|
|
file_url = upload_audio_file(audio_data, audio_format)
|
|
|
|
|
|
logger.info(f"音频文件上传成功: {file_url}")
|
|
|
|
|
|
|
|
|
|
|
|
# 调用 DashScope ASR
|
|
|
|
|
|
try:
|
|
|
|
|
|
logger.info("调用 DashScope Transcription API...")
|
|
|
|
|
|
logger.info(f"使用文件 URL: {file_url}")
|
|
|
|
|
|
|
|
|
|
|
|
task_response = Transcription.async_call(
|
|
|
|
|
|
model='paraformer-v2',
|
|
|
|
|
|
file_urls=[file_url],
|
|
|
|
|
|
parameters={
|
|
|
|
|
|
'format': audio_format,
|
|
|
|
|
|
'sample_rate': 16000,
|
|
|
|
|
|
'enable_words': False
|
|
|
|
|
|
}
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
logger.info(f"ASR 任务响应: status_code={task_response.status_code}")
|
|
|
|
|
|
|
|
|
|
|
|
if task_response.status_code != 200:
|
|
|
|
|
|
error_msg = getattr(task_response, 'message', 'Unknown error')
|
|
|
|
|
|
logger.error(f"ASR 任务创建失败: {error_msg}")
|
|
|
|
|
|
raise Exception(f"ASR 任务创建失败: {error_msg}")
|
|
|
|
|
|
|
|
|
|
|
|
task_id = task_response.output.task_id
|
|
|
|
|
|
logger.info(f"ASR 任务已创建: {task_id}")
|
|
|
|
|
|
|
|
|
|
|
|
# 等待识别完成
|
|
|
|
|
|
logger.info("等待 ASR 识别完成...")
|
|
|
|
|
|
import time
|
|
|
|
|
|
|
|
|
|
|
|
max_wait_time = 30
|
|
|
|
|
|
start_time = time.time()
|
|
|
|
|
|
|
|
|
|
|
|
transcribe_response = None
|
|
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
|
import threading
|
|
|
|
|
|
import queue
|
|
|
|
|
|
|
|
|
|
|
|
result_queue = queue.Queue()
|
|
|
|
|
|
exception_queue = queue.Queue()
|
|
|
|
|
|
|
|
|
|
|
|
def wait_for_result():
|
|
|
|
|
|
try:
|
|
|
|
|
|
result = Transcription.wait(task=task_id)
|
|
|
|
|
|
result_queue.put(result)
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
|
exception_queue.put(e)
|
|
|
|
|
|
|
|
|
|
|
|
# 启动等待线程
|
|
|
|
|
|
wait_thread = threading.Thread(target=wait_for_result)
|
|
|
|
|
|
wait_thread.daemon = True
|
|
|
|
|
|
wait_thread.start()
|
|
|
|
|
|
|
|
|
|
|
|
# 轮询检查结果或超时
|
|
|
|
|
|
while time.time() - start_time < max_wait_time:
|
|
|
|
|
|
try:
|
|
|
|
|
|
transcribe_response = result_queue.get_nowait()
|
|
|
|
|
|
logger.info("ASR 任务完成")
|
|
|
|
|
|
break
|
|
|
|
|
|
except queue.Empty:
|
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
|
exception = exception_queue.get_nowait()
|
|
|
|
|
|
logger.error(f"ASR 等待过程中出错: {exception}")
|
|
|
|
|
|
raise exception
|
|
|
|
|
|
except queue.Empty:
|
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
|
|
elapsed = time.time() - start_time
|
|
|
|
|
|
logger.info(f"ASR 任务仍在处理中... 已等待 {elapsed:.1f}秒")
|
|
|
|
|
|
time.sleep(2)
|
|
|
|
|
|
|
|
|
|
|
|
if transcribe_response is None:
|
|
|
|
|
|
logger.error(f"ASR 任务超时({max_wait_time}秒)")
|
|
|
|
|
|
from ..response import success_response
|
|
|
|
|
|
return success_response({"text": f"语音识别处理时间较长,请稍后重试(音频时长: {expected_duration:.1f}秒)"})
|
|
|
|
|
|
|
|
|
|
|
|
except Exception as wait_error:
|
|
|
|
|
|
logger.error(f"ASR 等待过程中出错: {wait_error}")
|
|
|
|
|
|
from ..response import success_response
|
|
|
|
|
|
return success_response({"text": f"语音识别服务暂时不可用,请稍后重试"})
|
|
|
|
|
|
|
|
|
|
|
|
logger.info(f"ASR 识别响应: status_code={transcribe_response.status_code}")
|
|
|
|
|
|
|
|
|
|
|
|
if transcribe_response.status_code != 200:
|
|
|
|
|
|
error_msg = getattr(transcribe_response, 'message', 'Unknown error')
|
|
|
|
|
|
logger.error(f"ASR 识别失败: {error_msg}")
|
|
|
|
|
|
raise Exception(f"ASR 识别失败: {error_msg}")
|
|
|
|
|
|
|
|
|
|
|
|
# 检查任务状态
|
|
|
|
|
|
result = transcribe_response.output
|
|
|
|
|
|
logger.info(f"ASR 任务状态: {result.task_status}")
|
|
|
|
|
|
|
|
|
|
|
|
if result.task_status == "SUCCEEDED":
|
|
|
|
|
|
logger.info("ASR 识别成功,开始解析结果...")
|
|
|
|
|
|
elif result.task_status == "FAILED":
|
|
|
|
|
|
error_code = getattr(result, 'code', 'Unknown')
|
|
|
|
|
|
error_message = getattr(result, 'message', 'Unknown error')
|
|
|
|
|
|
|
|
|
|
|
|
logger.error(f"ASR 任务失败: {error_code} - {error_message}")
|
|
|
|
|
|
|
|
|
|
|
|
if error_code == "SUCCESS_WITH_NO_VALID_FRAGMENT":
|
|
|
|
|
|
user_message = "音频中未检测到有效语音,请确保录音时有说话内容"
|
|
|
|
|
|
elif error_code == "DECODE_ERROR":
|
|
|
|
|
|
user_message = "音频格式解码失败,请检查录音设置"
|
|
|
|
|
|
logger.error("音频解码失败 - 可能的原因:")
|
|
|
|
|
|
logger.error("1. 音频格式不正确或损坏")
|
|
|
|
|
|
logger.error("2. 编码参数不匹配(建议:16kHz, 单声道, 64kbps)")
|
|
|
|
|
|
logger.error("3. 文件头信息缺失或错误")
|
|
|
|
|
|
elif error_code == "FILE_DOWNLOAD_FAILED":
|
|
|
|
|
|
user_message = "无法下载音频文件,请检查网络连接"
|
|
|
|
|
|
elif error_code == "AUDIO_FORMAT_UNSUPPORTED":
|
|
|
|
|
|
user_message = "音频格式不支持,请使用标准格式录音"
|
|
|
|
|
|
else:
|
|
|
|
|
|
user_message = f"语音识别失败: {error_message}"
|
|
|
|
|
|
|
|
|
|
|
|
from ..response import success_response
|
|
|
|
|
|
return success_response({"text": user_message})
|
|
|
|
|
|
else:
|
|
|
|
|
|
logger.warning(f"ASR 任务状态未知: {result.task_status}")
|
|
|
|
|
|
from ..response import success_response
|
|
|
|
|
|
return success_response({"text": f"语音识别状态异常: {result.task_status}"})
|
|
|
|
|
|
|
|
|
|
|
|
# 解析识别结果
|
|
|
|
|
|
text_result = ""
|
|
|
|
|
|
|
|
|
|
|
|
if hasattr(result, 'results') and result.results:
|
|
|
|
|
|
logger.info(f"找到 results 字段,长度: {len(result.results)}")
|
|
|
|
|
|
|
|
|
|
|
|
for i, item in enumerate(result.results):
|
|
|
|
|
|
if isinstance(item, dict) and 'transcription_url' in item and item['transcription_url']:
|
|
|
|
|
|
transcription_url = item['transcription_url']
|
|
|
|
|
|
logger.info(f"找到 transcription_url: {transcription_url}")
|
|
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
|
import requests
|
|
|
|
|
|
response = requests.get(transcription_url, timeout=10)
|
|
|
|
|
|
if response.status_code == 200:
|
|
|
|
|
|
transcription_data = response.json()
|
|
|
|
|
|
logger.info(f"转录数据: {transcription_data}")
|
|
|
|
|
|
|
|
|
|
|
|
if 'transcripts' in transcription_data:
|
|
|
|
|
|
for transcript in transcription_data['transcripts']:
|
|
|
|
|
|
if 'text' in transcript:
|
|
|
|
|
|
text_result += transcript['text'] + " "
|
|
|
|
|
|
logger.info(f"提取转录文本: {transcript['text']}")
|
|
|
|
|
|
|
|
|
|
|
|
if text_result.strip():
|
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
|
logger.error(f"处理 transcription_url 失败: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
text_result = text_result.strip()
|
|
|
|
|
|
|
|
|
|
|
|
if not text_result:
|
|
|
|
|
|
logger.warning("ASR 未识别到文本内容")
|
|
|
|
|
|
text_result = f"未识别到语音内容({expected_duration:.1f}秒音频)"
|
|
|
|
|
|
|
|
|
|
|
|
logger.info(f"最终 ASR 识别结果: {text_result}")
|
|
|
|
|
|
|
|
|
|
|
|
from ..response import success_response
|
|
|
|
|
|
return success_response({"text": text_result})
|
|
|
|
|
|
|
|
|
|
|
|
finally:
|
|
|
|
|
|
# 清理 OSS 上的临时文件
|
|
|
|
|
|
try:
|
|
|
|
|
|
delete_audio_file(file_url)
|
|
|
|
|
|
logger.info("OSS 临时文件已清理")
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
|
logger.warning(f"清理 OSS 文件失败: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
except Exception as asr_error:
|
|
|
|
|
|
logger.error(f"DashScope ASR 调用失败: {asr_error}", exc_info=True)
|
|
|
|
|
|
|
|
|
|
|
|
error_msg = str(asr_error)
|
|
|
|
|
|
if "OSS" in error_msg:
|
|
|
|
|
|
test_text = f"OSS 配置问题,收到 {expected_duration:.1f}秒 音频"
|
|
|
|
|
|
elif "Transcription" in error_msg:
|
|
|
|
|
|
test_text = f"ASR 服务异常,收到 {expected_duration:.1f}秒 音频"
|
|
|
|
|
|
else:
|
|
|
|
|
|
test_text = f"ASR 处理失败,收到 {expected_duration:.1f}秒 音频"
|
|
|
|
|
|
|
|
|
|
|
|
logger.info(f"返回备用文本: {test_text}")
|
|
|
|
|
|
|
|
|
|
|
|
from ..response import success_response
|
|
|
|
|
|
return success_response({"text": test_text})
|
|
|
|
|
|
|
|
|
|
|
|
except HTTPException:
|
|
|
|
|
|
raise
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
|
logger.error(f"JSON ASR 处理错误: {e}", exc_info=True)
|
|
|
|
|
|
raise HTTPException(status_code=500, detail=f"ASR 处理失败: {str(e)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _friendly_asr_failure_message(error_code: str, error_message: str) -> str:
    """Map a DashScope transcription failure code to a user-facing Chinese message."""
    if error_code == "FILE_DOWNLOAD_FAILED":
        return "无法下载音频文件,请检查网络连接"
    if error_code == "SUCCESS_WITH_NO_VALID_FRAGMENT":
        return "音频中未检测到有效语音,请确保录音时有说话内容"
    if error_code == "AUDIO_FORMAT_UNSUPPORTED":
        return "音频格式不支持,请使用标准格式录音"
    return f"语音识别失败: {error_message}"


def _upload_pcm_as_wav(pcm_data: bytes, upload_audio_file) -> str:
    """Wrap raw 16kHz/mono/16-bit PCM bytes into a WAV file and upload it to OSS.

    Returns the public OSS URL. The local temporary file is always removed.
    """
    import wave
    import tempfile
    import os

    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
        with wave.open(temp_file.name, 'wb') as wav_file:
            wav_file.setnchannels(1)      # 单声道
            wav_file.setsampwidth(2)      # 16-bit
            wav_file.setframerate(16000)  # 16kHz
            wav_file.writeframes(pcm_data)
        temp_file_path = temp_file.name

    try:
        with open(temp_file_path, 'rb') as f:
            wav_data = f.read()
        logger.info("上传 WAV 文件到 OSS...")
        file_url = upload_audio_file(wav_data, "wav")
        logger.info(f"WAV 文件上传成功: {file_url}")
        return file_url
    finally:
        # 清理本地临时文件;失败只记告警,不影响主流程
        try:
            os.unlink(temp_file_path)
        except Exception as e:
            logger.warning(f"清理临时文件失败: {e}")


async def _await_transcription(transcription_cls, task_id: str, max_wait_time: float):
    """Wait for a DashScope transcription task without blocking the event loop.

    The SDK's blocking ``Transcription.wait`` runs in a daemon thread; this
    coroutine polls its result every 3 seconds with ``asyncio.sleep`` (the
    original implementation used ``time.sleep`` here, which froze the whole
    event loop for up to ``max_wait_time`` seconds).

    Returns the transcription response, or None on timeout. Re-raises any
    exception the SDK call produced.
    """
    import threading
    import queue

    result_queue = queue.Queue()
    exception_queue = queue.Queue()

    def wait_for_result():
        try:
            result_queue.put(transcription_cls.wait(task=task_id))
        except Exception as e:
            exception_queue.put(e)

    wait_thread = threading.Thread(target=wait_for_result, daemon=True)
    wait_thread.start()

    start_time = time.time()
    while time.time() - start_time < max_wait_time:
        try:
            response = result_queue.get_nowait()
            logger.info("ASR 任务完成")
            return response
        except queue.Empty:
            pass

        try:
            pending_error = exception_queue.get_nowait()
        except queue.Empty:
            pending_error = None
        if pending_error is not None:
            logger.error(f"ASR 等待过程中出错: {pending_error}")
            raise pending_error

        elapsed = time.time() - start_time
        logger.info(f"ASR 任务仍在处理中... 已等待 {elapsed:.1f}秒")
        await asyncio.sleep(3)  # 非阻塞等待,每 3 秒检查一次

    return None


def _extract_transcription_text(result) -> str:
    """Best-effort extraction of recognized text from a DashScope result object.

    Tries, in order: each entry of ``result.results`` (dicts with a
    ``transcription_url`` to download, inline ``transcription``/``text``
    fields, or object attributes), then top-level attributes, then a
    last-resort string scan. Returns the stripped text (may be a fallback
    message, never raises).
    """
    text_result = ""

    if hasattr(result, 'results') and result.results:
        logger.info(f"找到 results 字段,长度: {len(result.results)}")
        for i, item in enumerate(result.results):
            if isinstance(item, dict):
                if item.get('transcription_url'):
                    # DashScope 的实际返回格式:结果放在一个可下载的 JSON 里
                    transcription_url = item['transcription_url']
                    logger.info(f"找到 transcription_url: {transcription_url}")
                    try:
                        response = requests.get(transcription_url, timeout=10)
                        if response.status_code == 200:
                            transcription_data = response.json()
                            logger.info(f"转录数据: {transcription_data}")
                            if 'transcripts' in transcription_data:
                                for transcript in transcription_data['transcripts']:
                                    if 'text' in transcript:
                                        text_result += transcript['text'] + " "
                                        logger.info(f"提取转录文本: {transcript['text']}")
                            elif 'text' in transcription_data:
                                text_result += transcription_data['text'] + " "
                                logger.info(f"提取直接文本: {transcription_data['text']}")
                            if text_result.strip():
                                break
                        else:
                            logger.error(f"下载转录结果失败: HTTP {response.status_code}")
                    except Exception as e:
                        logger.error(f"处理 transcription_url 失败: {e}")
                elif item.get('transcription'):
                    transcription = item['transcription']
                    logger.info(f"找到字段 transcription: {transcription}")
                    if isinstance(transcription, str):
                        text_result += transcription + " "
                    elif isinstance(transcription, dict):
                        for text_key in ('text', 'content', 'transcript'):
                            if text_key in transcription:
                                text_result += str(transcription[text_key]) + " "
                                break
                elif item.get('text'):
                    text_result += item['text'] + " "
            else:
                # item 是对象:探测常见属性名
                for attr in ('transcription', 'text', 'transcript', 'content'):
                    value = getattr(item, attr, None)
                    if isinstance(value, str) and value:
                        logger.info(f"找到属性 {attr}: {value}")
                        text_result += value + " "
                        break

    if not text_result:
        logger.info("未从 results 提取到文本,检查顶级字段")
        for attr in ('text', 'transcription', 'transcript', 'content'):
            value = getattr(result, attr, None)
            if value:
                logger.info(f"找到顶级属性 {attr}: {value}")
                text_result = str(value)
                break

    if not text_result:
        logger.warning("所有标准方法都未能提取到文本")
        result_str = str(result)
        if "text" in result_str.lower():
            text_result = "检测到语音内容,但解析格式需要调整"
        else:
            text_result = "语音识别成功,但未能解析文本内容"

    return text_result.strip()


@router.post("/call/batch_asr")
async def batch_asr(
    audio: UploadFile = File(...),
    user: AuthedUser = Depends(get_current_user)
):
    """批量 ASR:接收完整音频文件并返回识别结果。

    流程:读取上传音频 → 按格式粗估时长做快速校验 → 上传 OSS →
    创建 DashScope Transcription 异步任务 → 非阻塞轮询等待 → 解析文本。

    可预期的失败(OSS 不可用、ASR 超时/失败等)以 success_response
    返回一段用户可读的提示文本;只有协议级错误(空音频、未捕获异常)
    才抛出 HTTPException。
    """
    from ..response import success_response

    try:
        audio_data = await audio.read()
        logger.info(f"收到音频文件,大小: {len(audio_data)} 字节,文件名: {audio.filename}")

        if not audio_data:
            logger.error("音频数据为空")
            raise HTTPException(status_code=400, detail="音频数据为空")

        is_mp3 = bool(audio.filename and audio.filename.lower().endswith('.mp3'))
        if is_mp3:
            # MP3 无法直接计算时长,按经验字节率粗略估算
            expected_duration = len(audio_data) / 16000
            logger.info(f"MP3 音频文件,预估时长: {expected_duration:.2f} 秒")
        else:
            # PCM 格式:16kHz 单声道 16bit,每秒需要 32000 字节
            expected_duration = len(audio_data) / 32000
            logger.info(f"PCM 音频文件,预期时长: {expected_duration:.2f} 秒")

        if expected_duration < 0.1:
            logger.warning("音频时长太短,可能无法识别")
            return success_response({"text": f"音频时长太短({expected_duration:.2f}秒),请说话时间长一些"})

        if not settings.DASHSCOPE_API_KEY:
            logger.error("未配置 DASHSCOPE_API_KEY")
            return success_response({"text": f"ASR 未配置,收到 {expected_duration:.1f}秒 音频"})

        dashscope.api_key = settings.DASHSCOPE_API_KEY
        logger.info("开始调用 DashScope ASR...")

        try:
            from dashscope.audio.asr import Transcription
            from ..oss_utils import upload_audio_file, delete_audio_file, test_oss_connection

            logger.info("测试 OSS 连接...")
            if not test_oss_connection():
                logger.warning("OSS 连接失败,使用临时测试方案")
                return success_response({"text": f"OSS 暂不可用,但成功接收到 {expected_duration:.1f}秒 MP3 音频文件({len(audio_data)} 字节)"})
            logger.info("OSS 连接测试通过")

            if is_mp3:
                logger.info("检测到 MP3 格式,直接上传")
                file_url = upload_audio_file(audio_data, "mp3")
                asr_format = "mp3"
                logger.info(f"MP3 文件上传成功: {file_url}")
            else:
                logger.info("检测到 PCM 格式,转换为 WAV")
                file_url = _upload_pcm_as_wav(audio_data, upload_audio_file)
                asr_format = "wav"

            try:
                logger.info("调用 DashScope Transcription API...")
                logger.info(f"使用文件 URL: {file_url}")

                task_response = Transcription.async_call(
                    model='paraformer-v2',
                    file_urls=[file_url],
                    parameters={
                        # 修复:原实现固定传 'mp3',PCM 路径实际上传的是 WAV,
                        # 现在按真实上传格式传递
                        'format': asr_format,
                        'sample_rate': 16000,
                        'enable_words': False
                    }
                )

                logger.info(f"ASR 任务响应: status_code={task_response.status_code}")
                logger.info(f"ASR 任务响应完整内容: {task_response}")
                if hasattr(task_response, 'message'):
                    logger.info(f"ASR 任务消息: {task_response.message}")
                if hasattr(task_response, 'output'):
                    logger.info(f"ASR 任务输出: {task_response.output}")

                if task_response.status_code != 200:
                    error_msg = getattr(task_response, 'message', 'Unknown error')
                    logger.error(f"ASR 任务创建失败: {error_msg}")
                    if hasattr(task_response, 'output') and task_response.output:
                        logger.error(f"错误详情: {task_response.output}")
                    raise Exception(f"ASR 任务创建失败: {error_msg}")

                task_id = task_response.output.task_id
                logger.info(f"ASR 任务已创建: {task_id}")

                # 最大等待 45 秒,给前端留足够缓冲
                logger.info("等待 ASR 识别完成...")
                max_wait_time = 45
                try:
                    transcribe_response = await _await_transcription(Transcription, task_id, max_wait_time)
                    if transcribe_response is None:
                        logger.error(f"ASR 任务超时({max_wait_time}秒),任务ID: {task_id}")
                        # 返回友好的超时消息而不是抛出异常
                        return success_response({"text": f"语音识别处理时间较长,请稍后重试(音频时长: {expected_duration:.1f}秒)"})
                except Exception as wait_error:
                    logger.error(f"ASR 等待过程中出错: {wait_error}")
                    return success_response({"text": "语音识别服务暂时不可用,请稍后重试"})

                logger.info(f"ASR 识别响应: status_code={transcribe_response.status_code}")
                if hasattr(transcribe_response, 'message'):
                    logger.info(f"ASR 识别消息: {transcribe_response.message}")

                if transcribe_response.status_code != 200:
                    error_msg = getattr(transcribe_response, 'message', 'Unknown error')
                    logger.error(f"ASR 识别失败: {error_msg}")
                    raise Exception(f"ASR 识别失败: {error_msg}")

                result = transcribe_response.output
                logger.info(f"ASR 任务状态: {result.task_status}")

                if result.task_status == "SUCCEEDED":
                    logger.info("ASR 识别成功,开始解析结果...")
                elif result.task_status == "FAILED":
                    error_code = getattr(result, 'code', 'Unknown')
                    error_message = getattr(result, 'message', 'Unknown error')
                    logger.error(f"ASR 任务失败: {error_code} - {error_message}")
                    return success_response({"text": _friendly_asr_failure_message(error_code, error_message)})
                else:
                    logger.warning(f"ASR 任务状态未知: {result.task_status}")
                    return success_response({"text": f"语音识别状态异常: {result.task_status}"})

                text_result = _extract_transcription_text(result)
                if not text_result:
                    logger.warning("ASR 未识别到文本内容")
                    text_result = f"未识别到语音内容({expected_duration:.1f}秒音频)"

                logger.info(f"最终 ASR 识别结果: {text_result}")
                return success_response({"text": text_result})

            finally:
                # 清理 OSS 上的临时文件
                try:
                    delete_audio_file(file_url)
                    logger.info("OSS 临时文件已清理")
                except Exception as e:
                    logger.warning(f"清理 OSS 文件失败: {e}")

        except Exception as asr_error:
            logger.error(f"DashScope ASR 调用失败: {asr_error}", exc_info=True)
            # ASR 失败时返回有意义的备用文本而不是报错
            error_msg = str(asr_error)
            if "OSS" in error_msg:
                test_text = f"OSS 配置问题,收到 {expected_duration:.1f}秒 音频"
            elif "Transcription" in error_msg:
                test_text = f"ASR 服务异常,收到 {expected_duration:.1f}秒 音频"
            else:
                test_text = f"ASR 处理失败,收到 {expected_duration:.1f}秒 音频"
            logger.info(f"返回备用文本: {test_text}")
            return success_response({"text": test_text})

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"ASR 处理错误: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"ASR 处理失败: {str(e)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-01-31 19:15:41 +08:00
|
|
|
|
@router.websocket("/call")
async def voice_call(websocket: WebSocket):
    """语音通话 WebSocket 入口。

    协议:
    - 二进制帧:音频数据,转交 session.feed_audio;
    - 文本帧:信令(mic_on/ptt_on、mic_off/ptt_off、ping、end/stop/flush),
      其余文本一律忽略并回发提示。
    连接关闭时无条件清理会话。
    """
    # 认证失败:按策略违规码直接关闭,不建立会话
    try:
        user = await authenticate_websocket(websocket)
    except HTTPException:
        await websocket.close(code=status.WS_1008_POLICY_VIOLATION)
        return

    # PTT(按键说话)开关:全局配置或查询参数任一开启即生效
    ptt_param = (websocket.query_params.get("ptt") or "").strip().lower()
    require_ptt = settings.VOICE_CALL_REQUIRE_PTT or ptt_param in ("1", "true", "yes", "on")
    session = VoiceCallSession(websocket, user, require_ptt=require_ptt)

    # 会话启动失败:尽力把错误回传给客户端后关闭
    try:
        await session.start()
    except HTTPException as exc:
        try:
            await websocket.accept()
            await websocket.send_text(json.dumps({"type": "error", "msg": exc.detail}))
            await websocket.close(code=status.WS_1011_INTERNAL_ERROR)
        except Exception:
            pass
        return

    try:
        while True:
            msg = await websocket.receive()

            if msg.get("bytes") is not None:
                chunk = msg["bytes"]
                logger.info(f"📨 收到二进制消息,大小: {len(chunk)} 字节")
                await session.feed_audio(chunk)
            elif msg.get("text"):
                # 简单心跳/信令
                text = msg["text"].strip()
                logger.info(f"📨 收到文本消息: {text}")
                command = text.lower()
                if command in ("mic_on", "ptt_on"):
                    await session.set_mic_enabled(True)
                elif command in ("mic_off", "ptt_off"):
                    await session.set_mic_enabled(False, flush=True)
                elif text == "ping":
                    await websocket.send_text("pong")
                elif text in ("end", "stop", "flush"):
                    logger.info("📥 收到结束信号,调用 finalize_asr")
                    session.finalize_asr()
                    await session.send_signal({"type": "info", "msg": "ASR stopped manually"})
                else:
                    await session.send_signal({"type": "info", "msg": "文本消息已忽略"})

            if msg.get("type") == "websocket.disconnect":
                break
    except WebSocketDisconnect:
        pass
    finally:
        await session.close()
|
2026-03-05 17:18:04 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@router.post("/call/conversation")
|
|
|
|
|
|
async def voice_conversation(
|
|
|
|
|
|
request: dict,
|
|
|
|
|
|
user: AuthedUser = Depends(get_current_user)
|
|
|
|
|
|
):
|
|
|
|
|
|
"""
|
|
|
|
|
|
完整的语音对话流程:
|
|
|
|
|
|
1. 接收音频数据(base64)
|
|
|
|
|
|
2. ASR 识别为文字
|
|
|
|
|
|
3. LLM 生成回复
|
|
|
|
|
|
4. TTS 合成语音
|
|
|
|
|
|
5. 返回语音数据(base64)
|
|
|
|
|
|
"""
|
|
|
|
|
|
try:
|
|
|
|
|
|
# 1. 接收并解码音频数据
|
|
|
|
|
|
if 'audio_data' not in request:
|
|
|
|
|
|
raise HTTPException(status_code=400, detail="缺少 audio_data 字段")
|
|
|
|
|
|
|
|
|
|
|
|
audio_base64 = request['audio_data']
|
|
|
|
|
|
audio_format = request.get('format', 'wav')
|
|
|
|
|
|
|
|
|
|
|
|
logger.info(f"收到语音对话请求,用户: {user.id}, 格式: {audio_format}")
|
|
|
|
|
|
|
|
|
|
|
|
# 解码音频
|
|
|
|
|
|
import base64
|
|
|
|
|
|
audio_data = base64.b64decode(audio_base64)
|
|
|
|
|
|
logger.info(f"音频数据大小: {len(audio_data)} 字节")
|
|
|
|
|
|
|
|
|
|
|
|
# 2. ASR 识别
|
|
|
|
|
|
logger.info("开始 ASR 识别...")
|
|
|
|
|
|
from dashscope.audio.asr import Transcription
|
|
|
|
|
|
from ..oss_utils import upload_audio_file, delete_audio_file
|
|
|
|
|
|
|
|
|
|
|
|
# 上传到 OSS
|
|
|
|
|
|
file_url = upload_audio_file(audio_data, audio_format)
|
|
|
|
|
|
logger.info(f"音频已上传: {file_url}")
|
|
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
|
# 调用 ASR
|
|
|
|
|
|
task_response = Transcription.async_call(
|
|
|
|
|
|
model='paraformer-v2',
|
|
|
|
|
|
file_urls=[file_url],
|
|
|
|
|
|
parameters={
|
|
|
|
|
|
'format': audio_format,
|
|
|
|
|
|
'sample_rate': 16000,
|
|
|
|
|
|
'enable_words': False
|
|
|
|
|
|
}
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
if task_response.status_code != 200:
|
|
|
|
|
|
raise Exception(f"ASR 任务创建失败")
|
|
|
|
|
|
|
|
|
|
|
|
task_id = task_response.output.task_id
|
|
|
|
|
|
logger.info(f"ASR 任务创建: {task_id}")
|
|
|
|
|
|
|
|
|
|
|
|
# 等待识别结果
|
|
|
|
|
|
import time
|
|
|
|
|
|
max_wait = 30
|
|
|
|
|
|
start_time = time.time()
|
|
|
|
|
|
user_text = None
|
|
|
|
|
|
|
|
|
|
|
|
while time.time() - start_time < max_wait:
|
|
|
|
|
|
result = Transcription.wait(task=task_id)
|
|
|
|
|
|
|
|
|
|
|
|
if result.status_code == 200:
|
|
|
|
|
|
if result.output.task_status == "SUCCEEDED":
|
|
|
|
|
|
# 解析识别结果
|
|
|
|
|
|
if hasattr(result.output, 'results') and result.output.results:
|
|
|
|
|
|
for item in result.output.results:
|
|
|
|
|
|
if isinstance(item, dict) and 'transcription_url' in item:
|
|
|
|
|
|
import requests
|
|
|
|
|
|
resp = requests.get(item['transcription_url'], timeout=10)
|
|
|
|
|
|
if resp.status_code == 200:
|
|
|
|
|
|
data = resp.json()
|
|
|
|
|
|
if 'transcripts' in data:
|
|
|
|
|
|
for transcript in data['transcripts']:
|
|
|
|
|
|
if 'text' in transcript:
|
|
|
|
|
|
user_text = transcript['text'].strip()
|
|
|
|
|
|
break
|
|
|
|
|
|
if user_text:
|
|
|
|
|
|
break
|
|
|
|
|
|
break
|
|
|
|
|
|
elif result.output.task_status == "FAILED":
|
|
|
|
|
|
error_code = getattr(result.output, 'code', 'Unknown')
|
|
|
|
|
|
logger.error(f"ASR 失败: {error_code}")
|
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
|
|
time.sleep(2)
|
|
|
|
|
|
|
|
|
|
|
|
if not user_text:
|
|
|
|
|
|
logger.warning("ASR 未识别到文本")
|
|
|
|
|
|
from ..response import success_response
|
|
|
|
|
|
return success_response({
|
|
|
|
|
|
"user_text": "",
|
|
|
|
|
|
"ai_text": "抱歉,我没有听清楚,请再说一遍",
|
|
|
|
|
|
"audio_data": None
|
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
|
|
logger.info(f"ASR 识别结果: {user_text}")
|
|
|
|
|
|
|
|
|
|
|
|
finally:
|
|
|
|
|
|
# 清理 OSS 文件
|
|
|
|
|
|
try:
|
|
|
|
|
|
delete_audio_file(file_url)
|
|
|
|
|
|
except:
|
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
|
|
# 3. LLM 生成回复
|
|
|
|
|
|
logger.info("开始 LLM 对话生成...")
|
|
|
|
|
|
|
|
|
|
|
|
# 获取用户的恋人信息
|
|
|
|
|
|
db = SessionLocal()
|
|
|
|
|
|
try:
|
|
|
|
|
|
lover = db.query(Lover).filter(Lover.user_id == user.id).first()
|
|
|
|
|
|
|
|
|
|
|
|
# 构建系统提示
|
|
|
|
|
|
system_prompt = f"你是用户 {user.nickname or '用户'} 的虚拟恋人,请用亲密、温暖、口语化的短句聊天。"
|
|
|
|
|
|
if lover and lover.personality_prompt:
|
|
|
|
|
|
system_prompt += f"\n人格设定:{lover.personality_prompt}"
|
|
|
|
|
|
|
|
|
|
|
|
# 构建对话历史
|
|
|
|
|
|
messages = [
|
|
|
|
|
|
{"role": "system", "content": system_prompt},
|
|
|
|
|
|
{"role": "user", "content": user_text}
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
|
|
# 调用 LLM
|
|
|
|
|
|
from ..llm import chat_completion
|
|
|
|
|
|
llm_result = chat_completion(messages)
|
|
|
|
|
|
ai_text = llm_result.content
|
|
|
|
|
|
|
|
|
|
|
|
logger.info(f"LLM 回复: {ai_text}")
|
|
|
|
|
|
|
|
|
|
|
|
finally:
|
|
|
|
|
|
db.close()
|
|
|
|
|
|
|
|
|
|
|
|
# 4. TTS 合成语音
|
|
|
|
|
|
logger.info("开始 TTS 语音合成...")
|
|
|
|
|
|
|
|
|
|
|
|
# 清理文本(去除 Markdown 等)
|
|
|
|
|
|
clean_text = re.sub(r"\*\*(.*?)\*\*", r"\1", ai_text)
|
|
|
|
|
|
clean_text = re.sub(r"`([^`]*)`", r"\1", clean_text)
|
|
|
|
|
|
clean_text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", clean_text)
|
|
|
|
|
|
clean_text = re.sub(r"\*[^\*]{0,80}\*", "", clean_text)
|
|
|
|
|
|
clean_text = re.sub(r"[~~]+", "", clean_text)
|
|
|
|
|
|
clean_text = clean_text.replace("*", "")
|
|
|
|
|
|
clean_text = re.sub(r"\s+", " ", clean_text).strip()
|
|
|
|
|
|
|
|
|
|
|
|
# 获取音色配置
|
|
|
|
|
|
db = SessionLocal()
|
|
|
|
|
|
try:
|
|
|
|
|
|
voice_code = None
|
|
|
|
|
|
lover = db.query(Lover).filter(Lover.user_id == user.id).first()
|
|
|
|
|
|
|
|
|
|
|
|
if lover and lover.voice_id:
|
|
|
|
|
|
voice = db.query(VoiceLibrary).filter(VoiceLibrary.id == lover.voice_id).first()
|
|
|
|
|
|
if voice and voice.voice_code:
|
|
|
|
|
|
voice_code = voice.voice_code
|
|
|
|
|
|
|
|
|
|
|
|
if not voice_code:
|
|
|
|
|
|
# 使用默认音色
|
|
|
|
|
|
gender = "female" if (user.gender or 0) == 1 else "male"
|
|
|
|
|
|
voice = db.query(VoiceLibrary).filter(
|
|
|
|
|
|
VoiceLibrary.gender == gender,
|
|
|
|
|
|
VoiceLibrary.is_default.is_(True)
|
|
|
|
|
|
).first()
|
|
|
|
|
|
if voice and voice.voice_code:
|
|
|
|
|
|
voice_code = voice.voice_code
|
|
|
|
|
|
else:
|
|
|
|
|
|
voice_code = settings.VOICE_CALL_TTS_VOICE or "longxiaochun_v2"
|
|
|
|
|
|
|
|
|
|
|
|
finally:
|
|
|
|
|
|
db.close()
|
|
|
|
|
|
|
|
|
|
|
|
# 调用 TTS
|
|
|
|
|
|
model = settings.VOICE_CALL_TTS_MODEL or "cosyvoice-v2"
|
|
|
|
|
|
audio_format_enum = AudioFormat.MP3_22050HZ_MONO_256KBPS
|
|
|
|
|
|
|
|
|
|
|
|
audio_bytes, _ = synthesize(
|
|
|
|
|
|
clean_text,
|
|
|
|
|
|
model=model,
|
|
|
|
|
|
voice=voice_code,
|
|
|
|
|
|
audio_format=audio_format_enum
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
logger.info(f"TTS 合成完成,音频大小: {len(audio_bytes)} 字节")
|
|
|
|
|
|
|
|
|
|
|
|
# 5. 返回结果
|
|
|
|
|
|
audio_base64_result = base64.b64encode(audio_bytes).decode('utf-8')
|
|
|
|
|
|
|
|
|
|
|
|
from ..response import success_response
|
|
|
|
|
|
return success_response({
|
|
|
|
|
|
"user_text": user_text,
|
|
|
|
|
|
"ai_text": ai_text,
|
|
|
|
|
|
"audio_data": audio_base64_result,
|
|
|
|
|
|
"audio_format": "mp3"
|
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
|
|
except HTTPException:
|
|
|
|
|
|
raise
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
|
logger.error(f"语音对话处理失败: {e}", exc_info=True)
|
|
|
|
|
|
raise HTTPException(status_code=500, detail=f"语音对话处理失败: {str(e)}")
|