guoyu/Test/python/vosk_speech_server.py

306 lines
9.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Vosk 语音识别服务
轻量级、易安装、支持离线
"""
from flask import Flask, request, jsonify
from flask_cors import CORS
import os
import json
import wave
from difflib import SequenceMatcher
# Flask application object; the @app.route decorators in this file register
# their handlers on it.
app = Flask(__name__)
# Apply flask_cors to the app (no options are passed).
CORS(app)
# Module-level state, assigned by init_vosk_model().
vosk_model = None
model_loaded = False
def init_vosk_model(model_path="./vosk-model-small-cn-0.22"):
    """Load the Vosk model and publish it through module-level globals.

    On success ``vosk_model`` holds the loaded model, ``model_loaded`` is set
    to True, and ``KaldiRecognizer`` is bound at module level so other
    functions in this module can construct recognizers.

    Args:
        model_path: Directory of the unpacked Vosk model. The default is the
            small Chinese model directory, resolved against the current
            working directory.

    Returns:
        True when the model was loaded, False otherwise (the reason is
        printed to stdout).
    """
    # Listing KaldiRecognizer here makes the import below bind the name at
    # module scope, which is where the recognition code looks it up.
    global vosk_model, model_loaded, KaldiRecognizer
    try:
        # Imported lazily so a missing vosk package is reported as a load
        # failure instead of breaking the module import.
        from vosk import Model, KaldiRecognizer

        if not os.path.exists(model_path):
            print(f"[错误] 模型不存在: {model_path}")
            print("请下载模型https://alphacephei.com/vosk/models/vosk-model-small-cn-0.22.zip")
            return False

        print(f"正在加载模型: {model_path}")
        vosk_model = Model(model_path)
        model_loaded = True
        print("✓ 模型加载成功!")
        return True
    except Exception as e:
        print(f"✗ 模型加载失败: {str(e)}")
        model_loaded = False
        return False
def convert_audio_to_wav(input_path, output_path):
    """Convert an audio file to mono, 16 kHz, 16-bit WAV using pydub.

    Args:
        input_path: Path of the source audio file (format is auto-detected
            by pydub).
        output_path: Path the WAV file is written to.

    Returns:
        ``(True, None)`` on success, or ``(False, message)`` where *message*
        describes the failure. When the failure looks like a missing ffmpeg
        installation, the message contains installation hints.
    """
    # Rule out a missing input first. A missing ffmpeg/ffprobe executable
    # typically also surfaces as FileNotFoundError, and the two situations
    # need different messages.
    if isinstance(input_path, (str, bytes, os.PathLike)) and not os.path.isfile(input_path):
        return False, f"音频转换失败: 输入文件不存在: {input_path}"

    try:
        from pydub import AudioSegment

        print("[转换] 使用pydub转换音频...")
        # Load the audio (pydub detects the container/codec).
        audio = AudioSegment.from_file(input_path)
        # Normalise to mono, 16 kHz sample rate, 16-bit samples.
        audio = audio.set_channels(1)
        audio = audio.set_frame_rate(16000)
        audio = audio.set_sample_width(2)
        audio.export(output_path, format='wav')
        print("[转换] pydub转换成功")
        return True, None
    except Exception as e:
        error_msg = str(e).lower()
        # Classify by exception type: the class name is not part of str(e),
        # so searching the message for "filenotfounderror" never matches.
        ffmpeg_missing = (
            isinstance(e, FileNotFoundError)
            or 'ffmpeg' in error_msg
            or 'ffprobe' in error_msg
        )
        if ffmpeg_missing:
            return False, (
                "需要安装 ffmpeg 才能转换音频格式。\n"
                "请下载 ffmpeg: https://www.gyan.dev/ffmpeg/builds/\n"
                "或运行: pip install ffmpeg-python\n"
                "错误详情: " + str(e)
            )
        return False, f"音频转换失败: {str(e)}"
def recognize_audio(audio_path):
    """Transcribe an audio file with the loaded Vosk model.

    The file is first opened as WAV. If that fails, it is converted with
    ``convert_audio_to_wav`` into a temporary ``<audio_path>.converted.wav``
    file, which is removed again before this function returns.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        ``(text, None)`` on success, where *text* is the recognized segments
        joined by single spaces, or ``(None, message)`` on failure. The WAV
        data must be mono, 16-bit, at 8000/16000/32000/48000 Hz.
    """
    converted_path = None
    try:
        print(f"[识别] 尝试打开音频文件: {audio_path}")
        wf = None
        try:
            wf = wave.open(audio_path, "rb")
            print("[识别] 文件是有效的WAV格式")
        except Exception as e:
            # Any failure to parse the file as WAV is treated as "needs
            # conversion"; the conversion step reports real I/O problems.
            print(f"[识别] 不是有效的WAV格式: {str(e)}")

        if wf is None:
            print("[识别] 检测到非WAV格式开始转换...")
            converted_path = audio_path + '.converted.wav'
            success, error = convert_audio_to_wav(audio_path, converted_path)
            if not success:
                print(f"[识别] 转换失败: {error}")
                return None, f"音频格式转换失败: {error}"
            print(f"[识别] 转换成功: {converted_path}")
            wf = wave.open(converted_path, "rb")
            print("[识别] 转换后的文件已打开")

        # The context manager closes the wave file on every exit path,
        # including exceptions raised while recognizing.
        with wf:
            if wf.getnchannels() != 1:
                return None, "音频必须是单声道"
            if wf.getsampwidth() != 2:
                return None, "音频必须是16位"
            frame_rate = wf.getframerate()
            if frame_rate not in (8000, 16000, 32000, 48000):
                return None, f"不支持的采样率: {frame_rate}"

            rec = KaldiRecognizer(vosk_model, frame_rate)
            rec.SetWords(True)

            segments = []
            while True:
                data = wf.readframes(4000)
                if not data:
                    break
                # AcceptWaveform() returning true means a segment result
                # is ready to be collected.
                if rec.AcceptWaveform(data):
                    text = json.loads(rec.Result()).get('text', '')
                    if text:
                        segments.append(text)

            # Collect whatever the recognizer still holds after the last chunk.
            final_text = json.loads(rec.FinalResult()).get('text', '')
            if final_text:
                segments.append(final_text)

        return " ".join(segments).strip(), None
    except Exception as e:
        return None, str(e)
    finally:
        # Runs after the wave file has been closed, so the temporary file can
        # be removed on platforms that refuse to delete open files.
        if converted_path and os.path.exists(converted_path):
            try:
                os.remove(converted_path)
                print(f"[识别] 已清理转换文件: {converted_path}")
            except OSError as e:
                print(f"[识别] 清理转换文件失败: {e}")
def calculate_similarity(text1, text2):
    """Score how similar two texts are on a 0-100 scale.

    Space characters are ignored. If either text is empty (before or after
    removing spaces) the score is 0; otherwise it is the SequenceMatcher
    ratio scaled to 0-100 and rounded to two decimals.
    """
    if not (text1 and text2):
        return 0

    # Drop every space character so spacing does not affect the score.
    left = "".join(text1.split(" "))
    right = "".join(text2.split(" "))
    if not (left and right):
        return 0

    matcher = SequenceMatcher(None, left, right)
    return round(matcher.ratio() * 100, 2)
@app.route('/api/speech/recognize', methods=['POST'])
def recognize():
    """Handle POST /api/speech/recognize.

    Expects a multipart upload with an ``audio`` file and an optional
    ``referenceText`` form field. The audio is saved to a temporary file,
    transcribed with ``recognize_audio``, and compared against the reference
    text with ``calculate_similarity``.

    Returns:
        A JSON response with ``code``/``msg`` and, on success, a ``data``
        object containing the recognized text and scores. Failures are
        returned with HTTP status 400 (no file uploaded) or 500.
    """
    # Local import, mirroring how this handler previously imported its
    # filename helper; uuid avoids name clashes between concurrent requests
    # that a millisecond timestamp cannot rule out.
    import uuid

    try:
        # Refuse to work until the model has been loaded.
        if not model_loaded:
            return jsonify({
                'code': 500,
                'msg': '模型未加载,请检查服务器日志'
            }), 500

        # The upload must contain an "audio" part.
        if 'audio' not in request.files:
            return jsonify({
                'code': 400,
                'msg': '未上传音频文件'
            }), 400

        audio_file = request.files['audio']
        reference_text = request.form.get('referenceText', '')

        # Save the upload under a unique temporary name.
        temp_dir = './temp_audio'
        os.makedirs(temp_dir, exist_ok=True)
        temp_path = os.path.join(temp_dir, f'audio_{uuid.uuid4().hex}.wav')

        try:
            audio_file.save(temp_path)
            print(f"收到音频: {temp_path}")
            print(f"参考文本: {reference_text}")
            recognized_text, error = recognize_audio(temp_path)
        finally:
            # Best-effort removal that also runs when saving or recognizing
            # raises.
            try:
                os.remove(temp_path)
            except FileNotFoundError:
                pass  # save() failed before the file was created
            except OSError as e:
                print(f"清理临时文件失败: {e}")

        if error:
            return jsonify({
                'code': 500,
                'msg': f'识别失败: {error}'
            }), 500

        if not recognized_text:
            return jsonify({
                'code': 500,
                'msg': '未识别到有效语音'
            }), 500

        # The sub-scores are derived from the similarity score with fixed
        # offsets, floored at 0.
        score = calculate_similarity(recognized_text, reference_text)
        pronunciation_score = max(0, score - 5)
        fluency_score = max(0, score - 3)
        print(f"识别结果: {recognized_text}")
        print(f"相似度: {score}")

        return jsonify({
            'code': 200,
            'msg': '成功',
            'data': {
                'recognizedText': recognized_text,
                'score': score,
                'pronunciationScore': pronunciation_score,
                'fluencyScore': fluency_score,
                'status': 'completed'
            }
        })
    except Exception as e:
        print(f"处理错误: {str(e)}")
        return jsonify({
            'code': 500,
            'msg': f'处理失败: {str(e)}'
        }), 500
@app.route('/api/speech/health', methods=['GET'])
def health():
    """Handle GET /api/speech/health.

    Always answers with code 200 and reports whether the model has been
    loaded together with the engine name.
    """
    details = {
        'model_loaded': model_loaded,
        'engine': 'vosk'
    }
    payload = {'code': 200, 'msg': '服务正常', 'data': details}
    return jsonify(payload)
if __name__ == '__main__':
    # Startup banner.
    print("=" * 50, "Vosk 语音识别服务", "=" * 50, "", sep="\n")

    # The HTTP server is only started once the model has loaded.
    if not init_vosk_model():
        print("", "=" * 50, "服务启动失败!请检查模型文件", "=" * 50, sep="\n")
        # Keep the console window open until the user acknowledges.
        input("按回车键退出...")
    else:
        print(
            "",
            "=" * 50,
            "服务启动成功!",
            "访问地址: http://localhost:5000",
            "健康检查: http://localhost:5000/api/speech/health",
            "=" * 50,
            "",
            sep="\n",
        )
        # Listen on all interfaces, port 5000, with the debugger disabled.
        app.run(host='0.0.0.0', port=5000, debug=False)