# File: botmodels/src/services/speech_service.py
# Size: 230 lines, 7 KiB
# Language: Python
# (Page controls from the source viewer: Raw / Normal View / History)

import io
import tempfile
import time
from datetime import datetime
from pathlib import Path
from typing import Optional
from ..core.config import settings
from ..core.logging import get_logger
# Module-level logger for this service; calls below pass context as keyword arguments.
logger = get_logger("speech_service")
class SpeechService:
    """Text-to-speech (Coqui TTS) and speech-to-text (Whisper) service.

    Models are loaded lazily: every public method calls ``initialize()`` on
    first use. A model that fails to load is left as ``None`` and the
    corresponding methods return an error dict instead of raising.

    NOTE(review): the public methods are declared ``async`` but invoke the
    models synchronously (nothing is awaited), so a long synthesis or
    transcription blocks the calling event loop. Consider off-loading to a
    worker thread if that matters for the caller.
    """

    def __init__(self):
        """Create an unloaded service bound to ``settings.device``."""
        self.tts_model = None
        self.whisper_model = None
        self.device = settings.device
        self._initialized = False

    def initialize(self):
        """Load the TTS and Whisper models once; later calls are no-ops.

        Never raises: loading problems are logged and the service keeps
        running with whatever models are available.
        """
        if self._initialized:
            return
        logger.info("Loading speech models")
        try:
            # Load TTS model (Coqui TTS)
            self._load_tts_model()
            # Load Whisper model for speech-to-text
            self._load_whisper_model()
        except Exception as e:
            logger.error("Failed to load speech models", error=str(e))
            # Don't raise - allow service to run with partial functionality.
            # _initialized stays False so the next call retries the load.
            logger.warning("Speech service will have limited functionality")
            return
        self._initialized = True
        # The loaders swallow their own errors and leave the model as None,
        # so only report success when both models are actually present.
        if self.tts_model is None or self.whisper_model is None:
            logger.warning("Speech service will have limited functionality")
        else:
            logger.info("Speech models loaded successfully")

    def _load_tts_model(self):
        """Load TTS model for text-to-speech generation.

        On any failure (including the TTS package being absent) the error is
        logged and ``self.tts_model`` is set to ``None``.
        """
        try:
            # Imported lazily so the module still imports without TTS installed.
            from TTS.api import TTS

            # Use a fast, high-quality model
            self.tts_model = TTS(
                model_name="tts_models/en/ljspeech/tacotron2-DDC",
                progress_bar=False,
                gpu=(self.device == "cuda"),
            )
            logger.info("TTS model loaded")
        except Exception as e:
            logger.warning("TTS model not available", error=str(e))
            self.tts_model = None

    def _load_whisper_model(self):
        """Load Whisper model for speech-to-text.

        Uses ``settings.whisper_model_path`` as the download root when it is
        set and exists; otherwise falls back to Whisper's default location.
        On any failure the error is logged and ``self.whisper_model`` is set
        to ``None``.
        """
        try:
            # Imported lazily so the module still imports without whisper installed.
            import whisper

            # Use base model for balance of speed and accuracy
            model_size = "base"
            model_dir = settings.whisper_model_path
            # Guard against an unset path: Path(None) would raise and the
            # model would be reported as unavailable for no good reason.
            if model_dir and Path(model_dir).exists():
                self.whisper_model = whisper.load_model(
                    model_size, download_root=model_dir
                )
            else:
                self.whisper_model = whisper.load_model(model_size)
            logger.info("Whisper model loaded", model=model_size)
        except Exception as e:
            logger.warning("Whisper model not available", error=str(e))
            self.whisper_model = None

    @staticmethod
    def _write_temp_audio(audio_data: bytes) -> str:
        """Write *audio_data* to a new ``.wav`` temp file and return its path.

        The file is created with ``delete=False`` so it survives being closed
        and can be reopened by path; the caller must remove it with
        ``_remove_temp_file``.
        """
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            tmp.write(audio_data)
            return tmp.name

    @staticmethod
    def _remove_temp_file(path: Optional[str]) -> None:
        """Best-effort removal of a temp file; never raises.

        ``None``/empty paths and already-deleted files are ignored; other
        OS errors are logged as warnings.
        """
        if not path:
            return
        try:
            Path(path).unlink()
        except FileNotFoundError:
            pass
        except OSError as e:
            logger.warning("Failed to remove temp file", file=path, error=str(e))

    async def generate(
        self,
        prompt: str,
        voice: Optional[str] = None,
        language: Optional[str] = None,
    ) -> dict:
        """Generate speech audio from text.

        Args:
            prompt: Text to synthesize.
            voice: Logged only; not passed to the TTS model.
            language: Logged only; not passed to the TTS model.

        Returns:
            On success: ``{"status": "completed", "file_path", "generation_time"}``.
            On failure: ``{"status": "error", "error", "file_path": None,
            "generation_time"}``. Never raises for model or I/O errors.
        """
        if not self._initialized:
            self.initialize()
        start = time.time()
        if self.tts_model is None:
            logger.error("TTS model not available")
            return {
                "status": "error",
                "error": "TTS model not initialized",
                "file_path": None,
                "generation_time": time.time() - start,
            }
        if not prompt or not prompt.strip():
            # Nothing to synthesize; report it in the usual error shape
            # instead of handing empty text to the model.
            return {
                "status": "error",
                "error": "Prompt is empty",
                "file_path": None,
                "generation_time": time.time() - start,
            }
        try:
            # datetime.utcnow() is deprecated; an aware UTC "now" formats
            # to the same string.
            from datetime import timezone

            timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
            # hash() of a str varies between processes; it is used here only
            # to make the filename less likely to collide.
            filename = f"{timestamp}_{hash(prompt) & 0xFFFFFF:06x}.wav"
            output_path = Path(settings.output_dir) / "audio" / filename
            logger.info(
                "Generating speech",
                text_length=len(prompt),
                voice=voice,
                language=language,
            )
            # The audio sub-directory may not exist yet on a fresh install.
            output_path.parent.mkdir(parents=True, exist_ok=True)
            # Generate speech
            self.tts_model.tts_to_file(
                text=prompt,
                file_path=str(output_path),
            )
            generation_time = time.time() - start
            logger.info("Speech generated", file=filename, time=generation_time)
            return {
                "status": "completed",
                "file_path": f"/outputs/audio/{filename}",
                "generation_time": generation_time,
            }
        except Exception as e:
            logger.error("Speech generation failed", error=str(e))
            return {
                "status": "error",
                "error": str(e),
                "file_path": None,
                "generation_time": time.time() - start,
            }

    async def to_text(self, audio_data: bytes) -> dict:
        """Convert speech audio to text using Whisper.

        Args:
            audio_data: Raw bytes of an audio file; written to a temp file
                that is always removed before returning.

        Returns:
            On success: ``{"text", "language", "confidence"}`` where
            ``confidence`` is a fixed placeholder value.
            On failure: ``{"text": "", "language": None, "confidence": 0.0,
            "error"}``. Never raises for model or I/O errors.
        """
        if not self._initialized:
            self.initialize()
        start = time.time()
        if self.whisper_model is None:
            logger.error("Whisper model not available")
            return {
                "text": "",
                "language": None,
                "confidence": 0.0,
                "error": "Whisper model not initialized",
            }
        if not audio_data:
            return {
                "text": "",
                "language": None,
                "confidence": 0.0,
                "error": "No audio data provided",
            }
        tmp_path = None
        try:
            # Save audio to temporary file
            tmp_path = self._write_temp_audio(audio_data)
            logger.info("Transcribing audio", file_size=len(audio_data))
            # Transcribe
            result = self.whisper_model.transcribe(tmp_path)
            transcription_time = time.time() - start
            logger.info(
                "Audio transcribed",
                text_length=len(result["text"]),
                language=result.get("language"),
                time=transcription_time,
            )
            return {
                "text": result["text"].strip(),
                "language": result.get("language", "en"),
                "confidence": 0.95,  # Whisper doesn't provide confidence directly
            }
        except Exception as e:
            logger.error("Speech-to-text failed", error=str(e))
            return {
                "text": "",
                "language": None,
                "confidence": 0.0,
                "error": str(e),
            }
        finally:
            # Clean up temp file on success and failure alike.
            self._remove_temp_file(tmp_path)

    async def detect_language(self, audio_data: bytes) -> dict:
        """Detect the language of spoken audio.

        Args:
            audio_data: Raw bytes of an audio file; written to a temp file
                that is always removed before returning.

        Returns:
            On success: ``{"language", "confidence"}`` for the most probable
            language reported by the Whisper model.
            On failure: ``{"language": None, "error"}``. Never raises for
            model or I/O errors.
        """
        if not self._initialized:
            self.initialize()
        if self.whisper_model is None:
            return {"language": None, "error": "Whisper model not initialized"}
        if not audio_data:
            return {"language": None, "error": "No audio data provided"}
        tmp_path = None
        try:
            tmp_path = self._write_temp_audio(audio_data)
            import whisper

            # Load audio and detect language
            audio = whisper.load_audio(tmp_path)
            audio = whisper.pad_or_trim(audio)
            mel = whisper.log_mel_spectrogram(audio).to(self.whisper_model.device)
            _, probs = self.whisper_model.detect_language(mel)
            detected_lang = max(probs, key=probs.get)
            return {
                "language": detected_lang,
                "confidence": probs[detected_lang],
            }
        except Exception as e:
            logger.error("Language detection failed", error=str(e))
            return {"language": None, "error": str(e)}
        finally:
            # Clean up temp file on success and failure alike.
            self._remove_temp_file(tmp_path)
# Shared SpeechService instance for this module; populated on first request.
_service = None


def get_speech_service():
    """Return the module-wide SpeechService, constructing it on the first call."""
    global _service
    if _service is not None:
        return _service
    _service = SpeechService()
    return _service