botmodels/src/services/speech_service.py
Rodrigo Rodriguez (Pragmatismo) 5a43dc81c7 Rewrite BotModels as FastAPI multimodal AI service
Replace Azure Functions architecture with a modern FastAPI-based REST
API providing image, video, speech, and vision capabilities for General
Bots.

Key changes:
- Add FastAPI app with versioned API endpoints and OpenAPI docs
- Implement services for Stable Diffusion, Zeroscope, TTS/Whisper, BLIP2
- Add pydantic schemas for request/response validation
- Configure structured logging with structlog
- Support lazy model loading and GPU acceleration
- Update dependencies from Azure/TensorFlow stack to PyTorch/diffusers
2025-11-30 07:52:56 -03:00

229 lines
7 KiB
Python

import io
import tempfile
import time
from datetime import datetime
from pathlib import Path
from typing import Optional
from ..core.config import settings
from ..core.logging import get_logger
# Module-level logger for this file, obtained from the project's logging
# helper under the name "speech_service".
logger = get_logger("speech_service")
class SpeechService:
    """Text-to-speech (Coqui TTS) and speech-to-text (Whisper) service.

    Models are loaded lazily on the first call to a public coroutine. Either
    model may be unavailable; the public coroutines then return an error
    payload instead of raising.
    """

    def __init__(self):
        """Create an unloaded service bound to ``settings.device``."""
        self.tts_model = None
        self.whisper_model = None
        self.device = settings.device
        self._initialized = False

    def initialize(self):
        """Load both speech models once; later calls are no-ops.

        Failures are logged, not raised, so the service can keep running with
        whatever subset of models did load.
        """
        if self._initialized:
            return
        logger.info("Loading speech models")
        try:
            # Load TTS model (Coqui TTS)
            self._load_tts_model()
            # Load Whisper model for speech-to-text
            self._load_whisper_model()
            self._initialized = True
            logger.info("Speech models loaded successfully")
        except Exception as e:
            logger.error("Failed to load speech models", error=str(e))
            # Don't raise - allow service to run with partial functionality
            logger.warning("Speech service will have limited functionality")

    def _load_tts_model(self):
        """Load the TTS model; leave ``self.tts_model`` as None on failure."""
        try:
            # Imported lazily so the module still imports without the package.
            from TTS.api import TTS

            self.tts_model = TTS(
                model_name="tts_models/en/ljspeech/tacotron2-DDC",
                progress_bar=False,
                gpu=(self.device == "cuda"),
            )
            logger.info("TTS model loaded")
        except Exception as e:
            logger.warning("TTS model not available", error=str(e))
            self.tts_model = None

    def _load_whisper_model(self):
        """Load the Whisper model; leave ``self.whisper_model`` as None on failure.

        When ``settings.whisper_model_path`` exists it is passed to Whisper as
        the download root; otherwise Whisper's default location is used.
        """
        try:
            # Imported lazily so the module still imports without the package.
            import whisper

            model_size = "base"
            if Path(settings.whisper_model_path).exists():
                self.whisper_model = whisper.load_model(
                    model_size, download_root=settings.whisper_model_path
                )
            else:
                self.whisper_model = whisper.load_model(model_size)
            logger.info("Whisper model loaded", model=model_size)
        except Exception as e:
            logger.warning("Whisper model not available", error=str(e))
            self.whisper_model = None

    @staticmethod
    def _write_temp_audio(audio_data: bytes) -> str:
        """Write *audio_data* to a new temporary ``.wav`` file and return its path.

        The file is created with ``delete=False``; the caller owns it and must
        remove it (see ``_remove_temp_file``). If writing fails, the partially
        written file is removed before the exception propagates.
        """
        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        try:
            with tmp:
                tmp.write(audio_data)
        except Exception:
            SpeechService._remove_temp_file(tmp.name)
            raise
        return tmp.name

    @staticmethod
    def _remove_temp_file(tmp_path: Optional[str]) -> None:
        """Best-effort removal of *tmp_path*; a falsy path is ignored."""
        if not tmp_path:
            return
        try:
            Path(tmp_path).unlink()
        except OSError:
            # Cleanup must never mask the real result or error of the caller.
            pass

    async def generate(
        self,
        prompt: str,
        voice: Optional[str] = None,
        language: Optional[str] = None,
    ) -> dict:
        """Generate speech audio from text and write it as a WAV file.

        Args:
            prompt: Text to synthesise.
            voice: Accepted for API compatibility; currently only logged and
                not passed to the TTS call.
            language: Accepted for API compatibility; currently only logged
                and not passed to the TTS call.

        Returns:
            A dict with ``status`` ("completed" or "error"), ``file_path``
            (``/outputs/audio/<name>`` or None), ``generation_time`` in
            seconds and, on failure, ``error``.

        NOTE(review): the TTS call runs synchronously inside this coroutine,
        so the event loop is blocked while it runs.
        """
        if not self._initialized:
            self.initialize()
        start = time.time()
        if self.tts_model is None:
            logger.error("TTS model not available")
            return {
                "status": "error",
                "error": "TTS model not initialized",
                "file_path": None,
                "generation_time": time.time() - start,
            }
        try:
            # UTC timestamp; same format datetime.utcnow().strftime produced.
            timestamp = time.strftime("%Y%m%d_%H%M%S", time.gmtime(start))
            filename = f"{timestamp}_{hash(prompt) & 0xFFFFFF:06x}.wav"
            output_path = Path(settings.output_dir) / "audio" / filename
            # The audio directory may not exist yet on a fresh deployment.
            output_path.parent.mkdir(parents=True, exist_ok=True)
            logger.info(
                "Generating speech",
                text_length=len(prompt),
                voice=voice,
                language=language,
            )
            self.tts_model.tts_to_file(
                text=prompt,
                file_path=str(output_path),
            )
            generation_time = time.time() - start
            logger.info("Speech generated", file=filename, time=generation_time)
            return {
                "status": "completed",
                "file_path": f"/outputs/audio/{filename}",
                "generation_time": generation_time,
            }
        except Exception as e:
            logger.error("Speech generation failed", error=str(e))
            return {
                "status": "error",
                "error": str(e),
                "file_path": None,
                "generation_time": time.time() - start,
            }

    async def to_text(self, audio_data: bytes) -> dict:
        """Convert speech audio to text using Whisper.

        Args:
            audio_data: Raw bytes of the audio file to transcribe.

        Returns:
            A dict with ``text``, ``language`` and ``confidence``; on failure
            ``text`` is empty, ``confidence`` is 0.0 and ``error`` is set.
            The success ``confidence`` is a fixed placeholder (0.95), not a
            value computed from the model output.

        NOTE(review): transcription runs synchronously inside this coroutine,
        so the event loop is blocked while it runs.
        """
        if not self._initialized:
            self.initialize()
        start = time.time()
        if self.whisper_model is None:
            logger.error("Whisper model not available")
            return {
                "text": "",
                "language": None,
                "confidence": 0.0,
                "error": "Whisper model not initialized",
            }
        tmp_path = None
        try:
            # Whisper is given a file path, so the bytes go to disk first.
            tmp_path = self._write_temp_audio(audio_data)
            logger.info("Transcribing audio", file_size=len(audio_data))
            result = self.whisper_model.transcribe(tmp_path)
            transcription_time = time.time() - start
            logger.info(
                "Audio transcribed",
                text_length=len(result["text"]),
                language=result.get("language"),
                time=transcription_time,
            )
            return {
                "text": result["text"].strip(),
                "language": result.get("language", "en"),
                "confidence": 0.95,  # Whisper doesn't provide confidence directly
            }
        except Exception as e:
            logger.error("Speech-to-text failed", error=str(e))
            return {
                "text": "",
                "language": None,
                "confidence": 0.0,
                "error": str(e),
            }
        finally:
            # Always remove the temp file, including when transcription fails.
            self._remove_temp_file(tmp_path)

    async def detect_language(self, audio_data: bytes) -> dict:
        """Detect the language of spoken audio using Whisper.

        Args:
            audio_data: Raw bytes of the audio file to inspect.

        Returns:
            ``{"language": <code>, "confidence": <probability>}`` on success,
            or ``{"language": None, "error": <message>}`` on failure.
        """
        if not self._initialized:
            self.initialize()
        if self.whisper_model is None:
            return {"language": None, "error": "Whisper model not initialized"}
        tmp_path = None
        try:
            tmp_path = self._write_temp_audio(audio_data)
            import whisper

            # Load audio, fit it to Whisper's fixed window, and build the
            # spectrogram on the same device as the model.
            audio = whisper.load_audio(tmp_path)
            audio = whisper.pad_or_trim(audio)
            mel = whisper.log_mel_spectrogram(audio).to(self.whisper_model.device)
            _, probs = self.whisper_model.detect_language(mel)
            # Pick the language with the highest probability.
            detected_lang = max(probs, key=probs.get)
            confidence = probs[detected_lang]
            return {
                "language": detected_lang,
                "confidence": confidence,
            }
        except Exception as e:
            logger.error("Language detection failed", error=str(e))
            return {"language": None, "error": str(e)}
        finally:
            # Always remove the temp file, including when detection fails.
            self._remove_temp_file(tmp_path)
# Process-wide SpeechService instance, created on first request.
_service = None


def get_speech_service():
    """Return the shared SpeechService, constructing it on the first call."""
    global _service
    if _service is not None:
        return _service
    _service = SpeechService()
    return _service