botmodels/src/api/v1/endpoints/speech.py
Rodrigo Rodriguez (Pragmatismo) 5a43dc81c7 Rewrite BotModels as FastAPI multimodal AI service
Replace Azure Functions architecture with a modern FastAPI-based REST
API providing image, video, speech, and vision capabilities for General
Bots.

Key changes:
- Add FastAPI app with versioned API endpoints and OpenAPI docs
- Implement services for Stable Diffusion, Zeroscope, TTS/Whisper, BLIP2
- Add pydantic schemas for request/response validation
- Configure structured logging with structlog
- Support lazy model loading and GPU acceleration
- Update dependencies from Azure/TensorFlow stack to PyTorch/diffusers
2025-11-30 07:52:56 -03:00

85 lines
2.4 KiB
Python

from fastapi import APIRouter, Depends, File, UploadFile
from ....schemas.generation import (
GenerationResponse,
SpeechGenerateRequest,
SpeechToTextResponse,
)
from ....services.speech_service import get_speech_service
from ...dependencies import verify_api_key
# Router collecting the speech endpoints declared in this module; every route
# registered on it is served under the "/speech" prefix and tagged "Speech".
router = APIRouter(prefix="/speech", tags=["Speech"])
@router.post("/generate", response_model=GenerationResponse)
async def generate_speech(
    request: SpeechGenerateRequest,
    api_key: str = Depends(verify_api_key),
    service=Depends(get_speech_service),
):
    """
    Synthesize speech audio from text (Text-to-Speech).

    Args:
        request: Speech generation parameters:
            - prompt: Text to be spoken
            - voice: Voice model to use (optional, default: "default")
            - language: Language code (optional, default: "en")
        api_key: API key resolved by the authentication dependency
        service: Speech service instance resolved by dependency injection

    Returns:
        GenerationResponse with file path to generated audio and generation time
    """
    # `api_key` is not read below; it is declared so that the
    # verify_api_key dependency runs before this handler executes.
    synthesis_args = {
        "prompt": request.prompt,
        "voice": request.voice,
        "language": request.language,
    }
    outcome = await service.generate(**synthesis_args)

    # The service result is unpacked directly into the response schema.
    return GenerationResponse(**outcome)
@router.post("/totext", response_model=SpeechToTextResponse)
async def speech_to_text(
    file: UploadFile = File(...),
    api_key: str = Depends(verify_api_key),
    service=Depends(get_speech_service),
):
    """
    Transcribe spoken audio into text (Speech-to-Text) using Whisper.

    Supported audio formats: wav, mp3, m4a, flac, ogg

    Args:
        file: Uploaded audio file to transcribe
        api_key: API key resolved by the authentication dependency
        service: Speech service instance resolved by dependency injection

    Returns:
        SpeechToTextResponse with transcribed text, detected language, and confidence
    """
    # The whole upload is read into memory and handed to the service as bytes.
    raw_audio = await file.read()
    transcription = await service.to_text(raw_audio)

    # The service result is unpacked directly into the response schema.
    return SpeechToTextResponse(**transcription)
@router.post("/detect_language")
async def detect_language(
    file: UploadFile = File(...),
    api_key: str = Depends(verify_api_key),
    service=Depends(get_speech_service),
):
    """
    Identify the language spoken in an audio file using Whisper.

    Args:
        file: Uploaded audio file to analyze
        api_key: API key resolved by the authentication dependency
        service: Speech service instance resolved by dependency injection

    Returns:
        dict with detected language code and confidence score
    """
    # The whole upload is read into memory and handed to the service as bytes.
    raw_audio = await file.read()

    # No response schema is declared for this route, so the service result
    # is returned to the client unchanged.
    return await service.detect_language(raw_audio)