From a21292daa30a35d2a1fc3792f7b8b9d2d6aa7392 Mon Sep 17 00:00:00 2001 From: "Rodrigo Rodriguez (Pragmatismo)" Date: Sat, 29 Nov 2025 20:40:08 -0300 Subject: [PATCH] Add multimodal module for botmodels integration Introduces IMAGE, VIDEO, AUDIO, and SEE keywords for BASIC scripts that connect to the botmodels service for AI-powered media generation and vision/captioning capabilities. - Add BotModelsClient for HTTP communication with botmodels service - Implement BASIC keywords: IMAGE, VIDEO, AUDIO (generation), SEE (captioning) - Support configuration via config.csv for models --- docs/multimodal-config.md | 191 +++++ src/basic/keywords/mod.rs | 1 + src/basic/keywords/multimodal.rs | 323 +++++++++ src/basic/mod.rs | 10 + src/lib.rs | 1 + src/multimodal/mod.rs | 652 ++++++++++++++++++ .../default.gbai/default.gbot/config.csv | 22 + 7 files changed, 1200 insertions(+) create mode 100644 docs/multimodal-config.md create mode 100644 src/basic/keywords/multimodal.rs create mode 100644 src/multimodal/mod.rs diff --git a/docs/multimodal-config.md b/docs/multimodal-config.md new file mode 100644 index 00000000..a95cc6b9 --- /dev/null +++ b/docs/multimodal-config.md @@ -0,0 +1,191 @@ +# Multimodal Configuration Guide + +This document describes how to configure botserver to use the botmodels service for image, video, audio generation, and vision/captioning capabilities. + +## Overview + +The multimodal feature connects botserver to botmodels - a Python-based service similar to llama.cpp but for multimodal AI tasks. This enables BASIC scripts to generate images, videos, audio, and analyze visual content. + +## Configuration Keys + +Add the following configuration to your bot's `config.csv` file: + +### Image Generator Settings + +| Key | Default | Description | +|-----|---------|-------------| +| `image-generator-model` | - | Path to the image generation model (e.g., `../../../../data/diffusion/sd_turbo_f16.gguf`) | +| `image-generator-steps` | `4` | Number of inference steps for image generation | +| `image-generator-width` | `512` | Output image width in pixels | +| `image-generator-height` | `512` | Output image height in pixels | +| `image-generator-gpu-layers` | `20` | Number of layers to offload to GPU | +| `image-generator-batch-size` | `1` | Batch size for generation | + +### Video Generator Settings + +| Key | Default | Description | +|-----|---------|-------------| +| `video-generator-model` | - | Path to the video generation model (e.g., `../../../../data/diffusion/zeroscope_v2_576w`) | +| `video-generator-frames` | `24` | Number of frames to generate | +| `video-generator-fps` | `8` | Frames per second for output video | +| `video-generator-width` | `320` | Output video width in pixels | +| `video-generator-height` | `576` | Output video height in pixels | +| `video-generator-gpu-layers` | `15` | Number of layers to offload to GPU | +| `video-generator-batch-size` | `1` | Batch size for generation | + +### BotModels Service Settings + +| Key | Default | Description | +|-----|---------|-------------| +| `botmodels-enabled` | `false` | Enable/disable botmodels integration | +| `botmodels-host` | `0.0.0.0` | Host address for botmodels service | +| `botmodels-port` | `8085` | Port for botmodels service | +| `botmodels-api-key` | - | API key for authentication with botmodels | +| `botmodels-https` | `false` | Use HTTPS for connection to botmodels | + +## Example config.csv + +```csv +key,value +image-generator-model,../../../../data/diffusion/sd_turbo_f16.gguf +image-generator-steps,4 
+image-generator-width,512 +image-generator-height,512 +image-generator-gpu-layers,20 +image-generator-batch-size,1 +video-generator-model,../../../../data/diffusion/zeroscope_v2_576w +video-generator-frames,24 +video-generator-fps,8 +video-generator-width,320 +video-generator-height,576 +video-generator-gpu-layers,15 +video-generator-batch-size,1 +botmodels-enabled,true +botmodels-host,0.0.0.0 +botmodels-port,8085 +botmodels-api-key,your-secret-key +botmodels-https,false +``` + +## BASIC Keywords + +Once configured, the following keywords become available in BASIC scripts: + +### IMAGE + +Generate an image from a text prompt. + +```basic +file = IMAGE "a cute cat playing with yarn" +SEND FILE TO user, file +``` + +### VIDEO + +Generate a video from a text prompt. + +```basic +file = VIDEO "a rocket launching into space" +SEND FILE TO user, file +``` + +### AUDIO + +Generate speech audio from text. + +```basic +file = AUDIO "Hello, welcome to our service!" +SEND FILE TO user, file +``` + +### SEE + +Get a caption/description of an image or video file. + +```basic +caption = SEE "/path/to/image.jpg" +TALK caption + +// Also works with video files +description = SEE "/path/to/video.mp4" +TALK description +``` + +## Starting BotModels Service + +Before using multimodal features, start the botmodels service: + +```bash +cd botmodels +python -m uvicorn src.main:app --host 0.0.0.0 --port 8085 +``` + +Or with HTTPS: + +```bash +python -m uvicorn src.main:app --host 0.0.0.0 --port 8085 --ssl-keyfile key.pem --ssl-certfile cert.pem +``` + +## API Endpoints (BotModels) + +The botmodels service exposes these REST endpoints: + +| Endpoint | Method | Description | +|----------|--------|-------------| +| `/api/image/generate` | POST | Generate image from prompt | +| `/api/video/generate` | POST | Generate video from prompt | +| `/api/speech/generate` | POST | Generate speech from text | +| `/api/speech/totext` | POST | Convert audio to text | +| `/api/vision/describe` | POST | Get description of an image | +| `/api/vision/describe_video` | POST | Get description of a video | +| `/api/vision/vqa` | POST | Visual question answering | +| `/api/health` | GET | Health check | + +All endpoints require the `X-API-Key` header for authentication. + +## Architecture + +``` +┌─────────────┐ HTTPS ┌─────────────┐ +│ botserver │ ────────────▶ │ botmodels │ +│ (Rust) │ │ (Python) │ +└─────────────┘ └─────────────┘ + │ │ + │ BASIC Keywords │ AI Models + │ - IMAGE │ - Stable Diffusion + │ - VIDEO │ - Zeroscope + │ - AUDIO │ - TTS/Whisper + │ - SEE │ - BLIP2 + ▼ ▼ +┌─────────────┐ ┌─────────────┐ +│ config │ │ outputs │ +│ .csv │ │ (files) │ +└─────────────┘ └─────────────┘ +``` + +## Troubleshooting + +### "BotModels is not enabled" + +Set `botmodels-enabled=true` in your config.csv. + +### Connection refused + +1. Ensure botmodels service is running +2. Check host/port configuration +3. Verify firewall settings + +### Authentication failed + +Ensure `botmodels-api-key` in config.csv matches `API_KEY` environment variable in botmodels. + +### Model not found + +Verify model paths are correct and models are downloaded to the expected locations. + +## Security Notes + +1. Always use HTTPS in production (`botmodels-https=true`) +2. Use strong, unique API keys +3. Restrict network access to botmodels service +4. 
Consider running botmodels on a separate GPU server \ No newline at end of file diff --git a/src/basic/keywords/mod.rs b/src/basic/keywords/mod.rs index eba23d68..662855d7 100644 --- a/src/basic/keywords/mod.rs +++ b/src/basic/keywords/mod.rs @@ -15,6 +15,7 @@ pub mod get; pub mod hear_talk; pub mod last; pub mod llm_keyword; +pub mod multimodal; pub mod on; pub mod print; pub mod remember; diff --git a/src/basic/keywords/multimodal.rs b/src/basic/keywords/multimodal.rs new file mode 100644 index 00000000..9ba5cb9d --- /dev/null +++ b/src/basic/keywords/multimodal.rs @@ -0,0 +1,323 @@ +//! Multimodal keywords for image, video, audio generation and vision/captioning +//! +//! Provides BASIC keywords: +//! - IMAGE "prompt" -> generates image, returns file URL +//! - VIDEO "prompt" -> generates video, returns file URL +//! - AUDIO "text" -> generates speech audio, returns file URL +//! - SEE file -> gets caption/description of image or video + +use crate::multimodal::BotModelsClient; +use crate::shared::models::UserSession; +use crate::shared::state::AppState; +use log::{error, trace}; +use rhai::{Dynamic, Engine}; +use std::sync::Arc; +use std::time::Duration; + +/// Register all multimodal keywords +pub fn register_multimodal_keywords(state: Arc, user: UserSession, engine: &mut Engine) { + image_keyword(state.clone(), user.clone(), engine); + video_keyword(state.clone(), user.clone(), engine); + audio_keyword(state.clone(), user.clone(), engine); + see_keyword(state.clone(), user.clone(), engine); +} + +/// IMAGE "prompt" - Generate an image from text prompt +/// Returns the URL/path to the generated image file +pub fn image_keyword(state: Arc, user: UserSession, engine: &mut Engine) { + let state_clone = Arc::clone(&state); + let user_clone = user.clone(); + + engine + .register_custom_syntax(&["IMAGE", "$expr$"], false, move |context, inputs| { + let prompt = context.eval_expression_tree(&inputs[0])?.to_string(); + + trace!("IMAGE keyword: generating image for prompt: {}", prompt); + + let state_for_thread = Arc::clone(&state_clone); + let bot_id = user_clone.bot_id; + + let (tx, rx) = std::sync::mpsc::channel(); + + std::thread::spawn(move || { + let rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(2) + .enable_all() + .build(); + + let send_err = if let Ok(rt) = rt { + let result = rt.block_on(async move { + execute_image_generation(state_for_thread, bot_id, prompt).await + }); + tx.send(result).err() + } else { + tx.send(Err("Failed to build tokio runtime".into())).err() + }; + + if send_err.is_some() { + error!("Failed to send IMAGE result"); + } + }); + + match rx.recv_timeout(Duration::from_secs(300)) { + Ok(Ok(result)) => Ok(Dynamic::from(result)), + Ok(Err(e)) => Err(Box::new(rhai::EvalAltResult::ErrorRuntime( + e.to_string().into(), + rhai::Position::NONE, + ))), + Err(std::sync::mpsc::RecvTimeoutError::Timeout) => { + Err(Box::new(rhai::EvalAltResult::ErrorRuntime( + "Image generation timed out".into(), + rhai::Position::NONE, + ))) + } + Err(e) => Err(Box::new(rhai::EvalAltResult::ErrorRuntime( + format!("IMAGE thread failed: {}", e).into(), + rhai::Position::NONE, + ))), + } + }) + .unwrap(); +} + +async fn execute_image_generation( + state: Arc, + bot_id: uuid::Uuid, + prompt: String, +) -> Result> { + let client = BotModelsClient::from_state(&state, &bot_id); + + if !client.is_enabled() { + return Err("BotModels is not enabled. 
Set botmodels-enabled=true in config.csv".into()); + } + + client.generate_image(&prompt).await +} + +/// VIDEO "prompt" - Generate a video from text prompt +/// Returns the URL/path to the generated video file +pub fn video_keyword(state: Arc, user: UserSession, engine: &mut Engine) { + let state_clone = Arc::clone(&state); + let user_clone = user.clone(); + + engine + .register_custom_syntax(&["VIDEO", "$expr$"], false, move |context, inputs| { + let prompt = context.eval_expression_tree(&inputs[0])?.to_string(); + + trace!("VIDEO keyword: generating video for prompt: {}", prompt); + + let state_for_thread = Arc::clone(&state_clone); + let bot_id = user_clone.bot_id; + + let (tx, rx) = std::sync::mpsc::channel(); + + std::thread::spawn(move || { + let rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(2) + .enable_all() + .build(); + + let send_err = if let Ok(rt) = rt { + let result = rt.block_on(async move { + execute_video_generation(state_for_thread, bot_id, prompt).await + }); + tx.send(result).err() + } else { + tx.send(Err("Failed to build tokio runtime".into())).err() + }; + + if send_err.is_some() { + error!("Failed to send VIDEO result"); + } + }); + + // Video generation can take longer + match rx.recv_timeout(Duration::from_secs(600)) { + Ok(Ok(result)) => Ok(Dynamic::from(result)), + Ok(Err(e)) => Err(Box::new(rhai::EvalAltResult::ErrorRuntime( + e.to_string().into(), + rhai::Position::NONE, + ))), + Err(std::sync::mpsc::RecvTimeoutError::Timeout) => { + Err(Box::new(rhai::EvalAltResult::ErrorRuntime( + "Video generation timed out".into(), + rhai::Position::NONE, + ))) + } + Err(e) => Err(Box::new(rhai::EvalAltResult::ErrorRuntime( + format!("VIDEO thread failed: {}", e).into(), + rhai::Position::NONE, + ))), + } + }) + .unwrap(); +} + +async fn execute_video_generation( + state: Arc, + bot_id: uuid::Uuid, + prompt: String, +) -> Result> { + let client = BotModelsClient::from_state(&state, &bot_id); + + if !client.is_enabled() { + return Err("BotModels is not enabled. 
Set botmodels-enabled=true in config.csv".into()); + } + + client.generate_video(&prompt).await +} + +/// AUDIO "text" - Generate speech audio from text +/// Returns the URL/path to the generated audio file +pub fn audio_keyword(state: Arc, user: UserSession, engine: &mut Engine) { + let state_clone = Arc::clone(&state); + let user_clone = user.clone(); + + engine + .register_custom_syntax(&["AUDIO", "$expr$"], false, move |context, inputs| { + let text = context.eval_expression_tree(&inputs[0])?.to_string(); + + trace!("AUDIO keyword: generating speech for text: {}", text); + + let state_for_thread = Arc::clone(&state_clone); + let bot_id = user_clone.bot_id; + + let (tx, rx) = std::sync::mpsc::channel(); + + std::thread::spawn(move || { + let rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(2) + .enable_all() + .build(); + + let send_err = if let Ok(rt) = rt { + let result = rt.block_on(async move { + execute_audio_generation(state_for_thread, bot_id, text).await + }); + tx.send(result).err() + } else { + tx.send(Err("Failed to build tokio runtime".into())).err() + }; + + if send_err.is_some() { + error!("Failed to send AUDIO result"); + } + }); + + match rx.recv_timeout(Duration::from_secs(120)) { + Ok(Ok(result)) => Ok(Dynamic::from(result)), + Ok(Err(e)) => Err(Box::new(rhai::EvalAltResult::ErrorRuntime( + e.to_string().into(), + rhai::Position::NONE, + ))), + Err(std::sync::mpsc::RecvTimeoutError::Timeout) => { + Err(Box::new(rhai::EvalAltResult::ErrorRuntime( + "Audio generation timed out".into(), + rhai::Position::NONE, + ))) + } + Err(e) => Err(Box::new(rhai::EvalAltResult::ErrorRuntime( + format!("AUDIO thread failed: {}", e).into(), + rhai::Position::NONE, + ))), + } + }) + .unwrap(); +} + +async fn execute_audio_generation( + state: Arc, + bot_id: uuid::Uuid, + text: String, +) -> Result> { + let client = BotModelsClient::from_state(&state, &bot_id); + + if !client.is_enabled() { + return Err("BotModels is not enabled. 
Set botmodels-enabled=true in config.csv".into()); + } + + client.generate_audio(&text, None, None).await +} + +/// SEE file - Get caption/description of an image or video file +/// Returns the text description of the visual content +pub fn see_keyword(state: Arc, user: UserSession, engine: &mut Engine) { + let state_clone = Arc::clone(&state); + let user_clone = user.clone(); + + engine + .register_custom_syntax(&["SEE", "$expr$"], false, move |context, inputs| { + let file_path = context.eval_expression_tree(&inputs[0])?.to_string(); + + trace!("SEE keyword: getting caption for file: {}", file_path); + + let state_for_thread = Arc::clone(&state_clone); + let bot_id = user_clone.bot_id; + + let (tx, rx) = std::sync::mpsc::channel(); + + std::thread::spawn(move || { + let rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(2) + .enable_all() + .build(); + + let send_err = if let Ok(rt) = rt { + let result = rt.block_on(async move { + execute_see_caption(state_for_thread, bot_id, file_path).await + }); + tx.send(result).err() + } else { + tx.send(Err("Failed to build tokio runtime".into())).err() + }; + + if send_err.is_some() { + error!("Failed to send SEE result"); + } + }); + + match rx.recv_timeout(Duration::from_secs(60)) { + Ok(Ok(result)) => Ok(Dynamic::from(result)), + Ok(Err(e)) => Err(Box::new(rhai::EvalAltResult::ErrorRuntime( + e.to_string().into(), + rhai::Position::NONE, + ))), + Err(std::sync::mpsc::RecvTimeoutError::Timeout) => { + Err(Box::new(rhai::EvalAltResult::ErrorRuntime( + "Vision/caption timed out".into(), + rhai::Position::NONE, + ))) + } + Err(e) => Err(Box::new(rhai::EvalAltResult::ErrorRuntime( + format!("SEE thread failed: {}", e).into(), + rhai::Position::NONE, + ))), + } + }) + .unwrap(); +} + +async fn execute_see_caption( + state: Arc, + bot_id: uuid::Uuid, + file_path: String, +) -> Result> { + let client = BotModelsClient::from_state(&state, &bot_id); + + if !client.is_enabled() { + return Err("BotModels is not enabled. 
Set botmodels-enabled=true in config.csv".into()); + } + + // Determine if it's a video or image based on extension + let lower_path = file_path.to_lowercase(); + if lower_path.ends_with(".mp4") + || lower_path.ends_with(".avi") + || lower_path.ends_with(".mov") + || lower_path.ends_with(".webm") + || lower_path.ends_with(".mkv") + { + client.describe_video(&file_path).await + } else { + client.describe_image(&file_path).await + } +} diff --git a/src/basic/mod.rs b/src/basic/mod.rs index 776467a3..24908ace 100644 --- a/src/basic/mod.rs +++ b/src/basic/mod.rs @@ -23,6 +23,7 @@ use self::keywords::format::format_keyword; use self::keywords::get::get_keyword; use self::keywords::hear_talk::{hear_keyword, talk_keyword}; use self::keywords::last::last_keyword; +use self::keywords::multimodal::register_multimodal_keywords; use self::keywords::remember::remember_keyword; use self::keywords::save_from_unstructured::save_from_unstructured_keyword; use self::keywords::send_mail::send_mail_keyword; @@ -92,6 +93,10 @@ impl ScriptService { &mut engine, ); + // Register multimodal keywords (IMAGE, VIDEO, AUDIO, SEE) + // These connect to botmodels for image/video/audio generation and vision/captioning + register_multimodal_keywords(state.clone(), user.clone(), &mut engine); + ScriptService { engine } } fn preprocess_basic_script(&self, script: &str) -> String { @@ -158,6 +163,11 @@ impl ScriptService { "SET USER", "GET BOT MEMORY", "SET BOT MEMORY", + "IMAGE", + "VIDEO", + "AUDIO", + "SEE", + "SEND FILE", ]; let is_basic_command = basic_commands.iter().any(|&cmd| trimmed.starts_with(cmd)); let is_control_flow = trimmed.starts_with("IF") diff --git a/src/lib.rs b/src/lib.rs index 30f1140b..81f8f660 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,7 @@ // Core modules (always included) pub mod basic; pub mod core; +pub mod multimodal; pub mod security; pub mod web; diff --git a/src/multimodal/mod.rs b/src/multimodal/mod.rs new file mode 100644 index 00000000..4a4cce83 --- /dev/null +++ b/src/multimodal/mod.rs @@ -0,0 +1,652 @@ +//! Multimodal module for botmodels integration +//! 
Provides client for image, video, audio generation and vision/captioning + +use crate::config::ConfigManager; +use crate::shared::state::AppState; +use log::{error, info, trace}; +use reqwest::Client; +use serde::{Deserialize, Serialize}; +use std::sync::Arc; +use uuid::Uuid; + +/// Configuration for botmodels connection +#[derive(Debug, Clone)] +pub struct BotModelsConfig { + pub enabled: bool, + pub host: String, + pub port: u16, + pub api_key: String, + pub use_https: bool, +} + +impl BotModelsConfig { + pub fn from_database(config_manager: &ConfigManager, bot_id: &Uuid) -> Self { + let enabled = config_manager + .get_config(bot_id, "botmodels-enabled", Some("false")) + .unwrap_or_default() + .to_lowercase() + == "true"; + + let host = config_manager + .get_config(bot_id, "botmodels-host", Some("0.0.0.0")) + .unwrap_or_else(|_| "0.0.0.0".to_string()); + + let port = config_manager + .get_config(bot_id, "botmodels-port", Some("8085")) + .unwrap_or_else(|_| "8085".to_string()) + .parse() + .unwrap_or(8085); + + let api_key = config_manager + .get_config(bot_id, "botmodels-api-key", Some("")) + .unwrap_or_default(); + + let use_https = config_manager + .get_config(bot_id, "botmodels-https", Some("false")) + .unwrap_or_default() + .to_lowercase() + == "true"; + + Self { + enabled, + host, + port, + api_key, + use_https, + } + } + + pub fn base_url(&self) -> String { + let protocol = if self.use_https { "https" } else { "http" }; + format!("{}://{}:{}", protocol, self.host, self.port) + } +} + +/// Image generation configuration +#[derive(Debug, Clone)] +pub struct ImageGeneratorConfig { + pub model: String, + pub steps: u32, + pub width: u32, + pub height: u32, + pub gpu_layers: u32, + pub batch_size: u32, +} + +impl ImageGeneratorConfig { + pub fn from_database(config_manager: &ConfigManager, bot_id: &Uuid) -> Self { + Self { + model: config_manager + .get_config(bot_id, "image-generator-model", None) + .unwrap_or_default(), + steps: config_manager + .get_config(bot_id, "image-generator-steps", Some("4")) + .unwrap_or_else(|_| "4".to_string()) + .parse() + .unwrap_or(4), + width: config_manager + .get_config(bot_id, "image-generator-width", Some("512")) + .unwrap_or_else(|_| "512".to_string()) + .parse() + .unwrap_or(512), + height: config_manager + .get_config(bot_id, "image-generator-height", Some("512")) + .unwrap_or_else(|_| "512".to_string()) + .parse() + .unwrap_or(512), + gpu_layers: config_manager + .get_config(bot_id, "image-generator-gpu-layers", Some("20")) + .unwrap_or_else(|_| "20".to_string()) + .parse() + .unwrap_or(20), + batch_size: config_manager + .get_config(bot_id, "image-generator-batch-size", Some("1")) + .unwrap_or_else(|_| "1".to_string()) + .parse() + .unwrap_or(1), + } + } +} + +/// Video generation configuration +#[derive(Debug, Clone)] +pub struct VideoGeneratorConfig { + pub model: String, + pub frames: u32, + pub fps: u32, + pub width: u32, + pub height: u32, + pub gpu_layers: u32, + pub batch_size: u32, +} + +impl VideoGeneratorConfig { + pub fn from_database(config_manager: &ConfigManager, bot_id: &Uuid) -> Self { + Self { + model: config_manager + .get_config(bot_id, "video-generator-model", None) + .unwrap_or_default(), + frames: config_manager + .get_config(bot_id, "video-generator-frames", Some("24")) + .unwrap_or_else(|_| "24".to_string()) + .parse() + .unwrap_or(24), + fps: config_manager + .get_config(bot_id, "video-generator-fps", Some("8")) + .unwrap_or_else(|_| "8".to_string()) + .parse() + .unwrap_or(8), + width: config_manager + 
.get_config(bot_id, "video-generator-width", Some("320"))
+                .unwrap_or_else(|_| "320".to_string())
+                .parse()
+                .unwrap_or(320),
+            height: config_manager
+                .get_config(bot_id, "video-generator-height", Some("576"))
+                .unwrap_or_else(|_| "576".to_string())
+                .parse()
+                .unwrap_or(576),
+            gpu_layers: config_manager
+                .get_config(bot_id, "video-generator-gpu-layers", Some("15"))
+                .unwrap_or_else(|_| "15".to_string())
+                .parse()
+                .unwrap_or(15),
+            batch_size: config_manager
+                .get_config(bot_id, "video-generator-batch-size", Some("1"))
+                .unwrap_or_else(|_| "1".to_string())
+                .parse()
+                .unwrap_or(1),
+        }
+    }
+}
+
+// API Request/Response types
+
+#[derive(Debug, Serialize)]
+pub struct ImageGenerateRequest {
+    pub prompt: String,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub steps: Option<u32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub width: Option<u32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub height: Option<u32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub guidance_scale: Option<f32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub seed: Option<i64>,
+}
+
+#[derive(Debug, Serialize)]
+pub struct VideoGenerateRequest {
+    pub prompt: String,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub num_frames: Option<u32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub fps: Option<u32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub steps: Option<u32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub seed: Option<i64>,
+}
+
+#[derive(Debug, Serialize)]
+pub struct SpeechGenerateRequest {
+    pub prompt: String,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub voice: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub language: Option<String>,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct GenerationResponse {
+    pub status: String,
+    pub file_path: Option<String>,
+    pub generation_time: Option<f64>,
+    pub error: Option<String>,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct DescribeResponse {
+    pub description: String,
+    pub confidence: Option<f32>,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct VideoDescribeResponse {
+    pub description: String,
+    pub frame_count: Option<u32>,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct SpeechToTextResponse {
+    pub text: String,
+    pub language: Option<String>,
+    pub confidence: Option<f32>,
+}
+
+/// BotModels client for multimodal operations
+pub struct BotModelsClient {
+    client: Client,
+    config: BotModelsConfig,
+    image_config: ImageGeneratorConfig,
+    video_config: VideoGeneratorConfig,
+}
+
+impl BotModelsClient {
+    pub fn new(
+        config: BotModelsConfig,
+        image_config: ImageGeneratorConfig,
+        video_config: VideoGeneratorConfig,
+    ) -> Self {
+        let client = Client::builder()
+            .danger_accept_invalid_certs(true) // For self-signed certs in dev
+            .timeout(std::time::Duration::from_secs(300)) // 5 min timeout for generation
+            .build()
+            .unwrap_or_else(|_| Client::new());
+
+        Self {
+            client,
+            config,
+            image_config,
+            video_config,
+        }
+    }
+
+    pub fn from_state(state: &AppState, bot_id: &Uuid) -> Self {
+        let config_manager = ConfigManager::new(state.conn.clone());
+        let config = BotModelsConfig::from_database(&config_manager, bot_id);
+        let image_config = ImageGeneratorConfig::from_database(&config_manager, bot_id);
+        let video_config = VideoGeneratorConfig::from_database(&config_manager, bot_id);
+        Self::new(config, image_config, video_config)
+    }
+
+    pub fn is_enabled(&self) -> bool {
+        self.config.enabled
+    }
+
+    /// Generate an image from a text prompt
+    pub async fn generate_image(
+        &self,
+        prompt: &str,
+    ) -> 
Result<String, Box<dyn std::error::Error + Send + Sync>> {
+        if !self.config.enabled {
+            return Err("BotModels is not enabled".into());
+        }
+
+        let url = format!("{}/api/image/generate", self.config.base_url());
+        trace!("Generating image at {}: {}", url, prompt);
+
+        let request = ImageGenerateRequest {
+            prompt: prompt.to_string(),
+            steps: Some(self.image_config.steps),
+            width: Some(self.image_config.width),
+            height: Some(self.image_config.height),
+            guidance_scale: Some(7.5),
+            seed: None,
+        };
+
+        let response = self
+            .client
+            .post(&url)
+            .header("X-API-Key", &self.config.api_key)
+            .json(&request)
+            .send()
+            .await?;
+
+        if !response.status().is_success() {
+            let error_text = response.text().await.unwrap_or_default();
+            error!("Image generation failed: {}", error_text);
+            return Err(format!("Image generation failed: {}", error_text).into());
+        }
+
+        let result: GenerationResponse = response.json().await?;
+
+        if result.status == "completed" {
+            if let Some(file_path) = result.file_path {
+                let full_url = format!("{}{}", self.config.base_url(), file_path);
+                info!("Image generated: {}", full_url);
+                return Ok(full_url);
+            }
+        }
+
+        Err(result
+            .error
+            .unwrap_or_else(|| "Unknown error".to_string())
+            .into())
+    }
+
+    /// Generate a video from a text prompt
+    pub async fn generate_video(
+        &self,
+        prompt: &str,
+    ) -> Result<String, Box<dyn std::error::Error + Send + Sync>> {
+        if !self.config.enabled {
+            return Err("BotModels is not enabled".into());
+        }
+
+        let url = format!("{}/api/video/generate", self.config.base_url());
+        trace!("Generating video at {}: {}", url, prompt);
+
+        let request = VideoGenerateRequest {
+            prompt: prompt.to_string(),
+            num_frames: Some(self.video_config.frames),
+            fps: Some(self.video_config.fps),
+            steps: Some(50),
+            seed: None,
+        };
+
+        let response = self
+            .client
+            .post(&url)
+            .header("X-API-Key", &self.config.api_key)
+            .json(&request)
+            .send()
+            .await?;
+
+        if !response.status().is_success() {
+            let error_text = response.text().await.unwrap_or_default();
+            error!("Video generation failed: {}", error_text);
+            return Err(format!("Video generation failed: {}", error_text).into());
+        }
+
+        let result: GenerationResponse = response.json().await?;
+
+        if result.status == "completed" {
+            if let Some(file_path) = result.file_path {
+                let full_url = format!("{}{}", self.config.base_url(), file_path);
+                info!("Video generated: {}", full_url);
+                return Ok(full_url);
+            }
+        }
+
+        Err(result
+            .error
+            .unwrap_or_else(|| "Unknown error".to_string())
+            .into())
+    }
+
+    /// Generate audio/speech from text
+    pub async fn generate_audio(
+        &self,
+        text: &str,
+        voice: Option<&str>,
+        language: Option<&str>,
+    ) -> Result<String, Box<dyn std::error::Error + Send + Sync>> {
+        if !self.config.enabled {
+            return Err("BotModels is not enabled".into());
+        }
+
+        let url = format!("{}/api/speech/generate", self.config.base_url());
+        trace!("Generating audio at {}: {}", url, text);
+
+        let request = SpeechGenerateRequest {
+            prompt: text.to_string(),
+            voice: voice.map(String::from),
+            language: language.map(String::from),
+        };
+
+        let response = self
+            .client
+            .post(&url)
+            .header("X-API-Key", &self.config.api_key)
+            .json(&request)
+            .send()
+            .await?;
+
+        if !response.status().is_success() {
+            let error_text = response.text().await.unwrap_or_default();
+            error!("Audio generation failed: {}", error_text);
+            return Err(format!("Audio generation failed: {}", error_text).into());
+        }
+
+        let result: GenerationResponse = response.json().await?;
+
+        if result.status == "completed" {
+            if let Some(file_path) = result.file_path {
+                let full_url = format!("{}{}", self.config.base_url(), file_path);
+                
info!("Audio generated: {}", full_url); + return Ok(full_url); + } + } + + Err(result + .error + .unwrap_or_else(|| "Unknown error".to_string()) + .into()) + } + + /// Get caption/description for an image + pub async fn describe_image( + &self, + image_url_or_path: &str, + ) -> Result> { + if !self.config.enabled { + return Err("BotModels is not enabled".into()); + } + + let url = format!("{}/api/vision/describe", self.config.base_url()); + trace!("Describing image at {}: {}", url, image_url_or_path); + + // If it's a URL, download the image first + let image_data = if image_url_or_path.starts_with("http") { + let response = self.client.get(image_url_or_path).send().await?; + response.bytes().await?.to_vec() + } else { + tokio::fs::read(image_url_or_path).await? + }; + + let form = reqwest::multipart::Form::new().part( + "file", + reqwest::multipart::Part::bytes(image_data) + .file_name("image.png") + .mime_str("image/png")?, + ); + + let response = self + .client + .post(&url) + .header("X-API-Key", &self.config.api_key) + .multipart(form) + .send() + .await?; + + if !response.status().is_success() { + let error_text = response.text().await.unwrap_or_default(); + error!("Image description failed: {}", error_text); + return Err(format!("Image description failed: {}", error_text).into()); + } + + let result: DescribeResponse = response.json().await?; + info!("Image described: {}", result.description); + Ok(result.description) + } + + /// Get caption/description for a video + pub async fn describe_video( + &self, + video_url_or_path: &str, + ) -> Result> { + if !self.config.enabled { + return Err("BotModels is not enabled".into()); + } + + let url = format!("{}/api/vision/describe_video", self.config.base_url()); + trace!("Describing video at {}: {}", url, video_url_or_path); + + let video_data = if video_url_or_path.starts_with("http") { + let response = self.client.get(video_url_or_path).send().await?; + response.bytes().await?.to_vec() + } else { + tokio::fs::read(video_url_or_path).await? + }; + + let form = reqwest::multipart::Form::new().part( + "file", + reqwest::multipart::Part::bytes(video_data) + .file_name("video.mp4") + .mime_str("video/mp4")?, + ); + + let response = self + .client + .post(&url) + .header("X-API-Key", &self.config.api_key) + .multipart(form) + .send() + .await?; + + if !response.status().is_success() { + let error_text = response.text().await.unwrap_or_default(); + error!("Video description failed: {}", error_text); + return Err(format!("Video description failed: {}", error_text).into()); + } + + let result: VideoDescribeResponse = response.json().await?; + info!("Video described: {}", result.description); + Ok(result.description) + } + + /// Convert speech to text + pub async fn speech_to_text( + &self, + audio_url_or_path: &str, + ) -> Result> { + if !self.config.enabled { + return Err("BotModels is not enabled".into()); + } + + let url = format!("{}/api/speech/totext", self.config.base_url()); + trace!( + "Converting speech to text at {}: {}", + url, + audio_url_or_path + ); + + let audio_data = if audio_url_or_path.starts_with("http") { + let response = self.client.get(audio_url_or_path).send().await?; + response.bytes().await?.to_vec() + } else { + tokio::fs::read(audio_url_or_path).await? 
+ }; + + let form = reqwest::multipart::Form::new().part( + "file", + reqwest::multipart::Part::bytes(audio_data) + .file_name("audio.wav") + .mime_str("audio/wav")?, + ); + + let response = self + .client + .post(&url) + .header("X-API-Key", &self.config.api_key) + .multipart(form) + .send() + .await?; + + if !response.status().is_success() { + let error_text = response.text().await.unwrap_or_default(); + error!("Speech to text failed: {}", error_text); + return Err(format!("Speech to text failed: {}", error_text).into()); + } + + let result: SpeechToTextResponse = response.json().await?; + info!("Speech converted: {}", result.text); + Ok(result.text) + } + + /// Check if botmodels server is healthy + pub async fn health_check(&self) -> bool { + if !self.config.enabled { + return false; + } + + let url = format!("{}/api/health", self.config.base_url()); + match self.client.get(&url).send().await { + Ok(response) => response.status().is_success(), + Err(_) => false, + } + } + + /// Download generated file to local path + pub async fn download_file( + &self, + url: &str, + local_path: &str, + ) -> Result<(), Box> { + let response = self.client.get(url).send().await?; + let bytes = response.bytes().await?; + tokio::fs::write(local_path, bytes).await?; + Ok(()) + } +} + +/// Ensure botmodels server is running (similar to llama.cpp startup) +pub async fn ensure_botmodels_running( + app_state: Arc, +) -> Result<(), Box> { + use crate::shared::models::schema::bots::dsl::*; + use diesel::prelude::*; + + let config_values = { + let conn_arc = app_state.conn.clone(); + let default_bot_id = tokio::task::spawn_blocking(move || { + let mut conn = conn_arc.get().unwrap(); + bots.filter(name.eq("default")) + .select(id) + .first::(&mut *conn) + .unwrap_or_else(|_| uuid::Uuid::nil()) + }) + .await?; + + let config_manager = ConfigManager::new(app_state.conn.clone()); + let config = BotModelsConfig::from_database(&config_manager, &default_bot_id); + config + }; + + if !config_values.enabled { + info!("BotModels is disabled, skipping startup"); + return Ok(()); + } + + info!("Checking BotModels server status..."); + info!(" Host: {}", config_values.host); + info!(" Port: {}", config_values.port); + + let client = BotModelsClient::new( + config_values.clone(), + ImageGeneratorConfig { + model: String::new(), + steps: 4, + width: 512, + height: 512, + gpu_layers: 20, + batch_size: 1, + }, + VideoGeneratorConfig { + model: String::new(), + frames: 24, + fps: 8, + width: 320, + height: 576, + gpu_layers: 15, + batch_size: 1, + }, + ); + + // Check if already running + if client.health_check().await { + info!("BotModels server is already running and healthy"); + return Ok(()); + } + + info!("BotModels server not responding, it should be started externally"); + info!( + "Start botmodels with: cd botmodels && python -m uvicorn src.main:app --host {} --port {}", + config_values.host, config_values.port + ); + + Ok(()) +} diff --git a/templates/default.gbai/default.gbot/config.csv b/templates/default.gbai/default.gbot/config.csv index 0d473420..135788d1 100644 --- a/templates/default.gbai/default.gbot/config.csv +++ b/templates/default.gbai/default.gbot/config.csv @@ -49,3 +49,25 @@ custom-password, website-expires,1d website-max-depth,3 website-max-pages,100 + + +image-generator-model,../../../../data/diffusion/sd_turbo_f16.gguf +image-generator-steps,4 +image-generator-width,512 +image-generator-height,512 +image-generator-gpu-layers,20 +image-generator-batch-size,1 + 
+video-generator-model,../../../../data/diffusion/zeroscope_v2_576w +video-generator-frames,24 +video-generator-fps,8 +video-generator-width,320 +video-generator-height,576 +video-generator-gpu-layers,15 +video-generator-batch-size,1 + +botmodels-enabled,true +botmodels-host,0.0.0.0 +botmodels-port,8085 + +default-generator,all
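
For reference, below is a minimal sketch of driving the new client directly from Rust (for example from an integration test) rather than through the BASIC keywords. It is not part of the patch: it assumes the crate is importable as `botserver`, that the botmodels service is reachable with the host/port/API key shown in the example config above, and it only uses items introduced in `src/multimodal/mod.rs` (`BotModelsConfig`, `ImageGeneratorConfig`, `VideoGeneratorConfig`, `BotModelsClient`).

```rust
use botserver::multimodal::{
    BotModelsClient, BotModelsConfig, ImageGeneratorConfig, VideoGeneratorConfig,
};

#[tokio::main]
async fn main() {
    // Connection settings; the values mirror the example config.csv and are assumptions.
    let config = BotModelsConfig {
        enabled: true,
        host: "127.0.0.1".to_string(),
        port: 8085,
        api_key: "your-secret-key".to_string(), // must match API_KEY on the botmodels side
        use_https: false,
    };

    // Generation defaults, matching the documented config keys.
    let image = ImageGeneratorConfig {
        model: String::new(),
        steps: 4,
        width: 512,
        height: 512,
        gpu_layers: 20,
        batch_size: 1,
    };
    let video = VideoGeneratorConfig {
        model: String::new(),
        frames: 24,
        fps: 8,
        width: 320,
        height: 576,
        gpu_layers: 15,
        batch_size: 1,
    };

    let client = BotModelsClient::new(config, image, video);

    // /api/health must answer before generation calls can succeed.
    if !client.health_check().await {
        eprintln!("botmodels is not reachable; start it with uvicorn first");
        return;
    }

    // Returns the full URL of the generated file on the botmodels host.
    match client.generate_image("a cute cat playing with yarn").await {
        Ok(url) => println!("image available at {url}"),
        Err(e) => eprintln!("image generation failed: {e}"),
    }
}
```

Because `generate_image` (like `generate_video` and `generate_audio`) returns the full URL of the file on the botmodels host, the result can be handed straight to `SEND FILE TO` in a BASIC script or pulled locally with `BotModelsClient::download_file`.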