From a21292daa30a35d2a1fc3792f7b8b9d2d6aa7392 Mon Sep 17 00:00:00 2001 From: "Rodrigo Rodriguez (Pragmatismo)" Date: Sat, 29 Nov 2025 20:40:08 -0300 Subject: [PATCH] Add multimodal module for botmodels integration Introduces IMAGE, VIDEO, AUDIO, and SEE keywords for BASIC scripts that connect to the botmodels service for AI-powered media generation and vision/captioning capabilities. - Add BotModelsClient for HTTP communication with botmodels service - Implement BASIC keywords: IMAGE, VIDEO, AUDIO (generation), SEE (captioning) - Support configuration via config.csv for models --- docs/multimodal-config.md | 191 +++++ src/basic/keywords/mod.rs | 1 + src/basic/keywords/multimodal.rs | 323 +++++++++ src/basic/mod.rs | 10 + src/lib.rs | 1 + src/multimodal/mod.rs | 652 ++++++++++++++++++ .../default.gbai/default.gbot/config.csv | 22 + 7 files changed, 1200 insertions(+) create mode 100644 docs/multimodal-config.md create mode 100644 src/basic/keywords/multimodal.rs create mode 100644 src/multimodal/mod.rs diff --git a/docs/multimodal-config.md b/docs/multimodal-config.md new file mode 100644 index 00000000..a95cc6b9 --- /dev/null +++ b/docs/multimodal-config.md @@ -0,0 +1,191 @@ +# Multimodal Configuration Guide + +This document describes how to configure botserver to use the botmodels service for image, video, audio generation, and vision/captioning capabilities. + +## Overview + +The multimodal feature connects botserver to botmodels - a Python-based service similar to llama.cpp but for multimodal AI tasks. This enables BASIC scripts to generate images, videos, audio, and analyze visual content. + +## Configuration Keys + +Add the following configuration to your bot's `config.csv` file: + +### Image Generator Settings + +| Key | Default | Description | +|-----|---------|-------------| +| `image-generator-model` | - | Path to the image generation model (e.g., `../../../../data/diffusion/sd_turbo_f16.gguf`) | +| `image-generator-steps` | `4` | Number of inference steps for image generation | +| `image-generator-width` | `512` | Output image width in pixels | +| `image-generator-height` | `512` | Output image height in pixels | +| `image-generator-gpu-layers` | `20` | Number of layers to offload to GPU | +| `image-generator-batch-size` | `1` | Batch size for generation | + +### Video Generator Settings + +| Key | Default | Description | +|-----|---------|-------------| +| `video-generator-model` | - | Path to the video generation model (e.g., `../../../../data/diffusion/zeroscope_v2_576w`) | +| `video-generator-frames` | `24` | Number of frames to generate | +| `video-generator-fps` | `8` | Frames per second for output video | +| `video-generator-width` | `320` | Output video width in pixels | +| `video-generator-height` | `576` | Output video height in pixels | +| `video-generator-gpu-layers` | `15` | Number of layers to offload to GPU | +| `video-generator-batch-size` | `1` | Batch size for generation | + +### BotModels Service Settings + +| Key | Default | Description | +|-----|---------|-------------| +| `botmodels-enabled` | `false` | Enable/disable botmodels integration | +| `botmodels-host` | `0.0.0.0` | Host address for botmodels service | +| `botmodels-port` | `8085` | Port for botmodels service | +| `botmodels-api-key` | - | API key for authentication with botmodels | +| `botmodels-https` | `false` | Use HTTPS for connection to botmodels | + +## Example config.csv + +```csv +key,value +image-generator-model,../../../../data/diffusion/sd_turbo_f16.gguf +image-generator-steps,4 
+image-generator-width,512 +image-generator-height,512 +image-generator-gpu-layers,20 +image-generator-batch-size,1 +video-generator-model,../../../../data/diffusion/zeroscope_v2_576w +video-generator-frames,24 +video-generator-fps,8 +video-generator-width,320 +video-generator-height,576 +video-generator-gpu-layers,15 +video-generator-batch-size,1 +botmodels-enabled,true +botmodels-host,0.0.0.0 +botmodels-port,8085 +botmodels-api-key,your-secret-key +botmodels-https,false +``` + +## BASIC Keywords + +Once configured, the following keywords become available in BASIC scripts: + +### IMAGE + +Generate an image from a text prompt. + +```basic +file = IMAGE "a cute cat playing with yarn" +SEND FILE TO user, file +``` + +### VIDEO + +Generate a video from a text prompt. + +```basic +file = VIDEO "a rocket launching into space" +SEND FILE TO user, file +``` + +### AUDIO + +Generate speech audio from text. + +```basic +file = AUDIO "Hello, welcome to our service!" +SEND FILE TO user, file +``` + +### SEE + +Get a caption/description of an image or video file. + +```basic +caption = SEE "/path/to/image.jpg" +TALK caption + +// Also works with video files +description = SEE "/path/to/video.mp4" +TALK description +``` + +## Starting BotModels Service + +Before using multimodal features, start the botmodels service: + +```bash +cd botmodels +python -m uvicorn src.main:app --host 0.0.0.0 --port 8085 +``` + +Or with HTTPS: + +```bash +python -m uvicorn src.main:app --host 0.0.0.0 --port 8085 --ssl-keyfile key.pem --ssl-certfile cert.pem +``` + +## API Endpoints (BotModels) + +The botmodels service exposes these REST endpoints: + +| Endpoint | Method | Description | +|----------|--------|-------------| +| `/api/image/generate` | POST | Generate image from prompt | +| `/api/video/generate` | POST | Generate video from prompt | +| `/api/speech/generate` | POST | Generate speech from text | +| `/api/speech/totext` | POST | Convert audio to text | +| `/api/vision/describe` | POST | Get description of an image | +| `/api/vision/describe_video` | POST | Get description of a video | +| `/api/vision/vqa` | POST | Visual question answering | +| `/api/health` | GET | Health check | + +All endpoints require the `X-API-Key` header for authentication. + +## Architecture + +``` +┌─────────────┐ HTTPS ┌─────────────┐ +│ botserver │ ────────────▶ │ botmodels │ +│ (Rust) │ │ (Python) │ +└─────────────┘ └─────────────┘ + │ │ + │ BASIC Keywords │ AI Models + │ - IMAGE │ - Stable Diffusion + │ - VIDEO │ - Zeroscope + │ - AUDIO │ - TTS/Whisper + │ - SEE │ - BLIP2 + ▼ ▼ +┌─────────────┐ ┌─────────────┐ +│ config │ │ outputs │ +│ .csv │ │ (files) │ +└─────────────┘ └─────────────┘ +``` + +## Troubleshooting + +### "BotModels is not enabled" + +Set `botmodels-enabled=true` in your config.csv. + +### Connection refused + +1. Ensure botmodels service is running +2. Check host/port configuration +3. Verify firewall settings + +### Authentication failed + +Ensure `botmodels-api-key` in config.csv matches `API_KEY` environment variable in botmodels. + +### Model not found + +Verify model paths are correct and models are downloaded to the expected locations. + +## Security Notes + +1. Always use HTTPS in production (`botmodels-https=true`) +2. Use strong, unique API keys +3. Restrict network access to botmodels service +4. 
Consider running botmodels on a separate GPU server \ No newline at end of file diff --git a/src/basic/keywords/mod.rs b/src/basic/keywords/mod.rs index eba23d68..662855d7 100644 --- a/src/basic/keywords/mod.rs +++ b/src/basic/keywords/mod.rs @@ -15,6 +15,7 @@ pub mod get; pub mod hear_talk; pub mod last; pub mod llm_keyword; +pub mod multimodal; pub mod on; pub mod print; pub mod remember; diff --git a/src/basic/keywords/multimodal.rs b/src/basic/keywords/multimodal.rs new file mode 100644 index 00000000..9ba5cb9d --- /dev/null +++ b/src/basic/keywords/multimodal.rs @@ -0,0 +1,323 @@ +//! Multimodal keywords for image, video, audio generation and vision/captioning +//! +//! Provides BASIC keywords: +//! - IMAGE "prompt" -> generates image, returns file URL +//! - VIDEO "prompt" -> generates video, returns file URL +//! - AUDIO "text" -> generates speech audio, returns file URL +//! - SEE file -> gets caption/description of image or video + +use crate::multimodal::BotModelsClient; +use crate::shared::models::UserSession; +use crate::shared::state::AppState; +use log::{error, trace}; +use rhai::{Dynamic, Engine}; +use std::sync::Arc; +use std::time::Duration; + +/// Register all multimodal keywords +pub fn register_multimodal_keywords(state: Arc, user: UserSession, engine: &mut Engine) { + image_keyword(state.clone(), user.clone(), engine); + video_keyword(state.clone(), user.clone(), engine); + audio_keyword(state.clone(), user.clone(), engine); + see_keyword(state.clone(), user.clone(), engine); +} + +/// IMAGE "prompt" - Generate an image from text prompt +/// Returns the URL/path to the generated image file +pub fn image_keyword(state: Arc, user: UserSession, engine: &mut Engine) { + let state_clone = Arc::clone(&state); + let user_clone = user.clone(); + + engine + .register_custom_syntax(&["IMAGE", "$expr$"], false, move |context, inputs| { + let prompt = context.eval_expression_tree(&inputs[0])?.to_string(); + + trace!("IMAGE keyword: generating image for prompt: {}", prompt); + + let state_for_thread = Arc::clone(&state_clone); + let bot_id = user_clone.bot_id; + + let (tx, rx) = std::sync::mpsc::channel(); + + std::thread::spawn(move || { + let rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(2) + .enable_all() + .build(); + + let send_err = if let Ok(rt) = rt { + let result = rt.block_on(async move { + execute_image_generation(state_for_thread, bot_id, prompt).await + }); + tx.send(result).err() + } else { + tx.send(Err("Failed to build tokio runtime".into())).err() + }; + + if send_err.is_some() { + error!("Failed to send IMAGE result"); + } + }); + + match rx.recv_timeout(Duration::from_secs(300)) { + Ok(Ok(result)) => Ok(Dynamic::from(result)), + Ok(Err(e)) => Err(Box::new(rhai::EvalAltResult::ErrorRuntime( + e.to_string().into(), + rhai::Position::NONE, + ))), + Err(std::sync::mpsc::RecvTimeoutError::Timeout) => { + Err(Box::new(rhai::EvalAltResult::ErrorRuntime( + "Image generation timed out".into(), + rhai::Position::NONE, + ))) + } + Err(e) => Err(Box::new(rhai::EvalAltResult::ErrorRuntime( + format!("IMAGE thread failed: {}", e).into(), + rhai::Position::NONE, + ))), + } + }) + .unwrap(); +} + +async fn execute_image_generation( + state: Arc, + bot_id: uuid::Uuid, + prompt: String, +) -> Result> { + let client = BotModelsClient::from_state(&state, &bot_id); + + if !client.is_enabled() { + return Err("BotModels is not enabled. 
Set botmodels-enabled=true in config.csv".into()); + } + + client.generate_image(&prompt).await +} + +/// VIDEO "prompt" - Generate a video from text prompt +/// Returns the URL/path to the generated video file +pub fn video_keyword(state: Arc, user: UserSession, engine: &mut Engine) { + let state_clone = Arc::clone(&state); + let user_clone = user.clone(); + + engine + .register_custom_syntax(&["VIDEO", "$expr$"], false, move |context, inputs| { + let prompt = context.eval_expression_tree(&inputs[0])?.to_string(); + + trace!("VIDEO keyword: generating video for prompt: {}", prompt); + + let state_for_thread = Arc::clone(&state_clone); + let bot_id = user_clone.bot_id; + + let (tx, rx) = std::sync::mpsc::channel(); + + std::thread::spawn(move || { + let rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(2) + .enable_all() + .build(); + + let send_err = if let Ok(rt) = rt { + let result = rt.block_on(async move { + execute_video_generation(state_for_thread, bot_id, prompt).await + }); + tx.send(result).err() + } else { + tx.send(Err("Failed to build tokio runtime".into())).err() + }; + + if send_err.is_some() { + error!("Failed to send VIDEO result"); + } + }); + + // Video generation can take longer + match rx.recv_timeout(Duration::from_secs(600)) { + Ok(Ok(result)) => Ok(Dynamic::from(result)), + Ok(Err(e)) => Err(Box::new(rhai::EvalAltResult::ErrorRuntime( + e.to_string().into(), + rhai::Position::NONE, + ))), + Err(std::sync::mpsc::RecvTimeoutError::Timeout) => { + Err(Box::new(rhai::EvalAltResult::ErrorRuntime( + "Video generation timed out".into(), + rhai::Position::NONE, + ))) + } + Err(e) => Err(Box::new(rhai::EvalAltResult::ErrorRuntime( + format!("VIDEO thread failed: {}", e).into(), + rhai::Position::NONE, + ))), + } + }) + .unwrap(); +} + +async fn execute_video_generation( + state: Arc, + bot_id: uuid::Uuid, + prompt: String, +) -> Result> { + let client = BotModelsClient::from_state(&state, &bot_id); + + if !client.is_enabled() { + return Err("BotModels is not enabled. 
Set botmodels-enabled=true in config.csv".into()); + } + + client.generate_video(&prompt).await +} + +/// AUDIO "text" - Generate speech audio from text +/// Returns the URL/path to the generated audio file +pub fn audio_keyword(state: Arc, user: UserSession, engine: &mut Engine) { + let state_clone = Arc::clone(&state); + let user_clone = user.clone(); + + engine + .register_custom_syntax(&["AUDIO", "$expr$"], false, move |context, inputs| { + let text = context.eval_expression_tree(&inputs[0])?.to_string(); + + trace!("AUDIO keyword: generating speech for text: {}", text); + + let state_for_thread = Arc::clone(&state_clone); + let bot_id = user_clone.bot_id; + + let (tx, rx) = std::sync::mpsc::channel(); + + std::thread::spawn(move || { + let rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(2) + .enable_all() + .build(); + + let send_err = if let Ok(rt) = rt { + let result = rt.block_on(async move { + execute_audio_generation(state_for_thread, bot_id, text).await + }); + tx.send(result).err() + } else { + tx.send(Err("Failed to build tokio runtime".into())).err() + }; + + if send_err.is_some() { + error!("Failed to send AUDIO result"); + } + }); + + match rx.recv_timeout(Duration::from_secs(120)) { + Ok(Ok(result)) => Ok(Dynamic::from(result)), + Ok(Err(e)) => Err(Box::new(rhai::EvalAltResult::ErrorRuntime( + e.to_string().into(), + rhai::Position::NONE, + ))), + Err(std::sync::mpsc::RecvTimeoutError::Timeout) => { + Err(Box::new(rhai::EvalAltResult::ErrorRuntime( + "Audio generation timed out".into(), + rhai::Position::NONE, + ))) + } + Err(e) => Err(Box::new(rhai::EvalAltResult::ErrorRuntime( + format!("AUDIO thread failed: {}", e).into(), + rhai::Position::NONE, + ))), + } + }) + .unwrap(); +} + +async fn execute_audio_generation( + state: Arc, + bot_id: uuid::Uuid, + text: String, +) -> Result> { + let client = BotModelsClient::from_state(&state, &bot_id); + + if !client.is_enabled() { + return Err("BotModels is not enabled. 
Set botmodels-enabled=true in config.csv".into()); + } + + client.generate_audio(&text, None, None).await +} + +/// SEE file - Get caption/description of an image or video file +/// Returns the text description of the visual content +pub fn see_keyword(state: Arc, user: UserSession, engine: &mut Engine) { + let state_clone = Arc::clone(&state); + let user_clone = user.clone(); + + engine + .register_custom_syntax(&["SEE", "$expr$"], false, move |context, inputs| { + let file_path = context.eval_expression_tree(&inputs[0])?.to_string(); + + trace!("SEE keyword: getting caption for file: {}", file_path); + + let state_for_thread = Arc::clone(&state_clone); + let bot_id = user_clone.bot_id; + + let (tx, rx) = std::sync::mpsc::channel(); + + std::thread::spawn(move || { + let rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(2) + .enable_all() + .build(); + + let send_err = if let Ok(rt) = rt { + let result = rt.block_on(async move { + execute_see_caption(state_for_thread, bot_id, file_path).await + }); + tx.send(result).err() + } else { + tx.send(Err("Failed to build tokio runtime".into())).err() + }; + + if send_err.is_some() { + error!("Failed to send SEE result"); + } + }); + + match rx.recv_timeout(Duration::from_secs(60)) { + Ok(Ok(result)) => Ok(Dynamic::from(result)), + Ok(Err(e)) => Err(Box::new(rhai::EvalAltResult::ErrorRuntime( + e.to_string().into(), + rhai::Position::NONE, + ))), + Err(std::sync::mpsc::RecvTimeoutError::Timeout) => { + Err(Box::new(rhai::EvalAltResult::ErrorRuntime( + "Vision/caption timed out".into(), + rhai::Position::NONE, + ))) + } + Err(e) => Err(Box::new(rhai::EvalAltResult::ErrorRuntime( + format!("SEE thread failed: {}", e).into(), + rhai::Position::NONE, + ))), + } + }) + .unwrap(); +} + +async fn execute_see_caption( + state: Arc, + bot_id: uuid::Uuid, + file_path: String, +) -> Result> { + let client = BotModelsClient::from_state(&state, &bot_id); + + if !client.is_enabled() { + return Err("BotModels is not enabled. 
Set botmodels-enabled=true in config.csv".into()); + } + + // Determine if it's a video or image based on extension + let lower_path = file_path.to_lowercase(); + if lower_path.ends_with(".mp4") + || lower_path.ends_with(".avi") + || lower_path.ends_with(".mov") + || lower_path.ends_with(".webm") + || lower_path.ends_with(".mkv") + { + client.describe_video(&file_path).await + } else { + client.describe_image(&file_path).await + } +} diff --git a/src/basic/mod.rs b/src/basic/mod.rs index 776467a3..24908ace 100644 --- a/src/basic/mod.rs +++ b/src/basic/mod.rs @@ -23,6 +23,7 @@ use self::keywords::format::format_keyword; use self::keywords::get::get_keyword; use self::keywords::hear_talk::{hear_keyword, talk_keyword}; use self::keywords::last::last_keyword; +use self::keywords::multimodal::register_multimodal_keywords; use self::keywords::remember::remember_keyword; use self::keywords::save_from_unstructured::save_from_unstructured_keyword; use self::keywords::send_mail::send_mail_keyword; @@ -92,6 +93,10 @@ impl ScriptService { &mut engine, ); + // Register multimodal keywords (IMAGE, VIDEO, AUDIO, SEE) + // These connect to botmodels for image/video/audio generation and vision/captioning + register_multimodal_keywords(state.clone(), user.clone(), &mut engine); + ScriptService { engine } } fn preprocess_basic_script(&self, script: &str) -> String { @@ -158,6 +163,11 @@ impl ScriptService { "SET USER", "GET BOT MEMORY", "SET BOT MEMORY", + "IMAGE", + "VIDEO", + "AUDIO", + "SEE", + "SEND FILE", ]; let is_basic_command = basic_commands.iter().any(|&cmd| trimmed.starts_with(cmd)); let is_control_flow = trimmed.starts_with("IF") diff --git a/src/lib.rs b/src/lib.rs index 30f1140b..81f8f660 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,7 @@ // Core modules (always included) pub mod basic; pub mod core; +pub mod multimodal; pub mod security; pub mod web; diff --git a/src/multimodal/mod.rs b/src/multimodal/mod.rs new file mode 100644 index 00000000..4a4cce83 --- /dev/null +++ b/src/multimodal/mod.rs @@ -0,0 +1,652 @@ +//! Multimodal module for botmodels integration +//! 
Provides client for image, video, audio generation and vision/captioning + +use crate::config::ConfigManager; +use crate::shared::state::AppState; +use log::{error, info, trace}; +use reqwest::Client; +use serde::{Deserialize, Serialize}; +use std::sync::Arc; +use uuid::Uuid; + +/// Configuration for botmodels connection +#[derive(Debug, Clone)] +pub struct BotModelsConfig { + pub enabled: bool, + pub host: String, + pub port: u16, + pub api_key: String, + pub use_https: bool, +} + +impl BotModelsConfig { + pub fn from_database(config_manager: &ConfigManager, bot_id: &Uuid) -> Self { + let enabled = config_manager + .get_config(bot_id, "botmodels-enabled", Some("false")) + .unwrap_or_default() + .to_lowercase() + == "true"; + + let host = config_manager + .get_config(bot_id, "botmodels-host", Some("0.0.0.0")) + .unwrap_or_else(|_| "0.0.0.0".to_string()); + + let port = config_manager + .get_config(bot_id, "botmodels-port", Some("8085")) + .unwrap_or_else(|_| "8085".to_string()) + .parse() + .unwrap_or(8085); + + let api_key = config_manager + .get_config(bot_id, "botmodels-api-key", Some("")) + .unwrap_or_default(); + + let use_https = config_manager + .get_config(bot_id, "botmodels-https", Some("false")) + .unwrap_or_default() + .to_lowercase() + == "true"; + + Self { + enabled, + host, + port, + api_key, + use_https, + } + } + + pub fn base_url(&self) -> String { + let protocol = if self.use_https { "https" } else { "http" }; + format!("{}://{}:{}", protocol, self.host, self.port) + } +} + +/// Image generation configuration +#[derive(Debug, Clone)] +pub struct ImageGeneratorConfig { + pub model: String, + pub steps: u32, + pub width: u32, + pub height: u32, + pub gpu_layers: u32, + pub batch_size: u32, +} + +impl ImageGeneratorConfig { + pub fn from_database(config_manager: &ConfigManager, bot_id: &Uuid) -> Self { + Self { + model: config_manager + .get_config(bot_id, "image-generator-model", None) + .unwrap_or_default(), + steps: config_manager + .get_config(bot_id, "image-generator-steps", Some("4")) + .unwrap_or_else(|_| "4".to_string()) + .parse() + .unwrap_or(4), + width: config_manager + .get_config(bot_id, "image-generator-width", Some("512")) + .unwrap_or_else(|_| "512".to_string()) + .parse() + .unwrap_or(512), + height: config_manager + .get_config(bot_id, "image-generator-height", Some("512")) + .unwrap_or_else(|_| "512".to_string()) + .parse() + .unwrap_or(512), + gpu_layers: config_manager + .get_config(bot_id, "image-generator-gpu-layers", Some("20")) + .unwrap_or_else(|_| "20".to_string()) + .parse() + .unwrap_or(20), + batch_size: config_manager + .get_config(bot_id, "image-generator-batch-size", Some("1")) + .unwrap_or_else(|_| "1".to_string()) + .parse() + .unwrap_or(1), + } + } +} + +/// Video generation configuration +#[derive(Debug, Clone)] +pub struct VideoGeneratorConfig { + pub model: String, + pub frames: u32, + pub fps: u32, + pub width: u32, + pub height: u32, + pub gpu_layers: u32, + pub batch_size: u32, +} + +impl VideoGeneratorConfig { + pub fn from_database(config_manager: &ConfigManager, bot_id: &Uuid) -> Self { + Self { + model: config_manager + .get_config(bot_id, "video-generator-model", None) + .unwrap_or_default(), + frames: config_manager + .get_config(bot_id, "video-generator-frames", Some("24")) + .unwrap_or_else(|_| "24".to_string()) + .parse() + .unwrap_or(24), + fps: config_manager + .get_config(bot_id, "video-generator-fps", Some("8")) + .unwrap_or_else(|_| "8".to_string()) + .parse() + .unwrap_or(8), + width: config_manager + 
.get_config(bot_id, "video-generator-width", Some("320"))
+                .unwrap_or_else(|_| "320".to_string())
+                .parse()
+                .unwrap_or(320),
+            height: config_manager
+                .get_config(bot_id, "video-generator-height", Some("576"))
+                .unwrap_or_else(|_| "576".to_string())
+                .parse()
+                .unwrap_or(576),
+            gpu_layers: config_manager
+                .get_config(bot_id, "video-generator-gpu-layers", Some("15"))
+                .unwrap_or_else(|_| "15".to_string())
+                .parse()
+                .unwrap_or(15),
+            batch_size: config_manager
+                .get_config(bot_id, "video-generator-batch-size", Some("1"))
+                .unwrap_or_else(|_| "1".to_string())
+                .parse()
+                .unwrap_or(1),
+        }
+    }
+}
+
+// API Request/Response types
+
+#[derive(Debug, Serialize)]
+pub struct ImageGenerateRequest {
+    pub prompt: String,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub steps: Option<u32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub width: Option<u32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub height: Option<u32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub guidance_scale: Option<f32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub seed: Option<i64>,
+}
+
+#[derive(Debug, Serialize)]
+pub struct VideoGenerateRequest {
+    pub prompt: String,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub num_frames: Option<u32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub fps: Option<u32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub steps: Option<u32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub seed: Option<i64>,
+}
+
+#[derive(Debug, Serialize)]
+pub struct SpeechGenerateRequest {
+    pub prompt: String,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub voice: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub language: Option<String>,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct GenerationResponse {
+    pub status: String,
+    pub file_path: Option<String>,
+    pub generation_time: Option<f64>,
+    pub error: Option<String>,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct DescribeResponse {
+    pub description: String,
+    pub confidence: Option<f32>,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct VideoDescribeResponse {
+    pub description: String,
+    pub frame_count: Option<u32>,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct SpeechToTextResponse {
+    pub text: String,
+    pub language: Option<String>,
+    pub confidence: Option<f32>,
+}
+
+/// BotModels client for multimodal operations
+pub struct BotModelsClient {
+    client: Client,
+    config: BotModelsConfig,
+    image_config: ImageGeneratorConfig,
+    video_config: VideoGeneratorConfig,
+}
+
+impl BotModelsClient {
+    pub fn new(
+        config: BotModelsConfig,
+        image_config: ImageGeneratorConfig,
+        video_config: VideoGeneratorConfig,
+    ) -> Self {
+        let client = Client::builder()
+            .danger_accept_invalid_certs(true) // For self-signed certs in dev
+            .timeout(std::time::Duration::from_secs(300)) // 5 min timeout for generation
+            .build()
+            .unwrap_or_else(|_| Client::new());
+
+        Self {
+            client,
+            config,
+            image_config,
+            video_config,
+        }
+    }
+
+    pub fn from_state(state: &AppState, bot_id: &Uuid) -> Self {
+        let config_manager = ConfigManager::new(state.conn.clone());
+        let config = BotModelsConfig::from_database(&config_manager, bot_id);
+        let image_config = ImageGeneratorConfig::from_database(&config_manager, bot_id);
+        let video_config = VideoGeneratorConfig::from_database(&config_manager, bot_id);
+        Self::new(config, image_config, video_config)
+    }
+
+    pub fn is_enabled(&self) -> bool {
+        self.config.enabled
+    }
+
+    /// Generate an image from a text prompt
+    pub async fn generate_image(
+        &self,
+        prompt: &str,
+    ) -> 
Result<String, Box<dyn std::error::Error + Send + Sync>> {
+        if !self.config.enabled {
+            return Err("BotModels is not enabled".into());
+        }
+
+        let url = format!("{}/api/image/generate", self.config.base_url());
+        trace!("Generating image at {}: {}", url, prompt);
+
+        let request = ImageGenerateRequest {
+            prompt: prompt.to_string(),
+            steps: Some(self.image_config.steps),
+            width: Some(self.image_config.width),
+            height: Some(self.image_config.height),
+            guidance_scale: Some(7.5),
+            seed: None,
+        };
+
+        let response = self
+            .client
+            .post(&url)
+            .header("X-API-Key", &self.config.api_key)
+            .json(&request)
+            .send()
+            .await?;
+
+        if !response.status().is_success() {
+            let error_text = response.text().await.unwrap_or_default();
+            error!("Image generation failed: {}", error_text);
+            return Err(format!("Image generation failed: {}", error_text).into());
+        }
+
+        let result: GenerationResponse = response.json().await?;
+
+        if result.status == "completed" {
+            if let Some(file_path) = result.file_path {
+                let full_url = format!("{}{}", self.config.base_url(), file_path);
+                info!("Image generated: {}", full_url);
+                return Ok(full_url);
+            }
+        }
+
+        Err(result
+            .error
+            .unwrap_or_else(|| "Unknown error".to_string())
+            .into())
+    }
+
+    /// Generate a video from a text prompt
+    pub async fn generate_video(
+        &self,
+        prompt: &str,
+    ) -> Result<String, Box<dyn std::error::Error + Send + Sync>> {
+        if !self.config.enabled {
+            return Err("BotModels is not enabled".into());
+        }
+
+        let url = format!("{}/api/video/generate", self.config.base_url());
+        trace!("Generating video at {}: {}", url, prompt);
+
+        let request = VideoGenerateRequest {
+            prompt: prompt.to_string(),
+            num_frames: Some(self.video_config.frames),
+            fps: Some(self.video_config.fps),
+            steps: Some(50),
+            seed: None,
+        };
+
+        let response = self
+            .client
+            .post(&url)
+            .header("X-API-Key", &self.config.api_key)
+            .json(&request)
+            .send()
+            .await?;
+
+        if !response.status().is_success() {
+            let error_text = response.text().await.unwrap_or_default();
+            error!("Video generation failed: {}", error_text);
+            return Err(format!("Video generation failed: {}", error_text).into());
+        }
+
+        let result: GenerationResponse = response.json().await?;
+
+        if result.status == "completed" {
+            if let Some(file_path) = result.file_path {
+                let full_url = format!("{}{}", self.config.base_url(), file_path);
+                info!("Video generated: {}", full_url);
+                return Ok(full_url);
+            }
+        }
+
+        Err(result
+            .error
+            .unwrap_or_else(|| "Unknown error".to_string())
+            .into())
+    }
+
+    /// Generate audio/speech from text
+    pub async fn generate_audio(
+        &self,
+        text: &str,
+        voice: Option<&str>,
+        language: Option<&str>,
+    ) -> Result<String, Box<dyn std::error::Error + Send + Sync>> {
+        if !self.config.enabled {
+            return Err("BotModels is not enabled".into());
+        }
+
+        let url = format!("{}/api/speech/generate", self.config.base_url());
+        trace!("Generating audio at {}: {}", url, text);
+
+        let request = SpeechGenerateRequest {
+            prompt: text.to_string(),
+            voice: voice.map(String::from),
+            language: language.map(String::from),
+        };
+
+        let response = self
+            .client
+            .post(&url)
+            .header("X-API-Key", &self.config.api_key)
+            .json(&request)
+            .send()
+            .await?;
+
+        if !response.status().is_success() {
+            let error_text = response.text().await.unwrap_or_default();
+            error!("Audio generation failed: {}", error_text);
+            return Err(format!("Audio generation failed: {}", error_text).into());
+        }
+
+        let result: GenerationResponse = response.json().await?;
+
+        if result.status == "completed" {
+            if let Some(file_path) = result.file_path {
+                let full_url = format!("{}{}", self.config.base_url(), file_path);
+                
info!("Audio generated: {}", full_url); + return Ok(full_url); + } + } + + Err(result + .error + .unwrap_or_else(|| "Unknown error".to_string()) + .into()) + } + + /// Get caption/description for an image + pub async fn describe_image( + &self, + image_url_or_path: &str, + ) -> Result> { + if !self.config.enabled { + return Err("BotModels is not enabled".into()); + } + + let url = format!("{}/api/vision/describe", self.config.base_url()); + trace!("Describing image at {}: {}", url, image_url_or_path); + + // If it's a URL, download the image first + let image_data = if image_url_or_path.starts_with("http") { + let response = self.client.get(image_url_or_path).send().await?; + response.bytes().await?.to_vec() + } else { + tokio::fs::read(image_url_or_path).await? + }; + + let form = reqwest::multipart::Form::new().part( + "file", + reqwest::multipart::Part::bytes(image_data) + .file_name("image.png") + .mime_str("image/png")?, + ); + + let response = self + .client + .post(&url) + .header("X-API-Key", &self.config.api_key) + .multipart(form) + .send() + .await?; + + if !response.status().is_success() { + let error_text = response.text().await.unwrap_or_default(); + error!("Image description failed: {}", error_text); + return Err(format!("Image description failed: {}", error_text).into()); + } + + let result: DescribeResponse = response.json().await?; + info!("Image described: {}", result.description); + Ok(result.description) + } + + /// Get caption/description for a video + pub async fn describe_video( + &self, + video_url_or_path: &str, + ) -> Result> { + if !self.config.enabled { + return Err("BotModels is not enabled".into()); + } + + let url = format!("{}/api/vision/describe_video", self.config.base_url()); + trace!("Describing video at {}: {}", url, video_url_or_path); + + let video_data = if video_url_or_path.starts_with("http") { + let response = self.client.get(video_url_or_path).send().await?; + response.bytes().await?.to_vec() + } else { + tokio::fs::read(video_url_or_path).await? + }; + + let form = reqwest::multipart::Form::new().part( + "file", + reqwest::multipart::Part::bytes(video_data) + .file_name("video.mp4") + .mime_str("video/mp4")?, + ); + + let response = self + .client + .post(&url) + .header("X-API-Key", &self.config.api_key) + .multipart(form) + .send() + .await?; + + if !response.status().is_success() { + let error_text = response.text().await.unwrap_or_default(); + error!("Video description failed: {}", error_text); + return Err(format!("Video description failed: {}", error_text).into()); + } + + let result: VideoDescribeResponse = response.json().await?; + info!("Video described: {}", result.description); + Ok(result.description) + } + + /// Convert speech to text + pub async fn speech_to_text( + &self, + audio_url_or_path: &str, + ) -> Result> { + if !self.config.enabled { + return Err("BotModels is not enabled".into()); + } + + let url = format!("{}/api/speech/totext", self.config.base_url()); + trace!( + "Converting speech to text at {}: {}", + url, + audio_url_or_path + ); + + let audio_data = if audio_url_or_path.starts_with("http") { + let response = self.client.get(audio_url_or_path).send().await?; + response.bytes().await?.to_vec() + } else { + tokio::fs::read(audio_url_or_path).await? 
+ }; + + let form = reqwest::multipart::Form::new().part( + "file", + reqwest::multipart::Part::bytes(audio_data) + .file_name("audio.wav") + .mime_str("audio/wav")?, + ); + + let response = self + .client + .post(&url) + .header("X-API-Key", &self.config.api_key) + .multipart(form) + .send() + .await?; + + if !response.status().is_success() { + let error_text = response.text().await.unwrap_or_default(); + error!("Speech to text failed: {}", error_text); + return Err(format!("Speech to text failed: {}", error_text).into()); + } + + let result: SpeechToTextResponse = response.json().await?; + info!("Speech converted: {}", result.text); + Ok(result.text) + } + + /// Check if botmodels server is healthy + pub async fn health_check(&self) -> bool { + if !self.config.enabled { + return false; + } + + let url = format!("{}/api/health", self.config.base_url()); + match self.client.get(&url).send().await { + Ok(response) => response.status().is_success(), + Err(_) => false, + } + } + + /// Download generated file to local path + pub async fn download_file( + &self, + url: &str, + local_path: &str, + ) -> Result<(), Box> { + let response = self.client.get(url).send().await?; + let bytes = response.bytes().await?; + tokio::fs::write(local_path, bytes).await?; + Ok(()) + } +} + +/// Ensure botmodels server is running (similar to llama.cpp startup) +pub async fn ensure_botmodels_running( + app_state: Arc, +) -> Result<(), Box> { + use crate::shared::models::schema::bots::dsl::*; + use diesel::prelude::*; + + let config_values = { + let conn_arc = app_state.conn.clone(); + let default_bot_id = tokio::task::spawn_blocking(move || { + let mut conn = conn_arc.get().unwrap(); + bots.filter(name.eq("default")) + .select(id) + .first::(&mut *conn) + .unwrap_or_else(|_| uuid::Uuid::nil()) + }) + .await?; + + let config_manager = ConfigManager::new(app_state.conn.clone()); + let config = BotModelsConfig::from_database(&config_manager, &default_bot_id); + config + }; + + if !config_values.enabled { + info!("BotModels is disabled, skipping startup"); + return Ok(()); + } + + info!("Checking BotModels server status..."); + info!(" Host: {}", config_values.host); + info!(" Port: {}", config_values.port); + + let client = BotModelsClient::new( + config_values.clone(), + ImageGeneratorConfig { + model: String::new(), + steps: 4, + width: 512, + height: 512, + gpu_layers: 20, + batch_size: 1, + }, + VideoGeneratorConfig { + model: String::new(), + frames: 24, + fps: 8, + width: 320, + height: 576, + gpu_layers: 15, + batch_size: 1, + }, + ); + + // Check if already running + if client.health_check().await { + info!("BotModels server is already running and healthy"); + return Ok(()); + } + + info!("BotModels server not responding, it should be started externally"); + info!( + "Start botmodels with: cd botmodels && python -m uvicorn src.main:app --host {} --port {}", + config_values.host, config_values.port + ); + + Ok(()) +} diff --git a/templates/default.gbai/default.gbot/config.csv b/templates/default.gbai/default.gbot/config.csv index 0d473420..135788d1 100644 --- a/templates/default.gbai/default.gbot/config.csv +++ b/templates/default.gbai/default.gbot/config.csv @@ -49,3 +49,25 @@ custom-password, website-expires,1d website-max-depth,3 website-max-pages,100 + + +image-generator-model,../../../../data/diffusion/sd_turbo_f16.gguf +image-generator-steps,4 +image-generator-width,512 +image-generator-height,512 +image-generator-gpu-layers,20 +image-generator-batch-size,1 + 
+video-generator-model,../../../../data/diffusion/zeroscope_v2_576w +video-generator-frames,24 +video-generator-fps,8 +video-generator-width,320 +video-generator-height,576 +video-generator-gpu-layers,15 +video-generator-batch-size,1 + +botmodels-enabled,true +botmodels-host,0.0.0.0 +botmodels-port,8085 + +default-generator,all
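
For reference, below is a minimal sketch of driving the new client directly from Rust (for example from an integration test) rather than through the BASIC keywords. It is not part of the patch: it assumes the crate is importable as `botserver`, that the botmodels service is reachable with the host/port/API key shown in the example config above, and it only uses items introduced in `src/multimodal/mod.rs` (`BotModelsConfig`, `ImageGeneratorConfig`, `VideoGeneratorConfig`, `BotModelsClient`).

```rust
use botserver::multimodal::{
    BotModelsClient, BotModelsConfig, ImageGeneratorConfig, VideoGeneratorConfig,
};

#[tokio::main]
async fn main() {
    // Connection settings; the values mirror the example config.csv and are assumptions.
    let config = BotModelsConfig {
        enabled: true,
        host: "127.0.0.1".to_string(),
        port: 8085,
        api_key: "your-secret-key".to_string(), // must match API_KEY on the botmodels side
        use_https: false,
    };

    // Generation defaults, matching the documented config keys.
    let image = ImageGeneratorConfig {
        model: String::new(),
        steps: 4,
        width: 512,
        height: 512,
        gpu_layers: 20,
        batch_size: 1,
    };
    let video = VideoGeneratorConfig {
        model: String::new(),
        frames: 24,
        fps: 8,
        width: 320,
        height: 576,
        gpu_layers: 15,
        batch_size: 1,
    };

    let client = BotModelsClient::new(config, image, video);

    // /api/health must answer before generation calls can succeed.
    if !client.health_check().await {
        eprintln!("botmodels is not reachable; start it with uvicorn first");
        return;
    }

    // Returns the full URL of the generated file on the botmodels host.
    match client.generate_image("a cute cat playing with yarn").await {
        Ok(url) => println!("image available at {url}"),
        Err(e) => eprintln!("image generation failed: {e}"),
    }
}
```

Because `generate_image` (like `generate_video` and `generate_audio`) returns the full URL of the file on the botmodels host, the result can be handed straight to `SEND FILE TO` in a BASIC script or pulled locally with `BotModelsClient::download_file`.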