use crate::core::config::ConfigManager; use crate::core::kb::embedding_generator::set_embedding_server_ready; use crate::core::shared::memory_monitor::{log_jemalloc_stats, MemoryStats}; use crate::security::command_guard::SafeCommand; use crate::core::shared::models::schema::bots::dsl::*; use crate::core::shared::state::AppState; use diesel::prelude::*; use log::{error, info, trace, warn}; use reqwest; use std::fmt::Write; use std::sync::Arc; use tokio; pub async fn ensure_llama_servers_running( app_state: Arc, ) -> Result<(), Box> { trace!("ensure_llama_servers_running ENTER"); let start_mem = MemoryStats::current(); trace!( "[LLM_LOCAL] ensure_llama_servers_running START, RSS={}", MemoryStats::format_bytes(start_mem.rss_bytes) ); log_jemalloc_stats(); if std::env::var("SKIP_LLM_SERVER").is_ok() { trace!("SKIP_LLM_SERVER set, returning early"); info!("SKIP_LLM_SERVER set - skipping local LLM server startup (using mock/external LLM)"); return Ok(()); } let config_values = { let conn_arc = app_state.conn.clone(); let (default_bot_id, _default_bot_name) = tokio::task::spawn_blocking(move || -> Result<(uuid::Uuid, String), String> { let mut conn = conn_arc .get() .map_err(|e| format!("failed to get db connection: {e}"))?; Ok(crate::core::bot::get_default_bot(&mut conn)) }) .await??; let config_manager = ConfigManager::new(app_state.conn.clone()); info!("Reading config for bot_id: {}", default_bot_id); let embedding_model_result = config_manager.get_config(&default_bot_id, "embedding-model", None); info!("embedding-model config result: {:?}", embedding_model_result); ( default_bot_id, config_manager .get_config(&default_bot_id, "llm-server", Some("true")) .unwrap_or_else(|_| "true".to_string()), config_manager .get_config(&default_bot_id, "llm-url", Some("http://localhost:8081")) .unwrap_or_else(|_| "http://localhost:8081".to_string()), config_manager .get_config(&default_bot_id, "llm-model", None) .unwrap_or_default(), config_manager .get_config(&default_bot_id, "embedding-url", Some("http://localhost:8082")) .unwrap_or_else(|_| "http://localhost:8082".to_string()), embedding_model_result.unwrap_or_default(), config_manager .get_config(&default_bot_id, "llm-server-path", None) .unwrap_or_default(), ) }; let ( _default_bot_id, llm_server_enabled, llm_url, llm_model, embedding_url, embedding_model, llm_server_path, ) = config_values; let llm_server_enabled = llm_server_enabled.to_lowercase() == "true"; // Use default models when config is empty (no default.gbai/config.csv) let llm_server_path = if llm_server_path.is_empty() { "./botserver-stack/bin/llm/build/bin".to_string() } else { llm_server_path }; let llm_model = if llm_model.is_empty() { info!("No LLM model configured, using default: DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf"); "DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf".to_string() } else { llm_model }; let embedding_model = if embedding_model.is_empty() { info!("No embedding model configured, using default: bge-small-en-v1.5-f32.gguf"); "bge-small-en-v1.5-f32.gguf".to_string() } else { embedding_model }; // For llama-server startup, use path relative to botserver root // The models are in ./data/llm/ and the llama-server runs from botserver root let llm_model_path = format!("./data/llm/{}", llm_model); let embedding_model_path = format!("./data/llm/{}", embedding_model); if !llm_server_enabled { info!("Local LLM server management disabled (llm-server=false). Using external endpoints."); info!(" LLM URL: {llm_url}"); info!(" Embedding URL: {embedding_url}"); return Ok(()); } info!("Starting LLM servers..."); info!("Configuration:"); info!(" LLM URL: {llm_url}"); info!(" Embedding URL: {embedding_url}"); info!(" LLM Model: {llm_model}"); info!(" Embedding Model: {embedding_model}"); info!(" LLM Server Path: {llm_server_path}"); info!("Restarting any existing llama-server processes..."); trace!("About to pkill llama-server..."); let before_pkill = MemoryStats::current(); trace!( "[LLM_LOCAL] Before pkill, RSS={}", MemoryStats::format_bytes(before_pkill.rss_bytes) ); let pkill_result = SafeCommand::new("sh") .and_then(|c| c.arg("-c")) .and_then(|c| c.trusted_shell_script_arg("pkill llama-server -9; true")); match pkill_result { Ok(cmd) => { if let Err(e) = cmd.execute() { error!("Failed to execute pkill for llama-server: {e}"); } else { tokio::time::sleep(tokio::time::Duration::from_secs(2)).await; info!("Existing llama-server processes terminated (if any)"); } } Err(e) => error!("Failed to build pkill command: {e}"), } trace!("pkill done"); let after_pkill = MemoryStats::current(); trace!( "[LLM_LOCAL] After pkill, RSS={} (delta={})", MemoryStats::format_bytes(after_pkill.rss_bytes), MemoryStats::format_bytes(after_pkill.rss_bytes.saturating_sub(before_pkill.rss_bytes)) ); let llm_running = if llm_url.starts_with("https://") { info!("Using external HTTPS LLM server, skipping local startup"); true } else { is_server_running(&llm_url).await }; let embedding_running = if embedding_url.starts_with("https://") { info!("Using external HTTPS embedding server, skipping local startup"); true } else { is_server_running(&embedding_url).await }; if llm_running && embedding_running { info!("Both LLM and Embedding servers are already running"); return Ok(()); } let mut tasks = vec![]; if !llm_running && !llm_model.is_empty() { info!("Starting LLM server..."); let app_state_clone = Arc::clone(&app_state); let llm_server_path_clone = llm_server_path.clone(); let llm_model_path_clone = llm_model_path.clone(); let llm_url_clone = llm_url.clone(); tasks.push(tokio::spawn(async move { start_llm_server( app_state_clone, llm_server_path_clone, llm_model_path_clone, llm_url_clone, ) })); } else if llm_model.is_empty() { info!("LLM_MODEL not set, skipping LLM server"); } if !embedding_running && !embedding_model.is_empty() { info!("Starting Embedding server..."); tasks.push(tokio::spawn(start_embedding_server( llm_server_path.clone(), embedding_model_path.clone(), embedding_url.clone(), ))); } else if embedding_model.is_empty() { info!("EMBEDDING_MODEL not set, skipping Embedding server"); } for task in tasks { task.await??; } info!("Waiting for servers to become ready..."); trace!("Starting wait loop for servers..."); let before_wait = MemoryStats::current(); trace!( "[LLM_LOCAL] Before wait loop, RSS={}", MemoryStats::format_bytes(before_wait.rss_bytes) ); let mut llm_ready = llm_running || llm_model.is_empty(); let mut embedding_ready = embedding_running || embedding_model.is_empty(); let mut attempts = 0; let max_attempts = 120; while attempts < max_attempts && (!llm_ready || !embedding_ready) { trace!("Wait loop iteration {}", attempts); tokio::time::sleep(tokio::time::Duration::from_secs(2)).await; if attempts % 5 == 0 { let loop_mem = MemoryStats::current(); trace!( "[LLM_LOCAL] Wait loop attempt {}, RSS={} (delta from start={})", attempts, MemoryStats::format_bytes(loop_mem.rss_bytes), MemoryStats::format_bytes(loop_mem.rss_bytes.saturating_sub(before_wait.rss_bytes)) ); log_jemalloc_stats(); } if attempts % 5 == 0 { info!( "Checking server health (attempt {}/{max_attempts})...", attempts + 1 ); } if !llm_ready && !llm_model.is_empty() { if is_server_running(&llm_url).await { info!("LLM server ready at {llm_url}"); llm_ready = true; } else { info!("LLM server not ready yet"); } } if !embedding_ready && !embedding_model.is_empty() { if is_server_running(&embedding_url).await { info!("Embedding server ready at {embedding_url}"); embedding_ready = true; set_embedding_server_ready(true); } else if attempts % 10 == 0 { warn!("Embedding server not ready yet at {embedding_url}"); if let Ok(log_content) = std::fs::read_to_string(format!("{llm_server_path}/llmembd-stdout.log")) { let last_lines: Vec<&str> = log_content.lines().rev().take(5).collect(); if !last_lines.is_empty() { info!("Embedding server log (last 5 lines):"); for line in last_lines.iter().rev() { info!(" {line}"); } } } } } attempts += 1; if attempts % 20 == 0 { warn!( "Still waiting for servers... (attempt {attempts}/{max_attempts}) - this may take a while for large models" ); } } if llm_ready && embedding_ready { info!("All llama.cpp servers are ready and responding!"); if !embedding_model.is_empty() { set_embedding_server_ready(true); } trace!("Servers ready!"); let after_ready = MemoryStats::current(); trace!( "[LLM_LOCAL] Servers ready, RSS={} (delta from start={})", MemoryStats::format_bytes(after_ready.rss_bytes), MemoryStats::format_bytes(after_ready.rss_bytes.saturating_sub(start_mem.rss_bytes)) ); log_jemalloc_stats(); let _llm_provider1 = Arc::new(crate::llm::OpenAIClient::new( llm_model.clone(), Some(llm_url.clone()), None, )); let end_mem = MemoryStats::current(); trace!( "[LLM_LOCAL] ensure_llama_servers_running END, RSS={} (total delta={})", MemoryStats::format_bytes(end_mem.rss_bytes), MemoryStats::format_bytes(end_mem.rss_bytes.saturating_sub(start_mem.rss_bytes)) ); log_jemalloc_stats(); trace!("ensure_llama_servers_running EXIT OK"); Ok(()) } else { let mut error_msg = "Servers failed to start within timeout:".to_string(); if !llm_ready && !llm_model.is_empty() { let _ = write!(error_msg, "\n - LLM server at {llm_url}"); } if !embedding_ready && !embedding_model.is_empty() { let _ = write!(error_msg, "\n - Embedding server at {embedding_url}"); } Err(error_msg.into()) } } pub async fn is_server_running(url: &str) -> bool { let client = reqwest::Client::builder() .timeout(std::time::Duration::from_secs(5)) .build() .unwrap_or_default(); match client.get(format!("{url}/health")).send().await { Ok(response) => { if response.status().is_success() { return true; } info!("Health check returned status: {}", response.status()); false } Err(e) => match client.get(url).send().await { Ok(response) => response.status().is_success(), Err(_) => { if !e.is_connect() { warn!("Health check error for {url}: {e}"); } false } }, } } pub fn start_llm_server( app_state: Arc, llama_cpp_path: String, model_path: String, url: String, ) -> Result<(), Box> { let port = extract_port(&url); std::env::set_var("OMP_NUM_THREADS", "20"); std::env::set_var("OMP_PLACES", "cores"); std::env::set_var("OMP_PROC_BIND", "close"); let conn = app_state.conn.clone(); let config_manager = ConfigManager::new(conn.clone()); let mut conn = conn.get().map_err(|e| { Box::new(std::io::Error::other( format!("failed to get db connection: {e}"), )) as Box })?; let default_bot_id = bots .filter(name.eq("default")) .select(id) .first::(&mut *conn) .unwrap_or_else(|_| uuid::Uuid::nil()); let n_moe = config_manager .get_config(&default_bot_id, "llm-server-n-moe", None) .unwrap_or_else(|_| "4".to_string()); let n_moe = if n_moe.is_empty() { "4".to_string() } else { n_moe }; let parallel = config_manager .get_config(&default_bot_id, "llm-server-parallel", None) .unwrap_or_else(|_| "1".to_string()); let parallel = if parallel.is_empty() { "1".to_string() } else { parallel }; let cont_batching = config_manager .get_config(&default_bot_id, "llm-server-cont-batching", None) .unwrap_or_else(|_| "true".to_string()); let cont_batching = if cont_batching.is_empty() { "true".to_string() } else { cont_batching }; let mlock = config_manager .get_config(&default_bot_id, "llm-server-mlock", None) .unwrap_or_else(|_| "true".to_string()); let mlock = if mlock.is_empty() { "true".to_string() } else { mlock }; let no_mmap = config_manager .get_config(&default_bot_id, "llm-server-no-mmap", None) .unwrap_or_else(|_| "true".to_string()); let no_mmap = if no_mmap.is_empty() { "true".to_string() } else { no_mmap }; let gpu_layers = config_manager .get_config(&default_bot_id, "llm-server-gpu-layers", None) .unwrap_or_else(|_| "20".to_string()); let gpu_layers = if gpu_layers.is_empty() { "20".to_string() } else { gpu_layers }; let reasoning_format = config_manager .get_config(&default_bot_id, "llm-server-reasoning-format", None) .unwrap_or_else(|_| String::new()); let n_predict = config_manager .get_config(&default_bot_id, "llm-server-n-predict", None) .unwrap_or_else(|_| "512".to_string()); // Increased default for DeepSeek R1 reasoning let n_predict = if n_predict.is_empty() { "512".to_string() } else { n_predict }; let n_ctx_size = config_manager .get_config(&default_bot_id, "llm-server-ctx-size", None) .unwrap_or_else(|_| "32000".to_string()); let n_ctx_size = if n_ctx_size.is_empty() { "32000".to_string() } else { n_ctx_size }; let cmd_path = if cfg!(windows) { format!("{}\\llama-server.exe", llama_cpp_path) } else { format!("{}/llama-server", llama_cpp_path) }; let mut command = std::process::Command::new(&cmd_path); command.arg("-m").arg(&model_path) .arg("--host").arg("0.0.0.0") .arg("--port").arg(port) .arg("--top_p").arg("0.95") .arg("--temp").arg("0.6") .arg("--repeat-penalty").arg("1.2") .arg("--n-gpu-layers").arg(&gpu_layers) .arg("--ubatch-size").arg("2048"); if !reasoning_format.is_empty() { command.arg("--reasoning-format").arg(&reasoning_format); } if n_moe != "0" { command.arg("--n-cpu-moe").arg(&n_moe); } if parallel != "1" { command.arg("--parallel").arg(¶llel); } if cont_batching == "true" { command.arg("--cont-batching"); } if mlock == "true" { command.arg("--mlock"); } if no_mmap == "true" { command.arg("--no-mmap"); } if n_predict != "0" { command.arg("--n-predict").arg(&n_predict); } command.arg("--ctx-size").arg(&n_ctx_size); command.arg("--verbose"); if cfg!(windows) { command.current_dir(&llama_cpp_path); } let log_file_path = if cfg!(windows) { format!("{}\\llm-stdout.log", llama_cpp_path) } else { format!("{}/llm-stdout.log", llama_cpp_path) }; match std::fs::File::create(&log_file_path) { Ok(log_file) => { if let Ok(clone) = log_file.try_clone() { command.stdout(std::process::Stdio::from(clone)); } else { command.stdout(std::process::Stdio::null()); } command.stderr(std::process::Stdio::from(log_file)); } Err(_) => { command.stdout(std::process::Stdio::null()); command.stderr(std::process::Stdio::null()); } } info!("Executing LLM server command: {:?}", command); command.spawn().map_err(|e| { Box::new(std::io::Error::other(e.to_string())) as Box })?; Ok(()) } pub async fn start_embedding_server( llama_cpp_path: String, model_path: String, url: String, ) -> Result<(), Box> { let port = extract_port(&url); // model_path is already the full path (constructed with ../../../../data/llm/ prefix) // Only prepend llama_cpp_path if model_path is a simple filename (not a path) let full_model_path = if model_path.contains('/') || model_path.contains('.') { // model_path is already a full or relative path, use as-is model_path.clone() } else { // model_path is just a filename, prepend llama_cpp_path format!("{llama_cpp_path}/{model_path}") }; if !std::path::Path::new(&full_model_path).exists() { error!("Embedding model file not found: {full_model_path}"); return Err(format!("Embedding model file not found: {full_model_path}").into()); } info!("Starting embedding server on port {port} with model: {model_path}"); let cmd_path = if cfg!(windows) { format!("{}\\llama-server.exe", llama_cpp_path) } else { format!("{}/llama-server", llama_cpp_path) }; let mut command = std::process::Command::new(&cmd_path); command.arg("-m").arg(&model_path) .arg("--host").arg("0.0.0.0") .arg("--port").arg(port) .arg("--embedding") .arg("--n-gpu-layers").arg("99") .arg("--verbose"); if !cfg!(windows) { command.arg("--ubatch-size").arg("2048"); } if cfg!(windows) { command.current_dir(&llama_cpp_path); } let log_file_path = if cfg!(windows) { format!("{}\\stdout.log", llama_cpp_path) } else { format!("{}/llmembd-stdout.log", llama_cpp_path) }; match std::fs::File::create(&log_file_path) { Ok(log_file) => { if let Ok(clone) = log_file.try_clone() { command.stdout(std::process::Stdio::from(clone)); } else { command.stdout(std::process::Stdio::null()); } command.stderr(std::process::Stdio::from(log_file)); } Err(_) => { command.stdout(std::process::Stdio::null()); command.stderr(std::process::Stdio::null()); } } info!("Executing embedding server command: {:?}", command); command.spawn().map_err(|e| { Box::new(std::io::Error::other(e.to_string())) as Box })?; tokio::time::sleep(tokio::time::Duration::from_secs(1)).await; Ok(()) } fn extract_port(url: &str) -> &str { url.rsplit(':').next().unwrap_or("8081") }