From 061c14b4a2f3618214664a2efd7341736664af07 Mon Sep 17 00:00:00 2001 From: "Rodrigo Rodriguez (Pragmatismo)" Date: Tue, 30 Dec 2025 22:42:32 -0300 Subject: [PATCH] Fix tasks UI, WebSocket progress, memory monitoring, and app generator Tasks UI fixes: - Fix task list to query auto_tasks table instead of tasks table - Fix task detail endpoint to use UUID binding for auto_tasks query - Add proper filter handling: complete, active, awaiting, paused, blocked - Add TaskStats fields: awaiting, paused, blocked, time_saved - Add /api/tasks/time-saved endpoint - Add count-all to stats HTML response App generator improvements: - Add AgentActivity struct for detailed terminal-style progress - Add emit_activity method for rich progress events - Add detailed logging for LLM calls with timing - Track files_written, tables_synced, bytes_generated Memory and performance: - Add memory_monitor module for tracking RSS and thread activity - Skip 0-byte files in drive monitor and document processor - Change DRIVE_MONITOR checking logs from info to trace - Remove unused profile_section macro WebSocket progress: - Ensure TaskProgressEvent includes activity field - Add with_activity builder method --- Cargo.toml | 5 + config/directory_config.json | 8 +- src/auto_task/app_generator.rs | 283 +++++++++++++- src/auto_task/autotask_api.rs | 4 +- src/auto_task/intent_classifier.rs | 107 ++++-- src/auto_task/mod.rs | 147 ++++++++ src/console/status_panel.rs | 234 ++++++++---- src/core/automation/mod.rs | 7 +- src/core/bootstrap/mod.rs | 180 ++++++--- src/core/kb/document_processor.rs | 10 +- src/core/kb/embedding_generator.rs | 262 ++++++++++--- src/core/kb/kb_indexer.rs | 60 ++- src/core/package_manager/facade.rs | 4 +- src/core/package_manager/installer.rs | 46 ++- src/core/secrets/mod.rs | 6 +- src/core/shared/memory_monitor.rs | 506 ++++++++++++++++++++++++++ src/core/shared/mod.rs | 1 + src/core/shared/state.rs | 276 ++++++++++++++ src/core/shared/test_utils.rs | 6 +- src/core/shared/utils.rs | 27 ++ src/drive/drive_monitor/mod.rs | 259 +++++++++++-- src/drive/mod.rs | 205 ++++++++--- src/llm/claude.rs | 121 ++++-- src/llm/local.rs | 53 ++- src/main.rs | 51 ++- src/security/integration.rs | 6 +- src/tasks/mod.rs | 293 ++++++++++++--- 27 files changed, 2717 insertions(+), 450 deletions(-) create mode 100644 src/core/shared/memory_monitor.rs diff --git a/Cargo.toml b/Cargo.toml index 3cc1d5c2e..b24b72a56 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -81,6 +81,7 @@ monitoring = ["dep:sysinfo"] automation = ["dep:rhai"] grpc = ["dep:tonic"] progress-bars = ["dep:indicatif"] +jemalloc = ["dep:tikv-jemallocator", "dep:tikv-jemalloc-ctl"] # ===== META FEATURES (BUNDLES) ===== full = [ @@ -216,6 +217,10 @@ tonic = { version = "0.14.2", features = ["transport"], optional = true } # UI Enhancement (progress-bars feature) indicatif = { version = "0.18.0", optional = true } smartstring = "1.0.1" + +# Memory allocator (jemalloc feature) +tikv-jemallocator = { version = "0.6", optional = true } +tikv-jemalloc-ctl = { version = "0.6", features = ["stats"], optional = true } scopeguard = "1.2.0" # Vault secrets management diff --git a/config/directory_config.json b/config/directory_config.json index d12ac678e..85dbd5b30 100644 --- a/config/directory_config.json +++ b/config/directory_config.json @@ -1,7 +1,7 @@ { "base_url": "http://localhost:8300", "default_org": { - "id": "353229789043097614", + "id": "353379211173429262", "name": "default", "domain": "default.localhost" }, @@ -13,8 +13,8 @@ "first_name": "Admin", "last_name": "User" }, - "admin_token": "7QNrwws4y1X5iIuTUCtXpQj9RoQf4fYi144yEY87tNAbLMZOOD57t3YDAqCtIyIkBS1EZ5k", + "admin_token": "pY9ruIghlJlAVn-a-vpbtM0L9yQ3WtweXkXJKEk2aVBL4HEeIppxCA8MPx60ZjQJRghq9zU", "project_id": "", - "client_id": "353229789848469518", - "client_secret": "COd3gKdMO43jkUztTckCNtHrjxa5RtcIlBn7Cbp4GFoXs6mw6iZalB3m4Vv3FK5Y" + "client_id": "353379211962023950", + "client_secret": "vD8uaGZubV4pOMqxfFkGc0YOfmzYII8W7L25V7cGWieQlw0UHuvDQkSuJbQ3Rhgp" } \ No newline at end of file diff --git a/src/auto_task/app_generator.rs b/src/auto_task/app_generator.rs index 0e96262b5..3d63e59f2 100644 --- a/src/auto_task/app_generator.rs +++ b/src/auto_task/app_generator.rs @@ -5,7 +5,7 @@ use crate::basic::keywords::table_definition::{ use crate::core::config::ConfigManager; use crate::core::shared::get_content_type; use crate::core::shared::models::UserSession; -use crate::core::shared::state::AppState; +use crate::core::shared::state::{AgentActivity, AppState}; use aws_sdk_s3::primitives::ByteStream; use chrono::{DateTime, Utc}; use diesel::prelude::*; @@ -143,18 +143,90 @@ struct LlmFile { pub struct AppGenerator { state: Arc, + task_id: Option, + generation_start: Option, + files_written: Vec, + tables_synced: Vec, + bytes_generated: u64, } impl AppGenerator { pub fn new(state: Arc) -> Self { - Self { state } + Self { + state, + task_id: None, + generation_start: None, + files_written: Vec::new(), + tables_synced: Vec::new(), + bytes_generated: 0, + } + } + + pub fn with_task_id(state: Arc, task_id: impl Into) -> Self { + Self { + state, + task_id: Some(task_id.into()), + generation_start: None, + files_written: Vec::new(), + tables_synced: Vec::new(), + bytes_generated: 0, + } + } + + fn emit_activity(&self, step: &str, message: &str, current: u8, total: u8, activity: AgentActivity) { + if let Some(ref task_id) = self.task_id { + self.state.emit_activity(task_id, step, message, current, total, activity); + } + } + + fn calculate_speed(&self, items_done: u32) -> (f32, Option) { + if let Some(start) = self.generation_start { + let elapsed = start.elapsed().as_secs_f32(); + if elapsed > 0.0 { + let speed = (items_done as f32 / elapsed) * 60.0; + return (speed, None); + } + } + (0.0, None) + } + + fn build_activity(&self, phase: &str, items_done: u32, items_total: Option, current_item: Option<&str>) -> AgentActivity { + let (speed, eta) = self.calculate_speed(items_done); + let mut activity = AgentActivity::new(phase) + .with_progress(items_done, items_total) + .with_bytes(self.bytes_generated); + + if speed > 0.0 { + activity = activity.with_speed(speed, eta); + } + + if !self.files_written.is_empty() { + activity = activity.with_files(self.files_written.clone()); + } + + if !self.tables_synced.is_empty() { + activity = activity.with_tables(self.tables_synced.clone()); + } + + if let Some(item) = current_item { + activity = activity.with_current_item(item); + } + + activity } pub async fn generate_app( - &self, + &mut self, intent: &str, session: &UserSession, ) -> Result> { + const TOTAL_STEPS: u8 = 8; + + self.generation_start = Some(std::time::Instant::now()); + self.files_written.clear(); + self.tables_synced.clear(); + self.bytes_generated = 0; + info!( "Generating app from intent: {}", &intent[..intent.len().min(100)] @@ -168,23 +240,82 @@ impl AppGenerator { ), ); + if let Some(ref task_id) = self.task_id { + self.state.emit_task_started(task_id, &format!("Generating app: {}", &intent[..intent.len().min(50)]), TOTAL_STEPS); + } + + let activity = self.build_activity("analyzing", 0, Some(TOTAL_STEPS as u32), Some("Sending request to LLM")); + self.emit_activity( + "llm_request", + "Analyzing request with AI...", + 1, + TOTAL_STEPS, + activity + ); + + info!("[APP_GENERATOR] Calling generate_complete_app_with_llm for intent: {}", &intent[..intent.len().min(50)]); + let llm_start = std::time::Instant::now(); + let llm_app = match self.generate_complete_app_with_llm(intent, session.bot_id).await { Ok(app) => { + let llm_elapsed = llm_start.elapsed(); + info!("[APP_GENERATOR] LLM generation completed in {:?}: app={}, files={}, tables={}", + llm_elapsed, app.name, app.files.len(), app.tables.len()); log_generator_info( &app.name, "LLM successfully generated app structure and files", ); + + let total_bytes: u64 = app.files.iter().map(|f| f.content.len() as u64).sum(); + self.bytes_generated = total_bytes; + + let activity = self.build_activity( + "parsing", + 1, + Some(TOTAL_STEPS as u32), + Some(&format!("Generated {} with {} files", app.name, app.files.len())) + ); + self.emit_activity( + "llm_response", + &format!("AI generated {} structure", app.name), + 2, + TOTAL_STEPS, + activity + ); app } Err(e) => { + let llm_elapsed = llm_start.elapsed(); + error!("[APP_GENERATOR] LLM generation failed after {:?}: {}", llm_elapsed, e); log_generator_error("unknown", "LLM app generation failed", &e.to_string()); + if let Some(ref task_id) = self.task_id { + self.state.emit_task_error(task_id, "llm_request", &e.to_string()); + } return Err(e); } }; + let activity = self.build_activity("parsing", 2, Some(TOTAL_STEPS as u32), Some(&format!("Processing {} structure", llm_app.name))); + self.emit_activity("parse_structure", &format!("Parsing {} structure...", llm_app.name), 3, TOTAL_STEPS, activity); + let tables = Self::convert_llm_tables(&llm_app.tables); if !tables.is_empty() { + let table_names: Vec = tables.iter().map(|t| t.name.clone()).collect(); + let activity = self.build_activity( + "database", + 3, + Some(TOTAL_STEPS as u32), + Some(&format!("Creating tables: {}", table_names.join(", "))) + ); + self.emit_activity( + "create_tables", + &format!("Creating {} database tables...", tables.len()), + 4, + TOTAL_STEPS, + activity + ); + let tables_bas_content = Self::generate_table_definitions(&tables)?; if let Err(e) = self.append_to_tables_bas(session.bot_id, &tables_bas_content) { log_generator_error( @@ -203,6 +334,20 @@ impl AppGenerator { result.tables_created, result.fields_added ), ); + self.tables_synced = table_names; + let activity = self.build_activity( + "database", + 4, + Some(TOTAL_STEPS as u32), + Some(&format!("{} tables, {} fields created", result.tables_created, result.fields_added)) + ); + self.emit_activity( + "tables_synced", + "Database tables created", + 4, + TOTAL_STEPS, + activity + ); } Err(e) => { log_generator_error(&llm_app.name, "Failed to sync tables", &e.to_string()); @@ -214,9 +359,37 @@ impl AppGenerator { let bucket_name = format!("{}.gbai", bot_name.to_lowercase()); let drive_app_path = format!(".gbdrive/apps/{}", llm_app.name); + let total_files = llm_app.files.len(); + let activity = self.build_activity("writing", 0, Some(total_files as u32), Some("Preparing files")); + self.emit_activity( + "write_files", + &format!("Writing {} app files...", total_files), + 5, + TOTAL_STEPS, + activity + ); + let mut pages = Vec::new(); - for file in &llm_app.files { + for (idx, file) in llm_app.files.iter().enumerate() { let drive_path = format!("{}/{}", drive_app_path, file.filename); + + self.files_written.push(file.filename.clone()); + self.bytes_generated += file.content.len() as u64; + + let activity = self.build_activity( + "writing", + (idx + 1) as u32, + Some(total_files as u32), + Some(&file.filename) + ); + self.emit_activity( + "write_file", + &format!("Writing {}", file.filename), + 5, + TOTAL_STEPS, + activity + ); + if let Err(e) = self .write_to_drive(&bucket_name, &drive_path, &file.content) .await @@ -236,7 +409,12 @@ impl AppGenerator { }); } + self.files_written.push("designer.js".to_string()); + let activity = self.build_activity("configuring", total_files as u32, Some(total_files as u32), Some("designer.js")); + self.emit_activity("write_designer", "Creating designer configuration...", 6, TOTAL_STEPS, activity); + let designer_js = Self::generate_designer_js(&llm_app.name); + self.bytes_generated += designer_js.len() as u64; self.write_to_drive( &bucket_name, &format!("{}/designer.js", drive_app_path), @@ -246,8 +424,24 @@ impl AppGenerator { let mut tools = Vec::new(); if let Some(llm_tools) = &llm_app.tools { - for tool in llm_tools { + let tools_count = llm_tools.len(); + let activity = self.build_activity("tools", 0, Some(tools_count as u32), Some("Creating BASIC tools")); + self.emit_activity( + "write_tools", + &format!("Creating {} tools...", tools_count), + 7, + TOTAL_STEPS, + activity + ); + + for (idx, tool) in llm_tools.iter().enumerate() { let tool_path = format!(".gbdialog/tools/{}", tool.filename); + self.files_written.push(format!("tools/{}", tool.filename)); + self.bytes_generated += tool.content.len() as u64; + + let activity = self.build_activity("tools", (idx + 1) as u32, Some(tools_count as u32), Some(&tool.filename)); + self.emit_activity("write_tool", &format!("Writing tool {}", tool.filename), 7, TOTAL_STEPS, activity); + if let Err(e) = self .write_to_drive(&bucket_name, &tool_path, &tool.content) .await @@ -268,8 +462,24 @@ impl AppGenerator { let mut schedulers = Vec::new(); if let Some(llm_schedulers) = &llm_app.schedulers { - for scheduler in llm_schedulers { + let sched_count = llm_schedulers.len(); + let activity = self.build_activity("schedulers", 0, Some(sched_count as u32), Some("Creating schedulers")); + self.emit_activity( + "write_schedulers", + &format!("Creating {} schedulers...", sched_count), + 7, + TOTAL_STEPS, + activity + ); + + for (idx, scheduler) in llm_schedulers.iter().enumerate() { let scheduler_path = format!(".gbdialog/schedulers/{}", scheduler.filename); + self.files_written.push(format!("schedulers/{}", scheduler.filename)); + self.bytes_generated += scheduler.content.len() as u64; + + let activity = self.build_activity("schedulers", (idx + 1) as u32, Some(sched_count as u32), Some(&scheduler.filename)); + self.emit_activity("write_scheduler", &format!("Writing scheduler {}", scheduler.filename), 7, TOTAL_STEPS, activity); + if let Err(e) = self .write_to_drive(&bucket_name, &scheduler_path, &scheduler.content) .await @@ -288,16 +498,22 @@ impl AppGenerator { } } + let activity = self.build_activity("syncing", TOTAL_STEPS as u32 - 1, Some(TOTAL_STEPS as u32), Some("Deploying to site")); + self.emit_activity("sync_site", "Syncing app to site...", 8, TOTAL_STEPS, activity); + self.sync_app_to_site_root(&bucket_name, &llm_app.name, session.bot_id) .await?; + let elapsed = self.generation_start.map(|s| s.elapsed().as_secs()).unwrap_or(0); + log_generator_info( &llm_app.name, &format!( - "App generated: {} files, {} tables, {} tools", + "App generated: {} files, {} tables, {} tools in {}s", pages.len(), tables.len(), - tools.len() + tools.len(), + elapsed ), ); @@ -306,6 +522,24 @@ impl AppGenerator { llm_app.name, bucket_name, drive_app_path ); + if let Some(ref task_id) = self.task_id { + let final_activity = AgentActivity::new("completed") + .with_progress(TOTAL_STEPS as u32, Some(TOTAL_STEPS as u32)) + .with_bytes(self.bytes_generated) + .with_files(self.files_written.clone()) + .with_tables(self.tables_synced.clone()); + + let event = crate::core::shared::state::TaskProgressEvent::new(task_id, "complete", &format!( + "App '{}' created: {} files, {} tables, {} bytes in {}s", + llm_app.name, pages.len(), tables.len(), self.bytes_generated, elapsed + )) + .with_progress(TOTAL_STEPS, TOTAL_STEPS) + .with_activity(final_activity) + .completed(); + + self.state.broadcast_task_progress(event); + } + Ok(GeneratedApp { id: Uuid::new_v4().to_string(), name: llm_app.name, @@ -436,8 +670,14 @@ guid, string, text, integer, decimal, boolean, date, datetime, json === USER REQUEST === "{intent}" -=== YOUR TASK === -Generate a complete application based on the user's request. +=== YOLO MODE - JUST BUILD IT === +DO NOT ask questions. DO NOT request clarification. Just CREATE the app NOW. + +If user says "calculator" → build a full-featured calculator with basic ops, scientific functions, history +If user says "CRM" → build customer management with contacts, companies, deals, notes +If user says "inventory" → build stock tracking with products, categories, movements +If user says "booking" → build appointment scheduler with calendar, slots, confirmations +If user says ANYTHING → interpret creatively and BUILD SOMETHING AWESOME Respond with a single JSON object: {{ @@ -469,15 +709,16 @@ Respond with a single JSON object: ] }} -IMPORTANT: -- For simple utilities (calculator, timer, converter): tables can be empty [], focus on files +CRITICAL RULES: +- For utilities (calculator, timer, converter, BMI, mortgage): tables = [], focus on interactive HTML/JS - For data apps (CRM, inventory): design proper tables and CRUD pages -- Generate ALL files completely - no shortcuts +- Generate ALL files completely - no placeholders, no "...", no shortcuts - CSS must be comprehensive with variables, responsive design, dark mode - Every HTML page needs proper structure with all required scripts - Replace APP_NAME_HERE with actual app name in data-app-name attribute +- BE CREATIVE - add extra features the user didn't ask for but would love -Respond with valid JSON only."# +Respond with valid JSON only. NO QUESTIONS. JUST BUILD."# ); let response = self.call_llm(&prompt, bot_id).await?; @@ -557,7 +798,6 @@ Respond with valid JSON only."# ) -> Result> { #[cfg(feature = "llm")] { - // Get model and key from bot configuration let config_manager = ConfigManager::new(self.state.conn.clone()); let model = config_manager .get_config(&bot_id, "llm-model", None) @@ -579,15 +819,24 @@ Respond with valid JSON only."# "max_tokens": 16000 }); + let prompt_len = prompt.len(); + info!("[APP_GENERATOR] Starting LLM call: model={}, prompt_len={} chars", model, prompt_len); + let start = std::time::Instant::now(); + match self .state .llm_provider .generate(prompt, &llm_config, &model, &key) .await { - Ok(response) => return Ok(response), + Ok(response) => { + let elapsed = start.elapsed(); + info!("[APP_GENERATOR] LLM call succeeded: response_len={} chars, elapsed={:?}", response.len(), elapsed); + return Ok(response); + } Err(e) => { - warn!("LLM call failed: {}", e); + let elapsed = start.elapsed(); + error!("[APP_GENERATOR] LLM call failed after {:?}: {}", elapsed, e); return Err(e); } } diff --git a/src/auto_task/autotask_api.rs b/src/auto_task/autotask_api.rs index 1f7e959a0..37b0fd299 100644 --- a/src/auto_task/autotask_api.rs +++ b/src/auto_task/autotask_api.rs @@ -324,11 +324,11 @@ pub async fn create_and_execute_handler( // Update status to running let _ = update_task_status_db(&state, task_id, "running", None); - // Use IntentClassifier to classify and process + // Use IntentClassifier to classify and process with task tracking let classifier = IntentClassifier::new(Arc::clone(&state)); match classifier - .classify_and_process(&request.intent, &session) + .classify_and_process_with_task_id(&request.intent, &session, Some(task_id.to_string())) .await { Ok(result) => { diff --git a/src/auto_task/intent_classifier.rs b/src/auto_task/intent_classifier.rs index 9e337f531..064deaeee 100644 --- a/src/auto_task/intent_classifier.rs +++ b/src/auto_task/intent_classifier.rs @@ -169,29 +169,20 @@ impl IntentClassifier { &self, intent: &str, session: &UserSession, + ) -> Result> { + self.classify_and_process_with_task_id(intent, session, None).await + } + + /// Classify and then process the intent through the appropriate handler with task tracking + pub async fn classify_and_process_with_task_id( + &self, + intent: &str, + session: &UserSession, + task_id: Option, ) -> Result> { let classification = self.classify(intent, session).await?; - // If clarification is needed, return early - if classification.requires_clarification { - return Ok(IntentResult { - success: false, - intent_type: classification.intent_type, - message: classification - .clarification_question - .unwrap_or_else(|| "Could you please provide more details?".to_string()), - created_resources: Vec::new(), - app_url: None, - task_id: None, - schedule_id: None, - tool_triggers: Vec::new(), - next_steps: vec!["Provide more information".to_string()], - error: None, - }); - } - - // Route to appropriate handler - self.process_classified_intent(&classification, session) + self.process_classified_intent_with_task_id(&classification, session, task_id) .await } @@ -200,6 +191,16 @@ impl IntentClassifier { &self, classification: &ClassifiedIntent, session: &UserSession, + ) -> Result> { + self.process_classified_intent_with_task_id(classification, session, None).await + } + + /// Process a classified intent through the appropriate handler with task tracking + pub async fn process_classified_intent_with_task_id( + &self, + classification: &ClassifiedIntent, + session: &UserSession, + task_id: Option, ) -> Result> { info!( "Processing {} intent: {}", @@ -208,7 +209,7 @@ impl IntentClassifier { ); match classification.intent_type { - IntentType::AppCreate => self.handle_app_create(classification, session).await, + IntentType::AppCreate => self.handle_app_create(classification, session, task_id).await, IntentType::Todo => self.handle_todo(classification, session), IntentType::Monitor => self.handle_monitor(classification, session), IntentType::Action => self.handle_action(classification, session).await, @@ -231,8 +232,9 @@ impl IntentClassifier { USER REQUEST: "{intent}" INTENT TYPES: -- APP_CREATE: Create a full application (CRM, inventory, booking system, etc.) - Keywords: "create app", "build system", "make application", "CRM", "management system" +- APP_CREATE: Create a full application, utility, calculator, tool, system, etc. + Keywords: "create", "build", "make", "calculator", "app", "system", "CRM", "tool" + IMPORTANT: If user wants to CREATE anything (app, calculator, converter, timer, etc), classify as APP_CREATE - TODO: Simple task or reminder Keywords: "call", "remind me", "don't forget", "tomorrow", "later" @@ -252,6 +254,9 @@ INTENT TYPES: - TOOL: Create a voice/chat command Keywords: "when I say", "create command", "shortcut for", "trigger" +YOLO MODE: NEVER ask for clarification. Always make a decision and proceed. +For APP_CREATE: Just build whatever makes sense. A "calculator" = basic calculator app. A "CRM" = customer management app. Be creative and decisive. + Respond with JSON only: {{ "intent_type": "APP_CREATE|TODO|MONITOR|ACTION|SCHEDULE|GOAL|TOOL|UNKNOWN", @@ -269,13 +274,17 @@ Respond with JSON only: "suggested_name": "short name for the resource", "requires_clarification": false, "clarification_question": null, - "alternatives": [ - {{"type": "OTHER_TYPE", "confidence": 0.3, "reason": "could also be..."}} - ] + "alternatives": [] }}"# ); + info!("[INTENT_CLASSIFIER] Starting LLM call for classification, prompt_len={} chars", prompt.len()); + let start = std::time::Instant::now(); + let response = self.call_llm(&prompt, bot_id).await?; + let elapsed = start.elapsed(); + info!("[INTENT_CLASSIFIER] LLM classification completed in {:?}, response_len={} chars", elapsed, response.len()); + trace!("LLM classification response: {}", &response[..response.len().min(500)]); Self::parse_classification_response(&response, intent) } @@ -321,8 +330,19 @@ Respond with JSON only: reason: String, } + // Clean response - remove markdown code blocks if present + let cleaned = response + .trim() + .trim_start_matches("```json") + .trim_start_matches("```JSON") + .trim_start_matches("```") + .trim_end_matches("```") + .trim(); + + trace!("Cleaned classification response: {}", &cleaned[..cleaned.len().min(300)]); + // Try to parse, fall back to heuristic classification - let parsed: Result = serde_json::from_str(response); + let parsed: Result = serde_json::from_str(cleaned); match parsed { Ok(resp) => { @@ -380,6 +400,7 @@ Respond with JSON only: } Err(e) => { warn!("Failed to parse LLM response, using heuristic: {e}"); + trace!("Raw response that failed to parse: {}", &response[..response.len().min(200)]); Self::classify_heuristic(original_intent) } } @@ -397,6 +418,31 @@ Respond with JSON only: || lower.contains("management system") || lower.contains("inventory") || lower.contains("booking") + || lower.contains("calculator") + || lower.contains("website") + || lower.contains("webpage") + || lower.contains("web page") + || lower.contains("landing page") + || lower.contains("dashboard") + || lower.contains("form") + || lower.contains("todo list") + || lower.contains("todo app") + || lower.contains("chat") + || lower.contains("blog") + || lower.contains("portfolio") + || lower.contains("store") + || lower.contains("shop") + || lower.contains("e-commerce") + || lower.contains("ecommerce") + || (lower.contains("create") && lower.contains("html")) + || (lower.contains("make") && lower.contains("html")) + || (lower.contains("build") && lower.contains("html")) + || (lower.contains("create a") && (lower.contains("simple") || lower.contains("basic"))) + || (lower.contains("make a") && (lower.contains("simple") || lower.contains("basic"))) + || (lower.contains("build a") && (lower.contains("simple") || lower.contains("basic"))) + || lower.contains("criar") + || lower.contains("fazer") + || lower.contains("construir") { (IntentType::AppCreate, 0.75) } else if lower.contains("remind") @@ -461,10 +507,15 @@ Respond with JSON only: &self, classification: &ClassifiedIntent, session: &UserSession, + task_id: Option, ) -> Result> { info!("Handling APP_CREATE intent"); - let app_generator = AppGenerator::new(self.state.clone()); + let mut app_generator = if let Some(tid) = task_id { + AppGenerator::with_task_id(self.state.clone(), tid) + } else { + AppGenerator::new(self.state.clone()) + }; match app_generator .generate_app(&classification.original_text, session) diff --git a/src/auto_task/mod.rs b/src/auto_task/mod.rs index 92b09a607..0f7207de2 100644 --- a/src/auto_task/mod.rs +++ b/src/auto_task/mod.rs @@ -33,6 +33,18 @@ pub use intent_compiler::{CompiledIntent, IntentCompiler}; pub use safety_layer::{AuditEntry, ConstraintCheckResult, SafetyLayer, SimulationResult}; use crate::core::urls::ApiUrls; +use crate::shared::state::AppState; +use axum::{ + extract::{ + ws::{Message, WebSocket, WebSocketUpgrade}, + Path, Query, State, + }, + response::IntoResponse, +}; +use futures::{SinkExt, StreamExt}; +use log::{debug, error, info, warn}; +use std::collections::HashMap; +use std::sync::Arc; pub fn configure_autotask_routes() -> axum::Router> { use axum::routing::{get, post}; @@ -102,6 +114,141 @@ pub fn configure_autotask_routes() -> axum::Router>, + Query(params): Query>, +) -> impl IntoResponse { + let task_filter = params.get("task_id").cloned(); + + info!( + "Task progress WebSocket connection request, filter: {:?}", + task_filter + ); + + ws.on_upgrade(move |socket| handle_task_progress_websocket(socket, state, task_filter)) +} + +pub async fn task_progress_by_id_websocket_handler( + ws: WebSocketUpgrade, + State(state): State>, + Path(task_id): Path, +) -> impl IntoResponse { + info!( + "Task progress WebSocket connection for task: {}", + task_id + ); + + ws.on_upgrade(move |socket| handle_task_progress_websocket(socket, state, Some(task_id))) +} + +async fn handle_task_progress_websocket( + socket: WebSocket, + state: Arc, + task_filter: Option, +) { + let (mut sender, mut receiver) = socket.split(); + + info!("Task progress WebSocket connected, filter: {:?}", task_filter); + + let welcome = serde_json::json!({ + "type": "connected", + "message": "Connected to task progress stream", + "filter": task_filter, + "timestamp": chrono::Utc::now().to_rfc3339() + }); + + if let Ok(welcome_str) = serde_json::to_string(&welcome) { + if sender.send(Message::Text(welcome_str)).await.is_err() { + error!("Failed to send welcome message to task progress WebSocket"); + return; + } + } + + let mut broadcast_rx = if let Some(broadcast_tx) = state.task_progress_broadcast.as_ref() { + broadcast_tx.subscribe() + } else { + warn!("No task progress broadcast channel available"); + return; + }; + + let task_filter_clone = task_filter.clone(); + let send_task = tokio::spawn(async move { + loop { + match broadcast_rx.recv().await { + Ok(event) => { + let should_send = task_filter_clone.is_none() + || task_filter_clone.as_ref() == Some(&event.task_id); + + if should_send { + if let Ok(json_str) = serde_json::to_string(&event) { + debug!( + "Sending task progress to WebSocket: {} - {}", + event.task_id, event.step + ); + if sender.send(Message::Text(json_str)).await.is_err() { + error!("Failed to send task progress to WebSocket"); + break; + } + } + } + } + Err(tokio::sync::broadcast::error::RecvError::Lagged(n)) => { + warn!("Task progress WebSocket lagged by {} messages", n); + } + Err(tokio::sync::broadcast::error::RecvError::Closed) => { + info!("Task progress broadcast channel closed"); + break; + } + } + } + }); + + let recv_task = tokio::spawn(async move { + while let Some(msg) = receiver.next().await { + match msg { + Ok(Message::Text(text)) => { + debug!("Received text from task progress WebSocket: {}", text); + if text == "ping" { + debug!("Received ping from task progress client"); + } + } + Ok(Message::Ping(data)) => { + debug!("Received ping from task progress WebSocket"); + drop(data); + } + Ok(Message::Pong(_)) => { + debug!("Received pong from task progress WebSocket"); + } + Ok(Message::Close(_)) => { + info!("Task progress WebSocket client disconnected"); + break; + } + Ok(Message::Binary(_)) => { + debug!("Received binary from task progress WebSocket (ignored)"); + } + Err(e) => { + error!("Task progress WebSocket error: {}", e); + break; + } + } + } + }); + + tokio::select! { + _ = send_task => { + info!("Task progress send task completed"); + } + _ = recv_task => { + info!("Task progress receive task completed"); + } + } + + info!("Task progress WebSocket connection closed, filter: {:?}", task_filter); } async fn handle_client_logs( diff --git a/src/console/status_panel.rs b/src/console/status_panel.rs index 0ea65f37d..5d4ec7dd7 100644 --- a/src/console/status_panel.rs +++ b/src/console/status_panel.rs @@ -9,11 +9,101 @@ use diesel::prelude::*; use std::sync::Arc; use sysinfo::System; +/// Cache for component status to avoid spawning pgrep processes too frequently +struct ComponentStatusCache { + statuses: Vec<(String, bool, String)>, // (name, is_running, port) + last_check: std::time::Instant, +} + +impl ComponentStatusCache { + fn new() -> Self { + Self { + statuses: Vec::new(), + last_check: std::time::Instant::now() - std::time::Duration::from_secs(60), + } + } + + fn needs_refresh(&self) -> bool { + // Only check component status every 10 seconds + self.last_check.elapsed() > std::time::Duration::from_secs(10) + } + + fn refresh(&mut self) { + let components = vec![ + ("Tables", "postgres", "5432"), + ("Cache", "valkey-server", "6379"), + ("Drive", "minio", "9000"), + ("LLM", "llama-server", "8081"), + ]; + + self.statuses.clear(); + for (comp_name, process, port) in components { + let is_running = Self::check_component_running(process); + self.statuses + .push((comp_name.to_string(), is_running, port.to_string())); + } + self.last_check = std::time::Instant::now(); + } + + fn check_component_running(process_name: &str) -> bool { + std::process::Command::new("pgrep") + .arg("-f") + .arg(process_name) + .output() + .map(|output| !output.stdout.is_empty()) + .unwrap_or(false) + } + + fn get_statuses(&self) -> &[(String, bool, String)] { + &self.statuses + } +} + +/// Cache for bot list to avoid database queries too frequently +struct BotListCache { + bot_list: Vec<(String, uuid::Uuid)>, + last_check: std::time::Instant, +} + +impl BotListCache { + fn new() -> Self { + Self { + bot_list: Vec::new(), + last_check: std::time::Instant::now() - std::time::Duration::from_secs(60), + } + } + + fn needs_refresh(&self) -> bool { + // Only query database every 5 seconds + self.last_check.elapsed() > std::time::Duration::from_secs(5) + } + + fn refresh(&mut self, app_state: &Arc) { + if let Ok(mut conn) = app_state.conn.get() { + if let Ok(list) = bots + .filter(is_active.eq(true)) + .select((name, id)) + .load::<(String, uuid::Uuid)>(&mut *conn) + { + self.bot_list = list; + } + } + self.last_check = std::time::Instant::now(); + } + + fn get_bots(&self) -> &[(String, uuid::Uuid)] { + &self.bot_list + } +} + pub struct StatusPanel { app_state: Arc, last_update: std::time::Instant, + last_system_refresh: std::time::Instant, cached_content: String, system: System, + component_cache: ComponentStatusCache, + bot_cache: BotListCache, } impl std::fmt::Debug for StatusPanel { @@ -29,24 +119,42 @@ impl std::fmt::Debug for StatusPanel { impl StatusPanel { pub fn new(app_state: Arc) -> Self { + // Only initialize with CPU and memory info, not all system info + let mut system = System::new(); + system.refresh_cpu_all(); + system.refresh_memory(); + Self { app_state, last_update: std::time::Instant::now(), + last_system_refresh: std::time::Instant::now(), cached_content: String::new(), - system: System::new_all(), + system, + component_cache: ComponentStatusCache::new(), + bot_cache: BotListCache::new(), } } pub fn update(&mut self) -> Result<(), std::io::Error> { - self.system.refresh_all(); + // Only refresh system metrics every 2 seconds instead of every call + // This is the main CPU hog - refresh_all() is very expensive + if self.last_system_refresh.elapsed() > std::time::Duration::from_secs(2) { + // Only refresh CPU and memory, not ALL system info + self.system.refresh_cpu_all(); + self.system.refresh_memory(); + self.last_system_refresh = std::time::Instant::now(); + } + + // Refresh component status cache if needed (every 10 seconds) + if self.component_cache.needs_refresh() { + self.component_cache.refresh(); + } + + // Refresh bot list cache if needed (every 5 seconds) + if self.bot_cache.needs_refresh() { + self.bot_cache.refresh(&self.app_state); + } - let _tokens = (std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .expect("system time after UNIX epoch") - .as_secs() - % 1000) as usize; - #[cfg(feature = "nvidia")] - let _system_metrics = nvidia::get_system_metrics().unwrap_or_default(); self.cached_content = self.render(None); self.last_update = std::time::Instant::now(); Ok(()) @@ -60,10 +168,11 @@ impl StatusPanel { String::new(), ]; - self.system.refresh_cpu_all(); + // Use cached CPU usage - don't refresh here let cpu_usage = self.system.global_cpu_usage(); let cpu_bar = Self::create_progress_bar(cpu_usage, 20); lines.push(format!(" CPU: {:5.1}% {}", cpu_usage, cpu_bar)); + #[cfg(feature = "nvidia")] { let system_metrics = get_system_metrics().unwrap_or_default(); @@ -81,7 +190,11 @@ impl StatusPanel { let total_mem = self.system.total_memory() as f32 / 1024.0 / 1024.0 / 1024.0; let used_mem = self.system.used_memory() as f32 / 1024.0 / 1024.0 / 1024.0; - let mem_percentage = (used_mem / total_mem) * 100.0; + let mem_percentage = if total_mem > 0.0 { + (used_mem / total_mem) * 100.0 + } else { + 0.0 + }; let mem_bar = Self::create_progress_bar(mem_percentage, 20); lines.push(format!( " MEM: {:5.1}% {} ({:.1}/{:.1} GB)", @@ -94,15 +207,9 @@ impl StatusPanel { lines.push("╚═══════════════════════════════════════╝".to_string()); lines.push("".to_string()); - let components = vec![ - ("Tables", "postgres", "5432"), - ("Cache", "valkey-server", "6379"), - ("Drive", "minio", "9000"), - ("LLM", "llama-server", "8081"), - ]; - - for (comp_name, process, port) in components { - let status = if Self::check_component_running(process) { + // Use cached component statuses instead of spawning pgrep every time + for (comp_name, is_running, port) in self.component_cache.get_statuses() { + let status = if *is_running { format!(" ONLINE [Port: {}]", port) } else { " OFFLINE".to_string() @@ -116,58 +223,44 @@ impl StatusPanel { lines.push("╚═══════════════════════════════════════╝".to_string()); lines.push("".to_string()); - if let Ok(mut conn) = self.app_state.conn.get() { - match bots - .filter(is_active.eq(true)) - .select((name, id)) - .load::<(String, uuid::Uuid)>(&mut *conn) - { - Ok(bot_list) => { - if bot_list.is_empty() { - lines.push(" No active bots".to_string()); + // Use cached bot list instead of querying database every time + let bot_list = self.bot_cache.get_bots(); + if bot_list.is_empty() { + lines.push(" No active bots".to_string()); + } else { + for (bot_name, bot_id) in bot_list { + let marker = if let Some(ref selected) = selected_bot { + if selected == bot_name { + "►" } else { - for (bot_name, bot_id) in bot_list { - let marker = if let Some(ref selected) = selected_bot { - if selected == &bot_name { - "►" - } else { - " " - } - } else { - " " - }; - lines.push(format!(" {} {}", marker, bot_name)); + " " + } + } else { + " " + }; + lines.push(format!(" {} {}", marker, bot_name)); - if let Some(ref selected) = selected_bot { - if selected == &bot_name { - lines.push("".to_string()); - lines.push(" ┌─ Bot Configuration ─────────┐".to_string()); - let config_manager = - ConfigManager::new(self.app_state.conn.clone()); - let llm_model = config_manager - .get_config(&bot_id, "llm-model", None) - .unwrap_or_else(|_| "N/A".to_string()); - lines.push(format!(" Model: {}", llm_model)); - let ctx_size = config_manager - .get_config(&bot_id, "llm-server-ctx-size", None) - .unwrap_or_else(|_| "N/A".to_string()); - lines.push(format!(" Context: {}", ctx_size)); - let temp = config_manager - .get_config(&bot_id, "llm-temperature", None) - .unwrap_or_else(|_| "N/A".to_string()); - lines.push(format!(" Temp: {}", temp)); - lines.push(" └─────────────────────────────┘".to_string()); - } - } - } + if let Some(ref selected) = selected_bot { + if selected == bot_name { + lines.push("".to_string()); + lines.push(" ┌─ Bot Configuration ─────────┐".to_string()); + let config_manager = ConfigManager::new(self.app_state.conn.clone()); + let llm_model = config_manager + .get_config(bot_id, "llm-model", None) + .unwrap_or_else(|_| "N/A".to_string()); + lines.push(format!(" Model: {}", llm_model)); + let ctx_size = config_manager + .get_config(bot_id, "llm-server-ctx-size", None) + .unwrap_or_else(|_| "N/A".to_string()); + lines.push(format!(" Context: {}", ctx_size)); + let temp = config_manager + .get_config(bot_id, "llm-temperature", None) + .unwrap_or_else(|_| "N/A".to_string()); + lines.push(format!(" Temp: {}", temp)); + lines.push(" └─────────────────────────────┘".to_string()); } } - Err(_) => { - lines.push(" Error loading bots".to_string()); - } } - } else { - lines.push(" Database locked".to_string()); } lines.push("".to_string()); @@ -195,11 +288,6 @@ impl StatusPanel { } pub fn check_component_running(process_name: &str) -> bool { - std::process::Command::new("pgrep") - .arg("-f") - .arg(process_name) - .output() - .map(|output| !output.stdout.is_empty()) - .unwrap_or(false) + ComponentStatusCache::check_component_running(process_name) } } diff --git a/src/core/automation/mod.rs b/src/core/automation/mod.rs index c4426f08f..2ca0bd09d 100644 --- a/src/core/automation/mod.rs +++ b/src/core/automation/mod.rs @@ -19,11 +19,12 @@ pub struct AutomationService { impl AutomationService { #[must_use] pub fn new(state: Arc) -> Self { - crate::llm::episodic_memory::start_episodic_memory_scheduler(Arc::clone(&state)); + // Temporarily disabled to debug CPU spike + // crate::llm::episodic_memory::start_episodic_memory_scheduler(Arc::clone(&state)); Self { state } } pub async fn spawn(self) -> Result<(), Box> { - let mut ticker = interval(Duration::from_secs(5)); + let mut ticker = interval(Duration::from_secs(60)); loop { ticker.tick().await; if let Err(e) = self.check_scheduled_tasks().await { @@ -31,7 +32,7 @@ impl AutomationService { } } } - async fn check_scheduled_tasks(&self) -> Result<(), Box> { + pub async fn check_scheduled_tasks(&self) -> Result<(), Box> { use crate::shared::models::system_automations::dsl::{ id, is_active, kind, last_triggered as lt_column, system_automations, }; diff --git a/src/core/bootstrap/mod.rs b/src/core/bootstrap/mod.rs index 7bd6bf9e3..9267a5e0e 100644 --- a/src/core/bootstrap/mod.rs +++ b/src/core/bootstrap/mod.rs @@ -185,7 +185,7 @@ impl BootstrapManager { if pm.is_installed("vault") { let vault_already_running = Command::new("sh") .arg("-c") - .arg("curl -f -s 'http://localhost:8200/v1/sys/health?standbyok=true&uninitcode=200&sealedcode=200' >/dev/null 2>&1") + .arg("curl -f -sk 'https://localhost:8200/v1/sys/health?standbyok=true&uninitcode=200&sealedcode=200' >/dev/null 2>&1") .stdout(std::process::Stdio::null()) .stderr(std::process::Stdio::null()) .status() @@ -208,7 +208,7 @@ impl BootstrapManager { for i in 0..10 { let vault_ready = Command::new("sh") .arg("-c") - .arg("curl -f -s 'http://localhost:8200/v1/sys/health?standbyok=true&uninitcode=200&sealedcode=200' >/dev/null 2>&1") + .arg("curl -f -sk 'https://localhost:8200/v1/sys/health?standbyok=true&uninitcode=200&sealedcode=200' >/dev/null 2>&1") .stdout(std::process::Stdio::null()) .stderr(std::process::Stdio::null()) .status() @@ -285,8 +285,28 @@ impl BootstrapManager { info!("Starting PostgreSQL database..."); match pm.start("tables") { Ok(_child) => { - tokio::time::sleep(tokio::time::Duration::from_secs(2)).await; - info!("PostgreSQL started"); + let pg_isready = self.stack_dir("bin/tables/bin/pg_isready"); + let mut ready = false; + for attempt in 1..=30 { + tokio::time::sleep(tokio::time::Duration::from_secs(1)).await; + let status = std::process::Command::new(&pg_isready) + .args(["-h", "localhost", "-p", "5432", "-U", "gbuser"]) + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::null()) + .status(); + if status.map(|s| s.success()).unwrap_or(false) { + ready = true; + info!("PostgreSQL started and ready (attempt {})", attempt); + break; + } + if attempt % 5 == 0 { + info!("Waiting for PostgreSQL to be ready... (attempt {}/30)", attempt); + } + } + if !ready { + error!("PostgreSQL failed to become ready after 30 seconds"); + return Err(anyhow::anyhow!("PostgreSQL failed to start properly")); + } } Err(e) => { warn!("PostgreSQL might already be running: {}", e); @@ -321,7 +341,7 @@ impl BootstrapManager { for i in 0..15 { let drive_ready = Command::new("sh") .arg("-c") - .arg("curl -sfk 'https://127.0.0.1:9000/minio/health/live' >/dev/null 2>&1") + .arg("curl -sf --cacert ./botserver-stack/conf/drive/certs/CAs/ca.crt 'https://127.0.0.1:9000/minio/health/live' >/dev/null 2>&1") .stdout(std::process::Stdio::null()) .stderr(std::process::Stdio::null()) .status() @@ -393,7 +413,7 @@ impl BootstrapManager { if installer.is_installed("vault") { let vault_running = Command::new("sh") .arg("-c") - .arg("curl -f -s 'http://localhost:8200/v1/sys/health?standbyok=true&uninitcode=200&sealedcode=200' >/dev/null 2>&1") + .arg("curl -f -sk 'https://localhost:8200/v1/sys/health?standbyok=true&uninitcode=200&sealedcode=200' >/dev/null 2>&1") .stdout(std::process::Stdio::null()) .stderr(std::process::Stdio::null()) .status() @@ -539,7 +559,7 @@ impl BootstrapManager { async fn ensure_vault_unsealed(&self) -> Result<()> { let vault_init_path = self.stack_dir("conf/vault/init.json"); - let vault_addr = "http://localhost:8200"; + let vault_addr = "https://localhost:8200"; if !vault_init_path.exists() { return Err(anyhow::anyhow!( @@ -685,7 +705,7 @@ impl BootstrapManager { std::env::set_var("VAULT_ADDR", vault_addr); std::env::set_var("VAULT_TOKEN", &root_token); - std::env::set_var("VAULT_SKIP_VERIFY", "true"); + std::env::set_var("VAULT_CACERT", "./botserver-stack/conf/system/certificates/ca/ca.crt"); std::env::set_var( "VAULT_CACERT", @@ -1398,7 +1418,7 @@ meet IN A 127.0.0.1 } let health_check = std::process::Command::new("curl") - .args(["-f", "-s", "http://localhost:8200/v1/sys/health?standbyok=true&uninitcode=200&sealedcode=200"]) + .args(["-f", "-sk", "https://localhost:8200/v1/sys/health?standbyok=true&uninitcode=200&sealedcode=200"]) .output(); if let Ok(output) = health_check { @@ -1446,9 +1466,10 @@ meet IN A 127.0.0.1 )); } - let vault_addr = "http://localhost:8200"; + let vault_addr = "https://localhost:8200"; + let ca_cert_path = "./botserver-stack/conf/system/certificates/ca/ca.crt"; std::env::set_var("VAULT_ADDR", vault_addr); - std::env::set_var("VAULT_SKIP_VERIFY", "true"); + std::env::set_var("VAULT_CACERT", ca_cert_path); let (unseal_key, root_token) = if vault_init_path.exists() { info!("Reading Vault initialization from init.json..."); @@ -1485,8 +1506,8 @@ meet IN A 127.0.0.1 let init_output = std::process::Command::new("sh") .arg("-c") .arg(format!( - "unset VAULT_CLIENT_CERT VAULT_CLIENT_KEY VAULT_CACERT; VAULT_ADDR={} {} operator init -key-shares=1 -key-threshold=1 -format=json", - vault_addr, vault_bin + "VAULT_ADDR={} VAULT_CACERT={} {} operator init -key-shares=1 -key-threshold=1 -format=json", + vault_addr, ca_cert_path, vault_bin )) .output()?; @@ -1501,8 +1522,8 @@ meet IN A 127.0.0.1 let status_check = std::process::Command::new("sh") .arg("-c") .arg(format!( - "unset VAULT_CLIENT_CERT VAULT_CLIENT_KEY VAULT_CACERT; VAULT_ADDR={} {} status -format=json 2>/dev/null", - vault_addr, vault_bin + "VAULT_ADDR={} VAULT_CACERT={} {} status -format=json 2>/dev/null", + vault_addr, ca_cert_path, vault_bin )) .output(); @@ -1568,8 +1589,8 @@ meet IN A 127.0.0.1 let unseal_output = std::process::Command::new("sh") .arg("-c") .arg(format!( - "unset VAULT_CLIENT_CERT VAULT_CLIENT_KEY VAULT_CACERT; VAULT_ADDR={} {} operator unseal {}", - vault_addr, vault_bin, unseal_key + "VAULT_ADDR={} VAULT_CACERT={} {} operator unseal {}", + vault_addr, ca_cert_path, vault_bin, unseal_key )) .output()?; @@ -1598,9 +1619,7 @@ meet IN A 127.0.0.1 # Vault Configuration - THESE ARE THE ONLY ALLOWED ENV VARS VAULT_ADDR={} VAULT_TOKEN={} - -# Vault uses HTTP for local development (TLS disabled in config.hcl) -# In production, enable TLS and set VAULT_CACERT, VAULT_CLIENT_CERT, VAULT_CLIENT_KEY +VAULT_CACERT=./botserver-stack/conf/system/certificates/ca/ca.crt # Cache TTL for secrets (seconds) VAULT_CACHE_TTL=300 @@ -1617,23 +1636,25 @@ VAULT_CACHE_TTL=300 } info!("Enabling KV secrets engine..."); + let ca_cert_path = "./botserver-stack/conf/system/certificates/ca/ca.crt"; let _ = std::process::Command::new("sh") .arg("-c") .arg(format!( - "unset VAULT_CLIENT_CERT VAULT_CLIENT_KEY VAULT_CACERT; VAULT_ADDR={} VAULT_TOKEN={} {} secrets enable -path=secret kv-v2 2>&1 || true", - vault_addr, root_token, vault_bin + "VAULT_ADDR={} VAULT_TOKEN={} VAULT_CACERT={} {} secrets enable -path=secret kv-v2 2>&1 || true", + vault_addr, root_token, ca_cert_path, vault_bin )) .output(); info!("Storing secrets in Vault (only if not existing)..."); let vault_bin_clone = vault_bin.clone(); + let ca_cert_clone = ca_cert_path.to_string(); let secret_exists = |path: &str| -> bool { let output = std::process::Command::new("sh") .arg("-c") .arg(format!( - "unset VAULT_CLIENT_CERT VAULT_CLIENT_KEY VAULT_CACERT; VAULT_ADDR={} VAULT_TOKEN={} {} kv get {} 2>/dev/null", - vault_addr, root_token, vault_bin_clone, path + "VAULT_ADDR={} VAULT_TOKEN={} VAULT_CACERT={} {} kv get {} 2>/dev/null", + vault_addr, root_token, ca_cert_clone, vault_bin_clone, path )) .output(); output.map(|o| o.status.success()).unwrap_or(false) @@ -1645,8 +1666,8 @@ VAULT_CACHE_TTL=300 let _ = std::process::Command::new("sh") .arg("-c") .arg(format!( - "unset VAULT_CLIENT_CERT VAULT_CLIENT_KEY VAULT_CACERT; VAULT_ADDR={} VAULT_TOKEN={} {} kv put secret/gbo/tables host=localhost port=5432 database=botserver username=gbuser password='{}'", - vault_addr, root_token, vault_bin, db_password + "VAULT_ADDR={} VAULT_TOKEN={} VAULT_CACERT={} {} kv put secret/gbo/tables host=localhost port=5432 database=botserver username=gbuser password='{}'", + vault_addr, root_token, ca_cert_path, vault_bin, db_password )) .output()?; info!(" Stored database credentials"); @@ -1658,8 +1679,8 @@ VAULT_CACHE_TTL=300 let _ = std::process::Command::new("sh") .arg("-c") .arg(format!( - "unset VAULT_CLIENT_CERT VAULT_CLIENT_KEY VAULT_CACERT; VAULT_ADDR={} VAULT_TOKEN={} {} kv put secret/gbo/drive accesskey='{}' secret='{}'", - vault_addr, root_token, vault_bin, drive_accesskey, drive_secret + "VAULT_ADDR={} VAULT_TOKEN={} VAULT_CACERT={} {} kv put secret/gbo/drive accesskey='{}' secret='{}'", + vault_addr, root_token, ca_cert_path, vault_bin, drive_accesskey, drive_secret )) .output()?; info!(" Stored drive credentials"); @@ -1671,8 +1692,8 @@ VAULT_CACHE_TTL=300 let _ = std::process::Command::new("sh") .arg("-c") .arg(format!( - "unset VAULT_CLIENT_CERT VAULT_CLIENT_KEY VAULT_CACERT; VAULT_ADDR={} VAULT_TOKEN={} {} kv put secret/gbo/cache password='{}'", - vault_addr, root_token, vault_bin, cache_password + "VAULT_ADDR={} VAULT_TOKEN={} VAULT_CACERT={} {} kv put secret/gbo/cache password='{}'", + vault_addr, root_token, ca_cert_path, vault_bin, cache_password )) .output()?; info!(" Stored cache credentials"); @@ -1690,8 +1711,8 @@ VAULT_CACHE_TTL=300 let _ = std::process::Command::new("sh") .arg("-c") .arg(format!( - "unset VAULT_CLIENT_CERT VAULT_CLIENT_KEY VAULT_CACERT; VAULT_ADDR={} VAULT_TOKEN={} {} kv put secret/gbo/directory url=https://localhost:8300 project_id= client_id= client_secret= masterkey={}", - vault_addr, root_token, vault_bin, masterkey + "VAULT_ADDR={} VAULT_TOKEN={} VAULT_CACERT={} {} kv put secret/gbo/directory url=https://localhost:8300 project_id= client_id= client_secret= masterkey={}", + vault_addr, root_token, ca_cert_path, vault_bin, masterkey )) .output()?; info!(" Created directory placeholder with masterkey"); @@ -1703,8 +1724,8 @@ VAULT_CACHE_TTL=300 let _ = std::process::Command::new("sh") .arg("-c") .arg(format!( - "unset VAULT_CLIENT_CERT VAULT_CLIENT_KEY VAULT_CACERT; VAULT_ADDR={} VAULT_TOKEN={} {} kv put secret/gbo/llm openai_key= anthropic_key= groq_key=", - vault_addr, root_token, vault_bin + "VAULT_ADDR={} VAULT_TOKEN={} VAULT_CACERT={} {} kv put secret/gbo/llm openai_key= anthropic_key= groq_key=", + vault_addr, root_token, ca_cert_path, vault_bin )) .output()?; info!(" Created LLM placeholder"); @@ -1716,8 +1737,8 @@ VAULT_CACHE_TTL=300 let _ = std::process::Command::new("sh") .arg("-c") .arg(format!( - "unset VAULT_CLIENT_CERT VAULT_CLIENT_KEY VAULT_CACERT; VAULT_ADDR={} VAULT_TOKEN={} {} kv put secret/gbo/email username= password=", - vault_addr, root_token, vault_bin + "VAULT_ADDR={} VAULT_TOKEN={} VAULT_CACERT={} {} kv put secret/gbo/email username= password=", + vault_addr, root_token, ca_cert_path, vault_bin )) .output()?; info!(" Created email placeholder"); @@ -1730,8 +1751,8 @@ VAULT_CACHE_TTL=300 let _ = std::process::Command::new("sh") .arg("-c") .arg(format!( - "unset VAULT_CLIENT_CERT VAULT_CLIENT_KEY VAULT_CACERT; VAULT_ADDR={} VAULT_TOKEN={} {} kv put secret/gbo/encryption master_key='{}'", - vault_addr, root_token, vault_bin, encryption_key + "VAULT_ADDR={} VAULT_TOKEN={} VAULT_CACERT={} {} kv put secret/gbo/encryption master_key='{}'", + vault_addr, root_token, ca_cert_path, vault_bin, encryption_key )) .output()?; info!(" Generated and stored encryption key"); @@ -1779,6 +1800,8 @@ VAULT_CACHE_TTL=300 format!("{}/", config.drive.server) }; + info!("[S3_CLIENT] Creating S3 client with endpoint: {}", endpoint); + let (access_key, secret_key) = if config.drive.access_key.is_empty() || config.drive.secret_key.is_empty() { match crate::shared::utils::get_secrets_manager().await { @@ -1806,19 +1829,30 @@ VAULT_CACHE_TTL=300 ) }; - // Set CA cert for self-signed TLS (dev stack) let ca_cert_path = "./botserver-stack/conf/system/certificates/ca/ca.crt"; if std::path::Path::new(ca_cert_path).exists() { std::env::set_var("AWS_CA_BUNDLE", ca_cert_path); std::env::set_var("SSL_CERT_FILE", ca_cert_path); } + let timeout_config = aws_config::timeout::TimeoutConfig::builder() + .connect_timeout(std::time::Duration::from_secs(5)) + .read_timeout(std::time::Duration::from_secs(30)) + .operation_timeout(std::time::Duration::from_secs(30)) + .operation_attempt_timeout(std::time::Duration::from_secs(15)) + .build(); + + let retry_config = aws_config::retry::RetryConfig::standard() + .with_max_attempts(2); + let base_config = aws_config::defaults(BehaviorVersion::latest()) .endpoint_url(endpoint) .region("auto") .credentials_provider(aws_sdk_s3::config::Credentials::new( access_key, secret_key, None, None, "static", )) + .timeout_config(timeout_config) + .retry_config(retry_config) .load() .await; let s3_config = aws_sdk_s3::config::Builder::from(&base_config) @@ -1866,14 +1900,16 @@ VAULT_CACHE_TTL=300 { let bot_name = path.file_name().map(|n| n.to_string_lossy().to_string()).unwrap_or_default(); let bucket = bot_name.trim_start_matches('/').to_string(); - // Create bucket if it doesn't exist - if client.head_bucket().bucket(&bucket).send().await.is_err() { - if let Err(e) = client.create_bucket().bucket(&bucket).send().await { - warn!("S3/MinIO not available, skipping bucket {}: {}", bucket, e); - continue; - } + let bucket_exists = client.head_bucket().bucket(&bucket).send().await.is_ok(); + if bucket_exists { + info!("Bucket {} already exists, skipping template upload (user content preserved)", bucket); + continue; } - // Always sync templates to bucket + if let Err(e) = client.create_bucket().bucket(&bucket).send().await { + warn!("S3/MinIO not available, skipping bucket {}: {:?}", bucket, e); + continue; + } + info!("Created new bucket {}, uploading templates...", bucket); if let Err(e) = Self::upload_directory_recursive(&client, &path, &bucket, "/").await { warn!("Failed to upload templates to bucket {}: {}", bucket, e); } @@ -2091,16 +2127,18 @@ storage "file" { path = "../../data/vault" } -# Listener with TLS DISABLED for local development -# In production, enable TLS with proper certificates +# Listener with TLS enabled listener "tcp" { - address = "0.0.0.0:8200" - tls_disable = true + address = "0.0.0.0:8200" + tls_disable = false + tls_cert_file = "../../conf/system/certificates/vault/server.crt" + tls_key_file = "../../conf/system/certificates/vault/server.key" + tls_client_ca_file = "../../conf/system/certificates/ca/ca.crt" } -# API settings - use HTTP for local dev -api_addr = "http://localhost:8200" -cluster_addr = "http://localhost:8201" +# API settings - use HTTPS +api_addr = "https://localhost:8200" +cluster_addr = "https://localhost:8201" # UI enabled for administration ui = true @@ -2122,7 +2160,7 @@ log_level = "info" fs::create_dir_all(self.stack_dir("data/vault"))?; info!( - "Created Vault config with mTLS at {}", + "Created Vault config with TLS at {}", config_path.display() ); Ok(()) @@ -2293,9 +2331,15 @@ log_level = "info" params.distinguished_name = dn; for san in sans { - params - .subject_alt_names - .push(rcgen::SanType::DnsName(san.to_string().try_into()?)); + if let Ok(ip) = san.parse::() { + params + .subject_alt_names + .push(rcgen::SanType::IpAddress(ip)); + } else { + params + .subject_alt_names + .push(rcgen::SanType::DnsName(san.to_string().try_into()?)); + } } let key_pair = KeyPair::generate()?; @@ -2311,7 +2355,27 @@ log_level = "info" fs::create_dir_all(&minio_certs_dir)?; let drive_cert_dir = cert_dir.join("drive"); fs::copy(drive_cert_dir.join("server.crt"), minio_certs_dir.join("public.crt"))?; - fs::copy(drive_cert_dir.join("server.key"), minio_certs_dir.join("private.key"))?; + + let drive_key_src = drive_cert_dir.join("server.key"); + let drive_key_dst = minio_certs_dir.join("private.key"); + + let conversion_result = std::process::Command::new("openssl") + .args(["ec", "-in"]) + .arg(&drive_key_src) + .args(["-out"]) + .arg(&drive_key_dst) + .output(); + + match conversion_result { + Ok(output) if output.status.success() => { + debug!("Converted drive private key to SEC1 format for MinIO"); + } + _ => { + warn!("Could not convert drive key to SEC1 format (openssl not available?), copying as-is"); + fs::copy(&drive_key_src, &drive_key_dst)?; + } + } + let minio_ca_dir = minio_certs_dir.join("CAs"); fs::create_dir_all(&minio_ca_dir)?; fs::copy(&ca_cert_path, minio_ca_dir.join("ca.crt"))?; diff --git a/src/core/kb/document_processor.rs b/src/core/kb/document_processor.rs index 8158e6611..db13e4ca3 100644 --- a/src/core/kb/document_processor.rs +++ b/src/core/kb/document_processor.rs @@ -1,5 +1,5 @@ use anyhow::Result; -use log::{error, info, warn}; +use log::{debug, error, info, warn}; use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::path::Path; @@ -118,6 +118,14 @@ impl DocumentProcessor { let metadata = tokio::fs::metadata(file_path).await?; let file_size = metadata.len() as usize; + if file_size == 0 { + debug!( + "Skipping empty file (0 bytes): {}", + file_path.display() + ); + return Ok(Vec::new()); + } + let format = DocumentFormat::from_extension(file_path) .ok_or_else(|| anyhow::anyhow!("Unsupported file format: {}", file_path.display()))?; diff --git a/src/core/kb/embedding_generator.rs b/src/core/kb/embedding_generator.rs index dc060c53d..2dacc9986 100644 --- a/src/core/kb/embedding_generator.rs +++ b/src/core/kb/embedding_generator.rs @@ -1,24 +1,37 @@ use anyhow::{Context, Result}; -use log::{debug, info, warn}; +use log::{info, trace, warn}; use reqwest::Client; use serde::{Deserialize, Serialize}; - +use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; +use std::time::Duration; use tokio::sync::Semaphore; +use crate::core::shared::memory_monitor::{log_jemalloc_stats, MemoryStats}; use super::document_processor::TextChunk; +static EMBEDDING_SERVER_READY: AtomicBool = AtomicBool::new(false); + +pub fn is_embedding_server_ready() -> bool { + EMBEDDING_SERVER_READY.load(Ordering::SeqCst) +} + +pub fn set_embedding_server_ready(ready: bool) { + EMBEDDING_SERVER_READY.store(ready, Ordering::SeqCst); + if ready { + info!("[EMBEDDING] Embedding server marked as ready"); + } +} + #[derive(Debug, Clone)] pub struct EmbeddingConfig { pub embedding_url: String, - pub embedding_model: String, - pub dimensions: usize, - pub batch_size: usize, - pub timeout_seconds: u64, + pub max_concurrent_requests: usize, + pub connect_timeout_seconds: u64, } impl Default for EmbeddingConfig { @@ -27,8 +40,10 @@ impl Default for EmbeddingConfig { embedding_url: "http://localhost:8082".to_string(), embedding_model: "bge-small-en-v1.5".to_string(), dimensions: 384, - batch_size: 32, - timeout_seconds: 30, + batch_size: 16, + timeout_seconds: 60, + max_concurrent_requests: 2, + connect_timeout_seconds: 10, } } } @@ -36,17 +51,17 @@ impl Default for EmbeddingConfig { impl EmbeddingConfig { pub fn from_env() -> Self { let embedding_url = "http://localhost:8082".to_string(); - let embedding_model = "bge-small-en-v1.5".to_string(); - let dimensions = Self::detect_dimensions(&embedding_model); Self { embedding_url, embedding_model, dimensions, - batch_size: 32, - timeout_seconds: 30, + batch_size: 16, + timeout_seconds: 60, + max_concurrent_requests: 2, + connect_timeout_seconds: 10, } } @@ -79,12 +94,17 @@ struct EmbeddingResponse { #[derive(Debug, Deserialize)] struct EmbeddingData { embedding: Vec, - _index: usize, + #[serde(default)] + #[allow(dead_code)] + index: usize, } #[derive(Debug, Deserialize)] struct EmbeddingUsage { - _prompt_tokens: usize, + #[serde(default)] + #[allow(dead_code)] + prompt_tokens: usize, + #[serde(default)] total_tokens: usize, } @@ -115,14 +135,19 @@ impl std::fmt::Debug for KbEmbeddingGenerator { impl KbEmbeddingGenerator { pub fn new(config: EmbeddingConfig) -> Self { let client = Client::builder() - .timeout(std::time::Duration::from_secs(config.timeout_seconds)) + .timeout(Duration::from_secs(config.timeout_seconds)) + .connect_timeout(Duration::from_secs(config.connect_timeout_seconds)) + .pool_max_idle_per_host(2) + .pool_idle_timeout(Duration::from_secs(30)) + .tcp_keepalive(Duration::from_secs(60)) + .tcp_nodelay(true) .build() .unwrap_or_else(|e| { - log::warn!("Failed to create HTTP client with timeout: {}, using default", e); + warn!("Failed to create HTTP client with timeout: {}, using default", e); Client::new() }); - let semaphore = Arc::new(Semaphore::new(4)); + let semaphore = Arc::new(Semaphore::new(config.max_concurrent_requests)); Self { config, @@ -131,6 +156,65 @@ impl KbEmbeddingGenerator { } } + pub async fn check_health(&self) -> bool { + let health_url = format!("{}/health", self.config.embedding_url); + + match tokio::time::timeout( + Duration::from_secs(5), + self.client.get(&health_url).send() + ).await { + Ok(Ok(response)) => { + let is_healthy = response.status().is_success(); + if is_healthy { + set_embedding_server_ready(true); + } + is_healthy + } + Ok(Err(e)) => { + let alt_url = &self.config.embedding_url; + match tokio::time::timeout( + Duration::from_secs(5), + self.client.get(alt_url).send() + ).await { + Ok(Ok(response)) => { + let is_healthy = response.status().is_success(); + if is_healthy { + set_embedding_server_ready(true); + } + is_healthy + } + _ => { + warn!("[EMBEDDING] Health check failed: {}", e); + false + } + } + } + Err(_) => { + warn!("[EMBEDDING] Health check timed out"); + false + } + } + } + + pub async fn wait_for_server(&self, max_wait_secs: u64) -> bool { + let start = std::time::Instant::now(); + let max_wait = Duration::from_secs(max_wait_secs); + + info!("[EMBEDDING] Waiting for embedding server at {} (max {}s)...", + self.config.embedding_url, max_wait_secs); + + while start.elapsed() < max_wait { + if self.check_health().await { + info!("[EMBEDDING] Embedding server is ready after {:?}", start.elapsed()); + return true; + } + tokio::time::sleep(Duration::from_secs(2)).await; + } + + warn!("[EMBEDDING] Embedding server not available after {}s", max_wait_secs); + false + } + pub async fn generate_embeddings( &self, chunks: &[TextChunk], @@ -139,35 +223,105 @@ impl KbEmbeddingGenerator { return Ok(Vec::new()); } - info!("Generating embeddings for {} chunks", chunks.len()); + if !is_embedding_server_ready() { + info!("[EMBEDDING] Server not marked ready, checking health..."); + if !self.wait_for_server(30).await { + return Err(anyhow::anyhow!( + "Embedding server not available at {}. Skipping embedding generation.", + self.config.embedding_url + )); + } + } - let mut results = Vec::new(); + let start_mem = MemoryStats::current(); + trace!("[EMBEDDING] Generating embeddings for {} chunks, RSS={}", + chunks.len(), MemoryStats::format_bytes(start_mem.rss_bytes)); - for batch in chunks.chunks(self.config.batch_size) { - let batch_embeddings = self.generate_batch_embeddings(batch).await?; + let mut results = Vec::with_capacity(chunks.len()); + let total_batches = (chunks.len() + self.config.batch_size - 1) / self.config.batch_size; + + for (batch_num, batch) in chunks.chunks(self.config.batch_size).enumerate() { + let batch_start = MemoryStats::current(); + trace!("[EMBEDDING] Processing batch {}/{} ({} items), RSS={}", + batch_num + 1, + total_batches, + batch.len(), + MemoryStats::format_bytes(batch_start.rss_bytes)); + + let batch_embeddings = match tokio::time::timeout( + Duration::from_secs(self.config.timeout_seconds), + self.generate_batch_embeddings(batch) + ).await { + Ok(Ok(embeddings)) => embeddings, + Ok(Err(e)) => { + warn!("[EMBEDDING] Batch {} failed: {}", batch_num + 1, e); + break; + } + Err(_) => { + warn!("[EMBEDDING] Batch {} timed out after {}s", + batch_num + 1, self.config.timeout_seconds); + break; + } + }; + + let batch_end = MemoryStats::current(); + let delta = batch_end.rss_bytes.saturating_sub(batch_start.rss_bytes); + trace!("[EMBEDDING] Batch {} complete: {} embeddings, RSS={} (delta={})", + batch_num + 1, + batch_embeddings.len(), + MemoryStats::format_bytes(batch_end.rss_bytes), + MemoryStats::format_bytes(delta)); + + if delta > 100 * 1024 * 1024 { + warn!("[EMBEDDING] Excessive memory growth detected ({}), stopping early", + MemoryStats::format_bytes(delta)); + for (chunk, embedding) in batch.iter().zip(batch_embeddings.iter()) { + results.push((chunk.clone(), embedding.clone())); + } + break; + } for (chunk, embedding) in batch.iter().zip(batch_embeddings.iter()) { results.push((chunk.clone(), embedding.clone())); } + + if batch_num + 1 < total_batches { + tokio::time::sleep(Duration::from_millis(100)).await; + } } - info!("Generated {} embeddings", results.len()); + let end_mem = MemoryStats::current(); + trace!("[EMBEDDING] Generated {} embeddings, RSS={} (total delta={})", + results.len(), + MemoryStats::format_bytes(end_mem.rss_bytes), + MemoryStats::format_bytes(end_mem.rss_bytes.saturating_sub(start_mem.rss_bytes))); + log_jemalloc_stats(); Ok(results) } async fn generate_batch_embeddings(&self, chunks: &[TextChunk]) -> Result> { - let _permit = self.semaphore.acquire().await?; + let _permit = self.semaphore.acquire().await + .map_err(|e| anyhow::anyhow!("Failed to acquire semaphore: {}", e))?; let texts: Vec = chunks.iter().map(|c| c.content.clone()).collect(); + let total_chars: usize = texts.iter().map(|t| t.len()).sum(); - debug!("Generating embeddings for batch of {} texts", texts.len()); + info!("[EMBEDDING] generate_batch_embeddings: {} texts, {} total chars", + texts.len(), total_chars); - match self.generate_local_embeddings(&texts).await { - Ok(embeddings) => Ok(embeddings), + let truncated_texts: Vec = texts.into_iter() + .map(|t| if t.len() > 8192 { t[..8192].to_string() } else { t }) + .collect(); + + match self.generate_local_embeddings(&truncated_texts).await { + Ok(embeddings) => { + info!("[EMBEDDING] Local embeddings succeeded: {} vectors", embeddings.len()); + Ok(embeddings) + } Err(e) => { - warn!("Local embedding service failed: {}, trying OpenAI API", e); - self.generate_openai_embeddings(&texts) + warn!("[EMBEDDING] Local embedding service failed: {}", e); + Err(e) } } } @@ -178,6 +332,12 @@ impl KbEmbeddingGenerator { model: self.config.embedding_model.clone(), }; + let request_size = serde_json::to_string(&request) + .map(|s| s.len()) + .unwrap_or(0); + info!("[EMBEDDING] Sending request to {} (size: {} bytes)", + self.config.embedding_url, request_size); + let response = self .client .post(format!("{}/embeddings", self.config.embedding_url)) @@ -186,9 +346,10 @@ impl KbEmbeddingGenerator { .await .context("Failed to send request to embedding service")?; - if !response.status().is_success() { - let status = response.status(); - let error_text = response.text().await.unwrap_or_default(); + let status = response.status(); + if !status.is_success() { + let error_bytes = response.bytes().await.unwrap_or_default(); + let error_text = String::from_utf8_lossy(&error_bytes[..error_bytes.len().min(1024)]); return Err(anyhow::anyhow!( "Embedding service error {}: {}", status, @@ -196,12 +357,24 @@ impl KbEmbeddingGenerator { )); } - let embedding_response: EmbeddingResponse = response - .json() - .await + let response_bytes = response.bytes().await + .context("Failed to read embedding response bytes")?; + + info!("[EMBEDDING] Received response: {} bytes", response_bytes.len()); + + if response_bytes.len() > 50 * 1024 * 1024 { + return Err(anyhow::anyhow!( + "Embedding response too large: {} bytes (max 50MB)", + response_bytes.len() + )); + } + + let embedding_response: EmbeddingResponse = serde_json::from_slice(&response_bytes) .context("Failed to parse embedding response")?; - let mut embeddings = Vec::new(); + drop(response_bytes); + + let mut embeddings = Vec::with_capacity(embedding_response.data.len()); for data in embedding_response.data { embeddings.push(Embedding { vector: data.embedding, @@ -214,14 +387,14 @@ impl KbEmbeddingGenerator { Ok(embeddings) } - fn generate_openai_embeddings(&self, _texts: &[String]) -> Result> { - let _ = self; // Suppress unused self warning - Err(anyhow::anyhow!( - "OpenAI embeddings not configured - use local embedding service" - )) - } - pub async fn generate_single_embedding(&self, text: &str) -> Result { + if !is_embedding_server_ready() && !self.check_health().await { + return Err(anyhow::anyhow!( + "Embedding server not available at {}", + self.config.embedding_url + )); + } + let embeddings = self .generate_batch_embeddings(&[TextChunk { content: text.to_string(), @@ -272,6 +445,11 @@ impl EmbeddingGenerator { let embedding = self.kb_generator.generate_single_embedding(text).await?; Ok(embedding.vector) } + + /// Check if the embedding server is healthy + pub async fn check_health(&self) -> bool { + self.kb_generator.check_health().await + } } pub struct EmailEmbeddingGenerator { diff --git a/src/core/kb/kb_indexer.rs b/src/core/kb/kb_indexer.rs index 36450d37b..24d1d3cbb 100644 --- a/src/core/kb/kb_indexer.rs +++ b/src/core/kb/kb_indexer.rs @@ -1,15 +1,16 @@ use anyhow::Result; -use log::{debug, info, warn}; +use log::{debug, info, trace, warn}; use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::path::{Path, PathBuf}; use uuid::Uuid; use crate::config::ConfigManager; +use crate::core::shared::memory_monitor::{log_jemalloc_stats, MemoryStats}; use crate::shared::utils::{create_tls_client, DbPool}; use super::document_processor::{DocumentProcessor, TextChunk}; -use super::embedding_generator::{Embedding, EmbeddingConfig, KbEmbeddingGenerator}; +use super::embedding_generator::{is_embedding_server_ready, Embedding, EmbeddingConfig, KbEmbeddingGenerator}; #[derive(Debug, Clone)] pub struct QdrantConfig { @@ -94,7 +95,6 @@ impl KbIndexer { let document_processor = DocumentProcessor::default(); let embedding_generator = KbEmbeddingGenerator::new(embedding_config); - // Use shared TLS client with local CA certificate let http_client = create_tls_client(Some(qdrant_config.timeout_secs)); Self { @@ -105,7 +105,6 @@ impl KbIndexer { } } - /// Check if Qdrant vector database is available pub async fn check_qdrant_health(&self) -> Result { let health_url = format!("{}/healthz", self.qdrant_config.url); @@ -121,9 +120,24 @@ impl KbIndexer { kb_name: &str, kb_path: &Path, ) -> Result { - info!("Indexing KB folder: {} for bot {}", kb_name, bot_name); + let start_mem = MemoryStats::current(); + info!("Indexing KB folder: {} for bot {} [START RSS={}]", + kb_name, bot_name, MemoryStats::format_bytes(start_mem.rss_bytes)); + log_jemalloc_stats(); + + if !is_embedding_server_ready() { + info!("[KB_INDEXER] Embedding server not ready yet, waiting up to 60s..."); + if !self.embedding_generator.wait_for_server(60).await { + warn!( + "Embedding server is not available. KB indexing skipped. \ + Wait for the embedding server to start before indexing." + ); + return Err(anyhow::anyhow!( + "Embedding server not available. KB indexing deferred until embedding service is ready." + )); + } + } - // Check if Qdrant is available before proceeding if !self.check_qdrant_health().await.unwrap_or(false) { warn!( "Qdrant vector database is not available at {}. KB indexing skipped. \ @@ -140,27 +154,48 @@ impl KbIndexer { self.ensure_collection_exists(&collection_name).await?; + let before_docs = MemoryStats::current(); + trace!("[KB_INDEXER] Before process_kb_folder RSS={}", + MemoryStats::format_bytes(before_docs.rss_bytes)); + let documents = self.document_processor.process_kb_folder(kb_path).await?; + let after_docs = MemoryStats::current(); + trace!("[KB_INDEXER] After process_kb_folder: {} documents, RSS={} (delta={})", + documents.len(), + MemoryStats::format_bytes(after_docs.rss_bytes), + MemoryStats::format_bytes(after_docs.rss_bytes.saturating_sub(before_docs.rss_bytes))); + let mut total_chunks = 0; let mut indexed_documents = 0; for (doc_path, chunks) in documents { if chunks.is_empty() { + debug!("[KB_INDEXER] Skipping document with no chunks: {}", doc_path); continue; } - info!( - "Processing document: {} ({} chunks)", + let before_embed = MemoryStats::current(); + trace!( + "[KB_INDEXER] Processing document: {} ({} chunks) RSS={}", doc_path, - chunks.len() + chunks.len(), + MemoryStats::format_bytes(before_embed.rss_bytes) ); + trace!("[KB_INDEXER] Calling generate_embeddings for {} chunks...", chunks.len()); let embeddings = self .embedding_generator .generate_embeddings(&chunks) .await?; + let after_embed = MemoryStats::current(); + trace!("[KB_INDEXER] After generate_embeddings: {} embeddings, RSS={} (delta={})", + embeddings.len(), + MemoryStats::format_bytes(after_embed.rss_bytes), + MemoryStats::format_bytes(after_embed.rss_bytes.saturating_sub(before_embed.rss_bytes))); + log_jemalloc_stats(); + let points = Self::create_qdrant_points(&doc_path, embeddings)?; self.upsert_points(&collection_name, points).await?; @@ -171,6 +206,13 @@ impl KbIndexer { self.update_collection_metadata(&collection_name, bot_name, kb_name, total_chunks)?; + let end_mem = MemoryStats::current(); + trace!("[KB_INDEXER] Indexing complete: {} docs, {} chunks, RSS={} (total delta={})", + indexed_documents, total_chunks, + MemoryStats::format_bytes(end_mem.rss_bytes), + MemoryStats::format_bytes(end_mem.rss_bytes.saturating_sub(start_mem.rss_bytes))); + log_jemalloc_stats(); + Ok(IndexingResult { collection_name, documents_processed: indexed_documents, diff --git a/src/core/package_manager/facade.rs b/src/core/package_manager/facade.rs index 7ecd66c01..507d6b7f3 100644 --- a/src/core/package_manager/facade.rs +++ b/src/core/package_manager/facade.rs @@ -460,8 +460,8 @@ Store credentials in Vault: "drive" => { format!( r"MinIO Object Storage: - API: http://{}:9000 - Console: http://{}:9001 + API: https://{}:9000 + Console: https://{}:9001 Store credentials in Vault: botserver vault put gbo/drive server={} port=9000 accesskey=minioadmin secret=", diff --git a/src/core/package_manager/installer.rs b/src/core/package_manager/installer.rs index 07276b467..0067b19c9 100644 --- a/src/core/package_manager/installer.rs +++ b/src/core/package_manager/installer.rs @@ -252,7 +252,7 @@ impl PackageManager { ]), data_download_list: Vec::new(), exec_cmd: "nohup {{BIN_PATH}}/minio server {{DATA_PATH}} --address :9000 --console-address :9001 --certs-dir {{CONF_PATH}}/drive/certs > {{LOGS_PATH}}/minio.log 2>&1 &".to_string(), - check_cmd: "curl -sfk https://127.0.0.1:9000/minio/health/live >/dev/null 2>&1".to_string(), + check_cmd: "curl -sf --cacert {{CONF_PATH}}/drive/certs/CAs/ca.crt https://127.0.0.1:9000/minio/health/live >/dev/null 2>&1".to_string(), }, ); } @@ -891,12 +891,15 @@ storage "file" { } listener "tcp" { - address = "0.0.0.0:8200" - tls_disable = 1 + address = "0.0.0.0:8200" + tls_disable = false + tls_cert_file = "{{CONF_PATH}}/system/certificates/vault/server.crt" + tls_key_file = "{{CONF_PATH}}/system/certificates/vault/server.key" + tls_client_ca_file = "{{CONF_PATH}}/system/certificates/ca/ca.crt" } -api_addr = "http://0.0.0.0:8200" -cluster_addr = "http://0.0.0.0:8201" +api_addr = "https://localhost:8200" +cluster_addr = "https://localhost:8201" ui = true disable_mlock = true EOF"#.to_string(), @@ -914,12 +917,15 @@ storage "file" { } listener "tcp" { - address = "0.0.0.0:8200" - tls_disable = 1 + address = "0.0.0.0:8200" + tls_disable = false + tls_cert_file = "{{CONF_PATH}}/system/certificates/vault/server.crt" + tls_key_file = "{{CONF_PATH}}/system/certificates/vault/server.key" + tls_client_ca_file = "{{CONF_PATH}}/system/certificates/ca/ca.crt" } -api_addr = "http://0.0.0.0:8200" -cluster_addr = "http://0.0.0.0:8201" +api_addr = "https://localhost:8200" +cluster_addr = "https://localhost:8201" ui = true disable_mlock = true EOF"#.to_string(), @@ -933,13 +939,16 @@ EOF"#.to_string(), "VAULT_ADDR".to_string(), "https://localhost:8200".to_string(), ); - env.insert("VAULT_SKIP_VERIFY".to_string(), "true".to_string()); + env.insert( + "VAULT_CACERT".to_string(), + "./botserver-stack/conf/system/certificates/ca/ca.crt".to_string(), + ); env }, data_download_list: Vec::new(), exec_cmd: "nohup {{BIN_PATH}}/vault server -config={{CONF_PATH}}/vault/config.hcl > {{LOGS_PATH}}/vault.log 2>&1 &" .to_string(), - check_cmd: "curl -f -s --connect-timeout 2 -m 5 'http://localhost:8200/v1/sys/health?standbyok=true&uninitcode=200&sealedcode=200' >/dev/null 2>&1" + check_cmd: "curl -f -sk --connect-timeout 2 -m 5 'https://localhost:8200/v1/sys/health?standbyok=true&uninitcode=200&sealedcode=200' >/dev/null 2>&1" .to_string(), }, ); @@ -1165,9 +1174,10 @@ EOF"#.to_string(), } // Check if Vault is reachable before trying to fetch credentials + // Use -k for self-signed certs in dev let vault_check = std::process::Command::new("sh") .arg("-c") - .arg(format!("curl -sf {}/v1/sys/health >/dev/null 2>&1", vault_addr)) + .arg(format!("curl -sfk {}/v1/sys/health >/dev/null 2>&1", vault_addr)) .status() .map(|s| s.success()) .unwrap_or(false); @@ -1187,10 +1197,14 @@ EOF"#.to_string(), let vault_bin = base_path.join("bin/vault/vault"); let vault_bin_str = vault_bin.to_string_lossy(); + // Get CA cert path for Vault TLS + let ca_cert_path = std::env::var("VAULT_CACERT") + .unwrap_or_else(|_| base_path.join("conf/system/certificates/ca/ca.crt").to_string_lossy().to_string()); + trace!("Fetching drive credentials from Vault at {} using {}", vault_addr, vault_bin_str); let drive_cmd = format!( - "unset VAULT_CLIENT_CERT VAULT_CLIENT_KEY VAULT_CACERT; VAULT_ADDR={} VAULT_TOKEN={} {} kv get -format=json secret/gbo/drive", - vault_addr, vault_token, vault_bin_str + "VAULT_ADDR={} VAULT_TOKEN={} VAULT_CACERT={} {} kv get -format=json secret/gbo/drive", + vault_addr, vault_token, ca_cert_path, vault_bin_str ); match std::process::Command::new("sh") .arg("-c") @@ -1229,8 +1243,8 @@ EOF"#.to_string(), if let Ok(output) = std::process::Command::new("sh") .arg("-c") .arg(format!( - "unset VAULT_CLIENT_CERT VAULT_CLIENT_KEY VAULT_CACERT; VAULT_ADDR={} VAULT_TOKEN={} {} kv get -format=json secret/gbo/cache 2>/dev/null", - vault_addr, vault_token, vault_bin_str + "VAULT_ADDR={} VAULT_TOKEN={} VAULT_CACERT={} {} kv get -format=json secret/gbo/cache 2>/dev/null", + vault_addr, vault_token, ca_cert_path, vault_bin_str )) .output() { diff --git a/src/core/secrets/mod.rs b/src/core/secrets/mod.rs index ca066efa5..b62235001 100644 --- a/src/core/secrets/mod.rs +++ b/src/core/secrets/mod.rs @@ -90,8 +90,12 @@ impl SecretsManager { let mut settings_builder = VaultClientSettingsBuilder::default(); settings_builder.address(&addr).token(&token); + // Only warn about TLS verification for HTTPS connections + let is_https = addr.starts_with("https://"); if skip_verify { - warn!("TLS verification disabled - NOT RECOMMENDED FOR PRODUCTION"); + if is_https { + warn!("TLS verification disabled - NOT RECOMMENDED FOR PRODUCTION"); + } settings_builder.verify(false); } else { settings_builder.verify(true); diff --git a/src/core/shared/memory_monitor.rs b/src/core/shared/memory_monitor.rs new file mode 100644 index 000000000..1f1da3f13 --- /dev/null +++ b/src/core/shared/memory_monitor.rs @@ -0,0 +1,506 @@ +//! Memory and CPU monitoring with thread tracking +//! +//! This module provides tools to track memory/CPU usage per thread +//! and identify potential leaks or CPU hogs in the botserver application. +//! +//! When compiled with the `jemalloc` feature, provides detailed allocation statistics. + +use log::{debug, info, trace, warn}; +use std::collections::HashMap; +use std::sync::{LazyLock, Mutex, RwLock}; +use std::time::{Duration, Instant}; +use sysinfo::{Pid, ProcessesToUpdate, System}; + +static THREAD_REGISTRY: LazyLock>> = + LazyLock::new(|| RwLock::new(HashMap::new())); + +static COMPONENT_TRACKER: LazyLock = + LazyLock::new(|| ComponentMemoryTracker::new(60)); + +#[derive(Debug, Clone)] +pub struct ThreadInfo { + pub name: String, + pub started_at: Instant, + pub last_activity: Instant, + pub activity_count: u64, + pub component: String, +} + +pub fn register_thread(name: &str, component: &str) { + let info = ThreadInfo { + name: name.to_string(), + started_at: Instant::now(), + last_activity: Instant::now(), + activity_count: 0, + component: component.to_string(), + }; + if let Ok(mut registry) = THREAD_REGISTRY.write() { + registry.insert(name.to_string(), info); + } + trace!("[THREAD] Registered: {} (component: {})", name, component); +} + +pub fn record_thread_activity(name: &str) { + if let Ok(mut registry) = THREAD_REGISTRY.write() { + if let Some(info) = registry.get_mut(name) { + info.last_activity = Instant::now(); + info.activity_count += 1; + } + } +} + +pub fn unregister_thread(name: &str) { + if let Ok(mut registry) = THREAD_REGISTRY.write() { + registry.remove(name); + } + info!("[THREAD] Unregistered: {}", name); +} + +pub fn log_thread_stats() { + if let Ok(registry) = THREAD_REGISTRY.read() { + info!("[THREADS] Active thread count: {}", registry.len()); + for (name, info) in registry.iter() { + let uptime = info.started_at.elapsed().as_secs(); + let idle = info.last_activity.elapsed().as_secs(); + info!( + "[THREAD] {} | component={} | uptime={}s | idle={}s | activities={}", + name, info.component, uptime, idle, info.activity_count + ); + } + } +} + +#[derive(Debug, Clone)] +pub struct MemoryStats { + pub rss_bytes: u64, + pub virtual_bytes: u64, + pub timestamp: Instant, +} + +impl MemoryStats { + pub fn current() -> Self { + let (rss, virt) = get_process_memory().unwrap_or((0, 0)); + Self { + rss_bytes: rss, + virtual_bytes: virt, + timestamp: Instant::now(), + } + } + + pub fn format_bytes(bytes: u64) -> String { + const KB: u64 = 1024; + const MB: u64 = KB * 1024; + const GB: u64 = MB * 1024; + + if bytes >= GB { + format!("{:.2} GB", bytes as f64 / GB as f64) + } else if bytes >= MB { + format!("{:.2} MB", bytes as f64 / MB as f64) + } else if bytes >= KB { + format!("{:.2} KB", bytes as f64 / KB as f64) + } else { + format!("{} B", bytes) + } + } + + pub fn log(&self) { + info!( + "[MEMORY] RSS={}, Virtual={}", + Self::format_bytes(self.rss_bytes), + Self::format_bytes(self.virtual_bytes), + ); + } +} + +/// Get jemalloc memory statistics when the feature is enabled +#[cfg(feature = "jemalloc")] +pub fn get_jemalloc_stats() -> Option { + use tikv_jemalloc_ctl::{epoch, stats}; + + // Advance the epoch to refresh statistics + if epoch::advance().is_err() { + return None; + } + + let allocated = stats::allocated::read().ok()? as u64; + let active = stats::active::read().ok()? as u64; + let resident = stats::resident::read().ok()? as u64; + let mapped = stats::mapped::read().ok()? as u64; + let retained = stats::retained::read().ok()? as u64; + + Some(JemallocStats { + allocated, + active, + resident, + mapped, + retained, + }) +} + +#[cfg(not(feature = "jemalloc"))] +pub fn get_jemalloc_stats() -> Option { + None +} + +/// Jemalloc memory statistics +#[derive(Debug, Clone)] +pub struct JemallocStats { + /// Total bytes allocated by the application + pub allocated: u64, + /// Total bytes in active pages allocated by the application + pub active: u64, + /// Total bytes in physically resident pages + pub resident: u64, + /// Total bytes in active extents mapped by the allocator + pub mapped: u64, + /// Total bytes retained (not returned to OS) + pub retained: u64, +} + +impl JemallocStats { + pub fn log(&self) { + info!( + "[JEMALLOC] allocated={} active={} resident={} mapped={} retained={}", + MemoryStats::format_bytes(self.allocated), + MemoryStats::format_bytes(self.active), + MemoryStats::format_bytes(self.resident), + MemoryStats::format_bytes(self.mapped), + MemoryStats::format_bytes(self.retained), + ); + } + + /// Calculate fragmentation ratio (1.0 = no fragmentation) + pub fn fragmentation_ratio(&self) -> f64 { + if self.allocated > 0 { + self.active as f64 / self.allocated as f64 + } else { + 1.0 + } + } +} + +/// Log jemalloc stats if available +pub fn log_jemalloc_stats() { + if let Some(stats) = get_jemalloc_stats() { + stats.log(); + let frag = stats.fragmentation_ratio(); + if frag > 1.5 { + warn!("[JEMALLOC] High fragmentation detected: {:.2}x", frag); + } + } +} + +#[derive(Debug)] +pub struct MemoryCheckpoint { + pub name: String, + pub stats: MemoryStats, +} + +impl MemoryCheckpoint { + pub fn new(name: &str) -> Self { + let stats = MemoryStats::current(); + info!( + "[CHECKPOINT] {} started at RSS={}", + name, + MemoryStats::format_bytes(stats.rss_bytes) + ); + Self { + name: name.to_string(), + stats, + } + } + + pub fn compare_and_log(&self) { + let current = MemoryStats::current(); + let diff = current.rss_bytes as i64 - self.stats.rss_bytes as i64; + + if diff > 0 { + warn!( + "[CHECKPOINT] {} INCREASED by {}", + self.name, + MemoryStats::format_bytes(diff as u64), + ); + } else if diff < 0 { + info!( + "[CHECKPOINT] {} decreased by {}", + self.name, + MemoryStats::format_bytes((-diff) as u64), + ); + } else { + debug!("[CHECKPOINT] {} unchanged", self.name); + } + } +} + +pub struct ComponentMemoryTracker { + components: Mutex>>, + max_history: usize, +} + +impl ComponentMemoryTracker { + pub fn new(max_history: usize) -> Self { + Self { + components: Mutex::new(HashMap::new()), + max_history, + } + } + + pub fn record(&self, component: &str) { + let stats = MemoryStats::current(); + if let Ok(mut components) = self.components.lock() { + let history = components.entry(component.to_string()).or_default(); + history.push(stats); + + if history.len() > self.max_history { + history.remove(0); + } + } + } + + pub fn get_growth_rate(&self, component: &str) -> Option { + if let Ok(components) = self.components.lock() { + if let Some(history) = components.get(component) { + if history.len() >= 2 { + let first = &history[0]; + let last = &history[history.len() - 1]; + let duration = last.timestamp.duration_since(first.timestamp).as_secs_f64(); + if duration > 0.0 { + let byte_diff = last.rss_bytes as f64 - first.rss_bytes as f64; + return Some(byte_diff / duration); + } + } + } + } + None + } + + pub fn log_all(&self) { + if let Ok(components) = self.components.lock() { + for (name, history) in components.iter() { + if let Some(last) = history.last() { + let growth = self.get_growth_rate(name); + let growth_str = growth + .map(|g| { + let sign = if g >= 0.0 { "+" } else { "-" }; + format!("{}{}/s", sign, MemoryStats::format_bytes(g.abs() as u64)) + }) + .unwrap_or_else(|| "N/A".to_string()); + info!( + "[COMPONENT] {} | RSS={} | Growth={}", + name, + MemoryStats::format_bytes(last.rss_bytes), + growth_str + ); + } + } + } + } +} + +pub fn record_component(component: &str) { + COMPONENT_TRACKER.record(component); +} + +pub fn log_component_stats() { + COMPONENT_TRACKER.log_all(); +} + +pub struct LeakDetector { + baseline: Mutex, + growth_threshold_bytes: u64, + consecutive_growth_count: Mutex, + max_consecutive_growth: usize, +} + +impl LeakDetector { + pub fn new(growth_threshold_mb: u64, max_consecutive_growth: usize) -> Self { + Self { + baseline: Mutex::new(0), + growth_threshold_bytes: growth_threshold_mb * 1024 * 1024, + consecutive_growth_count: Mutex::new(0), + max_consecutive_growth, + } + } + + pub fn reset_baseline(&self) { + let current = MemoryStats::current(); + if let Ok(mut baseline) = self.baseline.lock() { + *baseline = current.rss_bytes; + } + if let Ok(mut count) = self.consecutive_growth_count.lock() { + *count = 0; + } + } + + pub fn check(&self) -> Option { + let current = MemoryStats::current(); + + let baseline_val = match self.baseline.lock() { + Ok(b) => *b, + Err(_) => return None, + }; + + if baseline_val == 0 { + if let Ok(mut baseline) = self.baseline.lock() { + *baseline = current.rss_bytes; + } + return None; + } + + let growth = current.rss_bytes.saturating_sub(baseline_val); + + if growth > self.growth_threshold_bytes { + let count = match self.consecutive_growth_count.lock() { + Ok(mut c) => { + *c += 1; + *c + } + Err(_) => return None, + }; + + if count >= self.max_consecutive_growth { + return Some(format!( + "POTENTIAL MEMORY LEAK: grew by {} over {} checks. RSS={}, Baseline={}", + MemoryStats::format_bytes(growth), + count, + MemoryStats::format_bytes(current.rss_bytes), + MemoryStats::format_bytes(baseline_val), + )); + } + } else { + if let Ok(mut count) = self.consecutive_growth_count.lock() { + *count = 0; + } + if let Ok(mut baseline) = self.baseline.lock() { + *baseline = current.rss_bytes; + } + } + + None + } +} + +pub fn start_memory_monitor(interval_secs: u64, warn_threshold_mb: u64) { + let detector = LeakDetector::new(warn_threshold_mb, 5); + + tokio::spawn(async move { + register_thread("memory-monitor", "monitoring"); + + info!( + "[MONITOR] Started (interval={}s, threshold={}MB)", + interval_secs, warn_threshold_mb + ); + + let mut prev_rss: u64 = 0; + let mut tick_count: u64 = 0; + + // First 2 minutes: check every 10 seconds for aggressive tracking + // After that: use normal interval + let startup_interval = Duration::from_secs(10); + let normal_interval = Duration::from_secs(interval_secs); + let startup_ticks = 12; // 2 minutes of 10-second intervals + + let mut interval = tokio::time::interval(startup_interval); + + loop { + interval.tick().await; + tick_count += 1; + record_thread_activity("memory-monitor"); + + let stats = MemoryStats::current(); + let rss_diff = if prev_rss > 0 { + stats.rss_bytes as i64 - prev_rss as i64 + } else { + 0 + }; + + let diff_str = if rss_diff > 0 { + format!("+{}", MemoryStats::format_bytes(rss_diff as u64)) + } else if rss_diff < 0 { + format!("-{}", MemoryStats::format_bytes((-rss_diff) as u64)) + } else { + "±0".to_string() + }; + + trace!( + "[MONITOR] tick={} RSS={} ({}) Virtual={}", + tick_count, + MemoryStats::format_bytes(stats.rss_bytes), + diff_str, + MemoryStats::format_bytes(stats.virtual_bytes), + ); + + // Log jemalloc stats every 5 ticks if available + if tick_count % 5 == 0 { + log_jemalloc_stats(); + } + + prev_rss = stats.rss_bytes; + record_component("global"); + + if let Some(warning) = detector.check() { + warn!("[MONITOR] {}", warning); + stats.log(); + log_component_stats(); + log_thread_stats(); + } + + // Switch to normal interval after startup period + if tick_count == startup_ticks { + trace!("[MONITOR] Switching to normal interval ({}s)", interval_secs); + interval = tokio::time::interval(normal_interval); + } + } + }); +} + +pub fn get_process_memory() -> Option<(u64, u64)> { + let pid = Pid::from_u32(std::process::id()); + let mut sys = System::new(); + sys.refresh_processes(ProcessesToUpdate::Some(&[pid]), true); + + sys.process(pid).map(|p| (p.memory(), p.virtual_memory())) +} + +pub fn log_process_memory() { + if let Some((rss, virt)) = get_process_memory() { + trace!( + "[PROCESS] RSS={}, Virtual={}", + MemoryStats::format_bytes(rss), + MemoryStats::format_bytes(virt) + ); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_memory_stats() { + let stats = MemoryStats::current(); + assert!(stats.rss_bytes > 0 || stats.virtual_bytes >= 0); + } + + #[test] + fn test_format_bytes() { + assert_eq!(MemoryStats::format_bytes(500), "500 B"); + assert_eq!(MemoryStats::format_bytes(1024), "1.00 KB"); + assert_eq!(MemoryStats::format_bytes(1024 * 1024), "1.00 MB"); + assert_eq!(MemoryStats::format_bytes(1024 * 1024 * 1024), "1.00 GB"); + } + + #[test] + fn test_checkpoint() { + let checkpoint = MemoryCheckpoint::new("test"); + checkpoint.compare_and_log(); + } + + #[test] + fn test_thread_registry() { + register_thread("test-thread", "test-component"); + record_thread_activity("test-thread"); + log_thread_stats(); + unregister_thread("test-thread"); + } +} diff --git a/src/core/shared/mod.rs b/src/core/shared/mod.rs index ba1da5236..9d9f67129 100644 --- a/src/core/shared/mod.rs +++ b/src/core/shared/mod.rs @@ -6,6 +6,7 @@ pub mod admin; pub mod analytics; pub mod enums; +pub mod memory_monitor; pub mod models; pub mod schema; pub mod state; diff --git a/src/core/shared/state.rs b/src/core/shared/state.rs index 6b238f7c4..ce387874f 100644 --- a/src/core/shared/state.rs +++ b/src/core/shared/state.rs @@ -42,6 +42,192 @@ pub struct AttendantNotification { pub priority: i32, } +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct AgentActivity { + pub phase: String, + pub items_processed: u32, + #[serde(skip_serializing_if = "Option::is_none")] + pub items_total: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub speed_per_min: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub eta_seconds: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub current_item: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub bytes_processed: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub tokens_used: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub files_created: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + pub tables_created: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + pub log_lines: Option>, +} + +impl AgentActivity { + pub fn new(phase: impl Into) -> Self { + Self { + phase: phase.into(), + items_processed: 0, + items_total: None, + speed_per_min: None, + eta_seconds: None, + current_item: None, + bytes_processed: None, + tokens_used: None, + files_created: None, + tables_created: None, + log_lines: None, + } + } + + #[must_use] + pub fn with_progress(mut self, processed: u32, total: Option) -> Self { + self.items_processed = processed; + self.items_total = total; + self + } + + #[must_use] + pub fn with_speed(mut self, speed: f32, eta_seconds: Option) -> Self { + self.speed_per_min = Some(speed); + self.eta_seconds = eta_seconds; + self + } + + #[must_use] + pub fn with_current_item(mut self, item: impl Into) -> Self { + self.current_item = Some(item.into()); + self + } + + #[must_use] + pub fn with_bytes(mut self, bytes: u64) -> Self { + self.bytes_processed = Some(bytes); + self + } + + #[must_use] + pub fn with_tokens(mut self, tokens: u32) -> Self { + self.tokens_used = Some(tokens); + self + } + + #[must_use] + pub fn with_files(mut self, files: Vec) -> Self { + self.files_created = Some(files); + self + } + + #[must_use] + pub fn with_tables(mut self, tables: Vec) -> Self { + self.tables_created = Some(tables); + self + } + + #[must_use] + pub fn with_log_lines(mut self, lines: Vec) -> Self { + self.log_lines = Some(lines); + self + } + + #[must_use] + pub fn add_log_line(mut self, line: impl Into) -> Self { + let lines = self.log_lines.get_or_insert_with(Vec::new); + lines.push(line.into()); + self + } +} + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct TaskProgressEvent { + #[serde(rename = "type")] + pub event_type: String, + pub task_id: String, + pub step: String, + pub message: String, + pub progress: u8, + pub total_steps: u8, + pub current_step: u8, + pub timestamp: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub details: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub error: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub activity: Option, +} + +impl TaskProgressEvent { + pub fn new(task_id: impl Into, step: impl Into, message: impl Into) -> Self { + Self { + event_type: "task_progress".to_string(), + task_id: task_id.into(), + step: step.into(), + message: message.into(), + progress: 0, + total_steps: 0, + current_step: 0, + timestamp: chrono::Utc::now().to_rfc3339(), + details: None, + error: None, + activity: None, + } + } + + #[must_use] + pub fn with_progress(mut self, current: u8, total: u8) -> Self { + self.current_step = current; + self.total_steps = total; + self.progress = if total > 0 { (current * 100) / total } else { 0 }; + self + } + + #[must_use] + pub fn with_details(mut self, details: impl Into) -> Self { + self.details = Some(details.into()); + self + } + + #[must_use] + pub fn with_activity(mut self, activity: AgentActivity) -> Self { + self.activity = Some(activity); + self + } + + #[must_use] + pub fn with_error(mut self, error: impl Into) -> Self { + self.event_type = "task_error".to_string(); + self.error = Some(error.into()); + self + } + + #[must_use] + pub fn completed(mut self) -> Self { + self.event_type = "task_completed".to_string(); + self.progress = 100; + self + } + + pub fn started(task_id: impl Into, message: impl Into, total_steps: u8) -> Self { + Self { + event_type: "task_started".to_string(), + task_id: task_id.into(), + step: "init".to_string(), + message: message.into(), + progress: 0, + total_steps, + current_step: 0, + timestamp: chrono::Utc::now().to_rfc3339(), + details: None, + error: None, + activity: None, + } + } +} + #[derive(Clone, Default)] pub struct Extensions { map: Arc>>>, @@ -129,6 +315,7 @@ pub struct AppState { pub task_engine: Arc, pub extensions: Extensions, pub attendant_broadcast: Option>, + pub task_progress_broadcast: Option>, } impl Clone for AppState { @@ -158,6 +345,7 @@ impl Clone for AppState { task_engine: Arc::clone(&self.task_engine), extensions: self.extensions.clone(), attendant_broadcast: self.attendant_broadcast.clone(), + task_progress_broadcast: self.task_progress_broadcast.clone(), } } } @@ -198,10 +386,95 @@ impl std::fmt::Debug for AppState { .field("task_engine", &"Arc") .field("extensions", &self.extensions) .field("attendant_broadcast", &self.attendant_broadcast.is_some()) + .field("task_progress_broadcast", &self.task_progress_broadcast.is_some()) .finish() } } +impl AppState { + pub fn broadcast_task_progress(&self, event: TaskProgressEvent) { + log::info!( + "[TASK_PROGRESS] Broadcasting: task_id={}, step={}, message={}", + event.task_id, + event.step, + event.message + ); + if let Some(tx) = &self.task_progress_broadcast { + let receiver_count = tx.receiver_count(); + log::info!("[TASK_PROGRESS] Broadcast channel has {} receivers", receiver_count); + match tx.send(event) { + Ok(_) => { + log::info!("[TASK_PROGRESS] Event sent successfully"); + } + Err(e) => { + log::warn!("[TASK_PROGRESS] No listeners for task progress: {e}"); + } + } + } else { + log::warn!("[TASK_PROGRESS] No broadcast channel configured!"); + } + } + + pub fn emit_progress( + &self, + task_id: &str, + step: &str, + message: &str, + current: u8, + total: u8, + ) { + let event = TaskProgressEvent::new(task_id, step, message) + .with_progress(current, total); + self.broadcast_task_progress(event); + } + + pub fn emit_progress_with_details( + &self, + task_id: &str, + step: &str, + message: &str, + current: u8, + total: u8, + details: &str, + ) { + let event = TaskProgressEvent::new(task_id, step, message) + .with_progress(current, total) + .with_details(details); + self.broadcast_task_progress(event); + } + + pub fn emit_activity( + &self, + task_id: &str, + step: &str, + message: &str, + current: u8, + total: u8, + activity: AgentActivity, + ) { + let event = TaskProgressEvent::new(task_id, step, message) + .with_progress(current, total) + .with_activity(activity); + self.broadcast_task_progress(event); + } + + pub fn emit_task_started(&self, task_id: &str, message: &str, total_steps: u8) { + let event = TaskProgressEvent::started(task_id, message, total_steps); + self.broadcast_task_progress(event); + } + + pub fn emit_task_completed(&self, task_id: &str, message: &str) { + let event = TaskProgressEvent::new(task_id, "complete", message).completed(); + self.broadcast_task_progress(event); + } + + pub fn emit_task_error(&self, task_id: &str, step: &str, error: &str) { + let event = TaskProgressEvent::new(task_id, step, "Task failed") + .with_error(error); + self.broadcast_task_progress(event); + } +} + #[cfg(test)] impl Default for AppState { fn default() -> Self { @@ -212,6 +485,7 @@ impl Default for AppState { let pool = Pool::builder() .max_size(1) .test_on_check_out(false) + .connection_timeout(std::time::Duration::from_secs(5)) .build(manager) .expect("Failed to create test database pool"); @@ -219,6 +493,7 @@ impl Default for AppState { let session_manager = SessionManager::new(conn, None); let (attendant_tx, _) = broadcast::channel(100); + let (task_progress_tx, _) = broadcast::channel(100); Self { #[cfg(feature = "drive")] @@ -245,6 +520,7 @@ impl Default for AppState { task_engine: Arc::new(TaskEngine::new(pool)), extensions: Extensions::new(), attendant_broadcast: Some(attendant_tx), + task_progress_broadcast: Some(task_progress_tx), } } } diff --git a/src/core/shared/test_utils.rs b/src/core/shared/test_utils.rs index e8f1c24fd..0cfd27cde 100644 --- a/src/core/shared/test_utils.rs +++ b/src/core/shared/test_utils.rs @@ -178,6 +178,7 @@ impl TestAppStateBuilder { let pool = Pool::builder() .max_size(1) .test_on_check_out(false) + .connection_timeout(std::time::Duration::from_secs(5)) .build(manager)?; let conn = pool.get()?; @@ -240,7 +241,10 @@ pub fn create_test_db_pool() -> Result::new(&database_url); - let pool = Pool::builder().max_size(1).build(manager)?; + let pool = Pool::builder() + .max_size(1) + .connection_timeout(std::time::Duration::from_secs(5)) + .build(manager)?; Ok(pool) } diff --git a/src/core/shared/utils.rs b/src/core/shared/utils.rs index bcc22efd8..fc755ed69 100644 --- a/src/core/shared/utils.rs +++ b/src/core/shared/utils.rs @@ -2,6 +2,8 @@ use crate::config::DriveConfig; use crate::core::secrets::SecretsManager; use anyhow::{Context, Result}; use aws_config::BehaviorVersion; +use aws_config::retry::RetryConfig; +use aws_config::timeout::TimeoutConfig; use aws_sdk_s3::{config::Builder as S3ConfigBuilder, Client as S3Client}; use diesel::Connection; use diesel::{ @@ -101,18 +103,33 @@ pub async fn create_s3_operator( (config.access_key.clone(), config.secret_key.clone()) }; + // Set CA cert for self-signed TLS (dev stack) if std::path::Path::new(CA_CERT_PATH).exists() { std::env::set_var("AWS_CA_BUNDLE", CA_CERT_PATH); std::env::set_var("SSL_CERT_FILE", CA_CERT_PATH); debug!("Set AWS_CA_BUNDLE and SSL_CERT_FILE to {} for S3 client", CA_CERT_PATH); } + // Configure timeouts to prevent memory leaks on connection failures + let timeout_config = TimeoutConfig::builder() + .connect_timeout(Duration::from_secs(5)) + .read_timeout(Duration::from_secs(30)) + .operation_timeout(Duration::from_secs(30)) + .operation_attempt_timeout(Duration::from_secs(15)) + .build(); + + // Limit retries to prevent 100% CPU on connection failures + let retry_config = RetryConfig::standard() + .with_max_attempts(2); + let base_config = aws_config::defaults(BehaviorVersion::latest()) .endpoint_url(endpoint) .region("auto") .credentials_provider(aws_sdk_s3::config::Credentials::new( access_key, secret_key, None, None, "static", )) + .timeout_config(timeout_config) + .retry_config(retry_config) .load() .await; let s3_config = S3ConfigBuilder::from(&base_config) @@ -261,6 +278,11 @@ pub fn create_conn() -> Result { let database_url = get_database_url_sync()?; let manager = ConnectionManager::::new(database_url); Pool::builder() + .max_size(10) + .min_idle(Some(1)) + .connection_timeout(std::time::Duration::from_secs(5)) + .idle_timeout(Some(std::time::Duration::from_secs(300))) + .max_lifetime(Some(std::time::Duration::from_secs(1800))) .build(manager) .map_err(|e| anyhow::anyhow!("Failed to create database pool: {}", e)) } @@ -269,6 +291,11 @@ pub async fn create_conn_async() -> Result { let database_url = get_database_url().await?; let manager = ConnectionManager::::new(database_url); Pool::builder() + .max_size(10) + .min_idle(Some(1)) + .connection_timeout(std::time::Duration::from_secs(5)) + .idle_timeout(Some(std::time::Duration::from_secs(300))) + .max_lifetime(Some(std::time::Duration::from_secs(1800))) .build(manager) .map_err(|e| anyhow::anyhow!("Failed to create database pool: {}", e)) } diff --git a/src/drive/drive_monitor/mod.rs b/src/drive/drive_monitor/mod.rs index 5551c1fb8..365b08d64 100644 --- a/src/drive/drive_monitor/mod.rs +++ b/src/drive/drive_monitor/mod.rs @@ -1,16 +1,22 @@ use crate::basic::compiler::BasicCompiler; use crate::config::ConfigManager; +use crate::core::kb::embedding_generator::is_embedding_server_ready; use crate::core::kb::KnowledgeBaseManager; +use crate::core::shared::memory_monitor::{log_jemalloc_stats, MemoryStats}; use crate::shared::message_types::MessageType; use crate::shared::state::AppState; use aws_sdk_s3::Client; -use log::{debug, error, info}; +use log::{debug, error, info, trace, warn}; use std::collections::HashMap; use std::error::Error; use std::path::PathBuf; -use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU32, Ordering}; use std::sync::Arc; -use tokio::time::{interval, Duration}; +use tokio::time::Duration; + +const KB_INDEXING_TIMEOUT_SECS: u64 = 60; +const MAX_BACKOFF_SECS: u64 = 300; +const INITIAL_BACKOFF_SECS: u64 = 30; #[derive(Debug, Clone)] pub struct FileState { pub etag: String, @@ -24,6 +30,7 @@ pub struct DriveMonitor { kb_manager: Arc, work_root: PathBuf, is_processing: Arc, + consecutive_failures: Arc, } impl DriveMonitor { pub fn new(state: Arc, bucket_name: String, bot_id: uuid::Uuid) -> Self { @@ -38,29 +45,105 @@ impl DriveMonitor { kb_manager, work_root, is_processing: Arc::new(AtomicBool::new(false)), + consecutive_failures: Arc::new(AtomicU32::new(0)), } } + async fn check_drive_health(&self) -> bool { + let Some(client) = &self.state.drive else { + return false; + }; + + match tokio::time::timeout( + Duration::from_secs(5), + client.head_bucket().bucket(&self.bucket_name).send(), + ) + .await + { + Ok(Ok(_)) => true, + Ok(Err(e)) => { + debug!("[DRIVE_MONITOR] Health check failed: {}", e); + false + } + Err(_) => { + debug!("[DRIVE_MONITOR] Health check timed out"); + false + } + } + } + + fn calculate_backoff(&self) -> Duration { + let failures = self.consecutive_failures.load(Ordering::Relaxed); + if failures == 0 { + return Duration::from_secs(INITIAL_BACKOFF_SECS); + } + let backoff_secs = INITIAL_BACKOFF_SECS * (1u64 << failures.min(4)); + Duration::from_secs(backoff_secs.min(MAX_BACKOFF_SECS)) + } + pub async fn start_monitoring(&self) -> Result<(), Box> { - info!("Starting DriveMonitor for bot {}", self.bot_id); + trace!("[PROFILE] start_monitoring ENTER"); + let start_mem = MemoryStats::current(); + trace!("[DRIVE_MONITOR] Starting DriveMonitor for bot {}, RSS={}", + self.bot_id, MemoryStats::format_bytes(start_mem.rss_bytes)); + + if !self.check_drive_health().await { + warn!("[DRIVE_MONITOR] S3/MinIO not available for bucket {}, will retry with backoff", + self.bucket_name); + } self.is_processing .store(true, std::sync::atomic::Ordering::SeqCst); - self.check_for_changes().await?; + trace!("[PROFILE] start_monitoring: calling check_for_changes..."); + info!("[DRIVE_MONITOR] Calling initial check_for_changes..."); + + match self.check_for_changes().await { + Ok(_) => { + self.consecutive_failures.store(0, Ordering::Relaxed); + } + Err(e) => { + warn!("[DRIVE_MONITOR] Initial check failed (will retry): {}", e); + self.consecutive_failures.fetch_add(1, Ordering::Relaxed); + } + } + trace!("[PROFILE] start_monitoring: check_for_changes returned"); + + let after_initial = MemoryStats::current(); + trace!("[DRIVE_MONITOR] After initial check, RSS={} (delta={})", + MemoryStats::format_bytes(after_initial.rss_bytes), + MemoryStats::format_bytes(after_initial.rss_bytes.saturating_sub(start_mem.rss_bytes))); let self_clone = Arc::new(self.clone()); tokio::spawn(async move { - let mut interval = tokio::time::interval(Duration::from_secs(30)); - while self_clone .is_processing .load(std::sync::atomic::Ordering::SeqCst) { - interval.tick().await; + let backoff = self_clone.calculate_backoff(); + tokio::time::sleep(backoff).await; - if let Err(e) = self_clone.check_for_changes().await { - error!("Error during sync for bot {}: {}", self_clone.bot_id, e); + if !self_clone.check_drive_health().await { + let failures = self_clone.consecutive_failures.fetch_add(1, Ordering::Relaxed) + 1; + if failures % 10 == 1 { + warn!("[DRIVE_MONITOR] S3/MinIO unavailable for bucket {} (failures: {}), backing off to {:?}", + self_clone.bucket_name, failures, self_clone.calculate_backoff()); + } + continue; + } + + match self_clone.check_for_changes().await { + Ok(_) => { + let prev_failures = self_clone.consecutive_failures.swap(0, Ordering::Relaxed); + if prev_failures > 0 { + info!("[DRIVE_MONITOR] S3/MinIO recovered for bucket {} after {} failures", + self_clone.bucket_name, prev_failures); + } + } + Err(e) => { + self_clone.consecutive_failures.fetch_add(1, Ordering::Relaxed); + error!("Error during sync for bot {}: {}", self_clone.bot_id, e); + } } } }); @@ -76,6 +159,7 @@ impl DriveMonitor { .store(false, std::sync::atomic::Ordering::SeqCst); self.file_states.write().await.clear(); + self.consecutive_failures.store(0, Ordering::Relaxed); info!("DriveMonitor stopped for bot {}", self.bot_id); Ok(()) @@ -86,9 +170,9 @@ impl DriveMonitor { "Drive Monitor service started for bucket: {}", self.bucket_name ); - let mut tick = interval(Duration::from_secs(90)); loop { - tick.tick().await; + let backoff = self.calculate_backoff(); + tokio::time::sleep(backoff).await; if self.is_processing.load(Ordering::Acquire) { log::warn!( @@ -97,10 +181,29 @@ impl DriveMonitor { continue; } + if !self.check_drive_health().await { + let failures = self.consecutive_failures.fetch_add(1, Ordering::Relaxed) + 1; + if failures % 10 == 1 { + warn!("[DRIVE_MONITOR] S3/MinIO unavailable for bucket {} (failures: {}), backing off to {:?}", + self.bucket_name, failures, self.calculate_backoff()); + } + continue; + } + self.is_processing.store(true, Ordering::Release); - if let Err(e) = self.check_for_changes().await { - log::error!("Error checking for drive changes: {}", e); + match self.check_for_changes().await { + Ok(_) => { + let prev_failures = self.consecutive_failures.swap(0, Ordering::Relaxed); + if prev_failures > 0 { + info!("[DRIVE_MONITOR] S3/MinIO recovered for bucket {} after {} failures", + self.bucket_name, prev_failures); + } + } + Err(e) => { + self.consecutive_failures.fetch_add(1, Ordering::Relaxed); + log::error!("Error checking for drive changes: {}", e); + } } self.is_processing.store(false, Ordering::Release); @@ -108,12 +211,52 @@ impl DriveMonitor { }) } async fn check_for_changes(&self) -> Result<(), Box> { + trace!("[PROFILE] check_for_changes ENTER"); + let start_mem = MemoryStats::current(); + trace!("[DRIVE_MONITOR] check_for_changes START, RSS={}", + MemoryStats::format_bytes(start_mem.rss_bytes)); + let Some(client) = &self.state.drive else { + trace!("[PROFILE] check_for_changes: no drive client, returning"); return Ok(()); }; + + trace!("[PROFILE] check_for_changes: calling check_gbdialog_changes..."); + trace!("[DRIVE_MONITOR] Checking gbdialog..."); self.check_gbdialog_changes(client).await?; + trace!("[PROFILE] check_for_changes: check_gbdialog_changes done"); + let after_dialog = MemoryStats::current(); + trace!("[DRIVE_MONITOR] After gbdialog, RSS={} (delta={})", + MemoryStats::format_bytes(after_dialog.rss_bytes), + MemoryStats::format_bytes(after_dialog.rss_bytes.saturating_sub(start_mem.rss_bytes))); + + trace!("[PROFILE] check_for_changes: calling check_gbot..."); + trace!("[DRIVE_MONITOR] Checking gbot..."); self.check_gbot(client).await?; + trace!("[PROFILE] check_for_changes: check_gbot done"); + let after_gbot = MemoryStats::current(); + trace!("[DRIVE_MONITOR] After gbot, RSS={} (delta={})", + MemoryStats::format_bytes(after_gbot.rss_bytes), + MemoryStats::format_bytes(after_gbot.rss_bytes.saturating_sub(after_dialog.rss_bytes))); + + trace!("[PROFILE] check_for_changes: calling check_gbkb_changes..."); + trace!("[DRIVE_MONITOR] Checking gbkb..."); self.check_gbkb_changes(client).await?; + trace!("[PROFILE] check_for_changes: check_gbkb_changes done"); + let after_gbkb = MemoryStats::current(); + trace!("[DRIVE_MONITOR] After gbkb, RSS={} (delta={})", + MemoryStats::format_bytes(after_gbkb.rss_bytes), + MemoryStats::format_bytes(after_gbkb.rss_bytes.saturating_sub(after_gbot.rss_bytes))); + + log_jemalloc_stats(); + + let total_delta = after_gbkb.rss_bytes.saturating_sub(start_mem.rss_bytes); + if total_delta > 50 * 1024 * 1024 { + warn!("[DRIVE_MONITOR] check_for_changes grew by {} - potential leak!", + MemoryStats::format_bytes(total_delta)); + } + + trace!("[PROFILE] check_for_changes EXIT"); Ok(()) } async fn check_gbdialog_changes( @@ -188,6 +331,7 @@ impl DriveMonitor { Ok(()) } async fn check_gbot(&self, client: &Client) -> Result<(), Box> { + trace!("[PROFILE] check_gbot ENTER"); let config_manager = ConfigManager::new(self.state.conn.clone()); debug!("check_gbot: Checking bucket {} for config.csv changes", self.bucket_name); let mut continuation_token = None; @@ -333,6 +477,7 @@ impl DriveMonitor { } continuation_token = list_objects.next_continuation_token; } + trace!("[PROFILE] check_gbot EXIT"); Ok(()) } async fn broadcast_theme_change( @@ -467,6 +612,7 @@ impl DriveMonitor { &self, client: &Client, ) -> Result<(), Box> { + trace!("[PROFILE] check_gbkb_changes ENTER"); let bot_name = self .bucket_name .strip_suffix(".gbai") @@ -510,6 +656,12 @@ impl DriveMonitor { continue; } + let size = obj.size().unwrap_or(0); + if size == 0 { + trace!("Skipping 0-byte file in .gbkb: {}", path); + continue; + } + let file_state = FileState { etag: obj.e_tag().unwrap_or_default().to_string(), }; @@ -561,9 +713,6 @@ impl DriveMonitor { } let path_parts: Vec<&str> = path.split('/').collect(); - // path_parts: [0] = "bot.gbkb", [1] = folder or file, [2+] = nested files - // Skip files directly in .gbkb root (path_parts.len() == 2 means root file) - // Only process files inside subfolders (path_parts.len() >= 3) if path_parts.len() >= 3 { let kb_name = path_parts[1]; let kb_folder_path = self @@ -572,30 +721,59 @@ impl DriveMonitor { .join(&gbkb_prefix) .join(kb_name); - info!( - "Triggering KB indexing for folder: {} (PDF text extraction enabled)", - kb_folder_path.display() - ); - match self - .kb_manager - .handle_gbkb_change(bot_name, &kb_folder_path) - .await - { - Ok(_) => { - debug!( - "Successfully processed KB change for {}/{}", - bot_name, kb_name - ); - } - Err(e) => { - log::error!( - "Failed to process .gbkb change for {}/{}: {}", - bot_name, - kb_name, - e - ); - } + let kb_indexing_disabled = std::env::var("DISABLE_KB_INDEXING") + .map(|v| v == "true" || v == "1") + .unwrap_or(false); + + if kb_indexing_disabled { + debug!("KB indexing disabled via DISABLE_KB_INDEXING, skipping {}", kb_folder_path.display()); + continue; } + + if !is_embedding_server_ready() { + info!("[DRIVE_MONITOR] Embedding server not ready, deferring KB indexing for {}", kb_folder_path.display()); + continue; + } + + let kb_manager = Arc::clone(&self.kb_manager); + let bot_name_owned = bot_name.to_string(); + let kb_name_owned = kb_name.to_string(); + let kb_folder_owned = kb_folder_path.clone(); + + tokio::spawn(async move { + info!( + "Triggering KB indexing for folder: {} (PDF text extraction enabled)", + kb_folder_owned.display() + ); + + match tokio::time::timeout( + Duration::from_secs(KB_INDEXING_TIMEOUT_SECS), + kb_manager.handle_gbkb_change(&bot_name_owned, &kb_folder_owned) + ).await { + Ok(Ok(_)) => { + debug!( + "Successfully processed KB change for {}/{}", + bot_name_owned, kb_name_owned + ); + } + Ok(Err(e)) => { + log::error!( + "Failed to process .gbkb change for {}/{}: {}", + bot_name_owned, + kb_name_owned, + e + ); + } + Err(_) => { + log::error!( + "KB indexing timed out after {}s for {}/{}", + KB_INDEXING_TIMEOUT_SECS, + bot_name_owned, + kb_name_owned + ); + } + } + }); } } } @@ -640,6 +818,7 @@ impl DriveMonitor { } } + trace!("[PROFILE] check_gbkb_changes EXIT"); Ok(()) } diff --git a/src/drive/mod.rs b/src/drive/mod.rs index f00c19418..91daa93c3 100644 --- a/src/drive/mod.rs +++ b/src/drive/mod.rs @@ -8,7 +8,7 @@ use axum::{ routing::{get, post}, Router, }; - +use base64::{engine::general_purpose::STANDARD as BASE64, Engine}; use serde::{Deserialize, Serialize}; use std::sync::Arc; @@ -169,61 +169,82 @@ pub struct RestoreResponse { } #[allow(unused)] +#[derive(Debug, Serialize)] +pub struct BucketInfo { + pub name: String, + pub is_gbai: bool, +} + pub fn configure() -> Router> { Router::new() - .route("/files/list", get(list_files)) - .route("/files/read", post(read_file)) - .route("/files/write", post(write_file)) - .route("/files/save", post(write_file)) - .route("/files/getContents", post(read_file)) - .route("/files/delete", post(delete_file)) - .route("/files/upload", post(upload_file_to_drive)) - .route("/files/download", post(download_file)) - .route("/files/copy", post(copy_file)) - .route("/files/move", post(move_file)) - .route("/files/createFolder", post(create_folder)) - .route("/files/create-folder", post(create_folder)) - .route("/files/dirFolder", post(list_folder_contents)) - .route("/files/search", get(search_files)) - .route("/files/recent", get(recent_files)) - .route("/files/favorite", get(list_favorites)) - .route("/files/shareFolder", post(share_folder)) - .route("/files/shared", get(list_shared)) - .route("/files/permissions", get(get_permissions)) - .route("/files/quota", get(get_quota)) - .route("/files/sync/status", get(sync_status)) - .route("/files/sync/start", post(start_sync)) - .route("/files/sync/stop", post(stop_sync)) - .route("/files/versions", get(list_versions)) - .route("/files/restore", post(restore_version)) - .route("/docs/merge", post(document_processing::merge_documents)) - .route("/docs/convert", post(document_processing::convert_document)) - .route("/docs/fill", post(document_processing::fill_document)) - .route("/docs/export", post(document_processing::export_document)) - .route("/docs/import", post(document_processing::import_document)) + .route("/api/files/buckets", get(list_buckets)) + .route("/api/files/list", get(list_files)) + .route("/api/files/read", post(read_file)) + .route("/api/files/write", post(write_file)) + .route("/api/files/save", post(write_file)) + .route("/api/files/getContents", post(read_file)) + .route("/api/files/delete", post(delete_file)) + .route("/api/files/upload", post(upload_file_to_drive)) + .route("/api/files/download", post(download_file)) + .route("/api/files/copy", post(copy_file)) + .route("/api/files/move", post(move_file)) + .route("/api/files/createFolder", post(create_folder)) + .route("/api/files/create-folder", post(create_folder)) + .route("/api/files/dirFolder", post(list_folder_contents)) + .route("/api/files/search", get(search_files)) + .route("/api/files/recent", get(recent_files)) + .route("/api/files/favorite", get(list_favorites)) + .route("/api/files/shareFolder", post(share_folder)) + .route("/api/files/shared", get(list_shared)) + .route("/api/files/permissions", get(get_permissions)) + .route("/api/files/quota", get(get_quota)) + .route("/api/files/sync/status", get(sync_status)) + .route("/api/files/sync/start", post(start_sync)) + .route("/api/files/sync/stop", post(stop_sync)) + .route("/api/files/versions", get(list_versions)) + .route("/api/files/restore", post(restore_version)) + .route("/api/docs/merge", post(document_processing::merge_documents)) + .route("/api/docs/convert", post(document_processing::convert_document)) + .route("/api/docs/fill", post(document_processing::fill_document)) + .route("/api/docs/export", post(document_processing::export_document)) + .route("/api/docs/import", post(document_processing::import_document)) +} + +pub async fn list_buckets( + State(state): State>, +) -> Result>, (StatusCode, Json)> { + let s3_client = state.drive.as_ref().ok_or_else(|| { + ( + StatusCode::SERVICE_UNAVAILABLE, + Json(serde_json::json!({"error": "S3 service not available"})), + ) + })?; + + let result = s3_client.list_buckets().send().await.map_err(|e| { + ( + StatusCode::INTERNAL_SERVER_ERROR, + Json(serde_json::json!({"error": format!("Failed to list buckets: {}", e)})), + ) + })?; + + let buckets: Vec = result + .buckets() + .iter() + .filter_map(|b| { + b.name().map(|name| BucketInfo { + name: name.to_string(), + is_gbai: name.to_lowercase().ends_with(".gbai"), + }) + }) + .collect(); + + Ok(Json(buckets)) } pub async fn list_files( State(state): State>, Query(params): Query, ) -> Result>, (StatusCode, Json)> { - #[cfg(feature = "console")] - let result = { - let mut tree = FileTree::new(state.clone()); - if let Some(bucket) = ¶ms.bucket { - if let Some(path) = ¶ms.path { - tree.enter_folder(bucket.clone(), path.clone()).await.ok(); - } else { - tree.enter_bucket(bucket.clone()).await.ok(); - } - } else { - tree.load_root().await.ok(); - } - - Ok::, (StatusCode, Json)>(vec![]) - }; - - #[cfg(not(feature = "console"))] let result: Result, (StatusCode, Json)> = { let s3_client = state.drive.as_ref().ok_or_else(|| { ( @@ -402,33 +423,117 @@ pub async fn write_file( State(state): State>, Json(req): Json, ) -> Result, (StatusCode, Json)> { + tracing::debug!( + "write_file called: bucket={}, path={}, content_len={}", + req.bucket, + req.path, + req.content.len() + ); + let s3_client = state.drive.as_ref().ok_or_else(|| { + tracing::error!("S3 client not available for write_file"); ( StatusCode::SERVICE_UNAVAILABLE, Json(serde_json::json!({ "error": "S3 service not available" })), ) })?; + // Try to decode as base64, otherwise use content directly + // Base64 content from file uploads won't have whitespace/newlines at start + // and will only contain valid base64 characters + let is_base64 = is_likely_base64(&req.content); + tracing::debug!("Content detected as base64: {}", is_base64); + + let body_bytes: Vec = if is_base64 { + match BASE64.decode(&req.content) { + Ok(decoded) => { + tracing::debug!("Base64 decoded successfully, size: {} bytes", decoded.len()); + decoded + } + Err(e) => { + tracing::warn!("Base64 decode failed ({}), using raw content", e); + req.content.clone().into_bytes() + } + } + } else { + req.content.into_bytes() + }; + + let sanitized_path = req.path + .replace("//", "/") + .trim_start_matches('/') + .to_string(); + + tracing::debug!("Writing {} bytes to {}/{}", body_bytes.len(), req.bucket, sanitized_path); + s3_client .put_object() .bucket(&req.bucket) - .key(&req.path) - .body(req.content.into_bytes().into()) + .key(&sanitized_path) + .body(body_bytes.into()) .send() .await .map_err(|e| { + tracing::error!("S3 put_object failed: {:?}", e); ( StatusCode::INTERNAL_SERVER_ERROR, Json(serde_json::json!({ "error": format!("Failed to write file: {}", e) })), ) })?; + tracing::info!("File written successfully: {}/{}", req.bucket, sanitized_path); Ok(Json(SuccessResponse { success: true, message: Some("File written successfully".to_string()), })) } +/// Check if a string is likely base64 encoded content (from file upload) +/// Base64 from DataURL will be pure base64 without newlines at start +fn is_likely_base64(s: &str) -> bool { + // Empty or very short strings are not base64 uploads + if s.len() < 20 { + return false; + } + + // If it starts with common text patterns, it's not base64 + let trimmed = s.trim_start(); + if trimmed.starts_with('#') // Markdown, shell scripts + || trimmed.starts_with("//") // Comments + || trimmed.starts_with("/*") // C-style comments + || trimmed.starts_with('{') // JSON + || trimmed.starts_with('[') // JSON array + || trimmed.starts_with('<') // XML/HTML + || trimmed.starts_with(">, Json(req): Json, diff --git a/src/llm/claude.rs b/src/llm/claude.rs index 3726a68b5..4ed1239b6 100644 --- a/src/llm/claude.rs +++ b/src/llm/claude.rs @@ -3,10 +3,13 @@ use futures::StreamExt; use log::{info, trace, warn}; use serde::{Deserialize, Serialize}; use serde_json::Value; +use std::time::Duration; use tokio::sync::mpsc; use super::{llm_models::get_handler, LLMProvider}; +const LLM_TIMEOUT_SECS: u64 = 300; + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct ClaudeMessage { pub role: String, @@ -74,8 +77,14 @@ impl ClaudeClient { pub fn new(base_url: String, deployment_name: Option) -> Self { let is_azure = base_url.contains("azure.com") || base_url.contains("openai.azure.com"); + let client = reqwest::Client::builder() + .timeout(Duration::from_secs(LLM_TIMEOUT_SECS)) + .connect_timeout(Duration::from_secs(30)) + .build() + .unwrap_or_else(|_| reqwest::Client::new()); + Self { - client: reqwest::Client::new(), + client, base_url, deployment_name: deployment_name.unwrap_or_else(|| "claude-opus-4-5".to_string()), is_azure, @@ -83,8 +92,14 @@ impl ClaudeClient { } pub fn azure(endpoint: String, deployment_name: String) -> Self { + let client = reqwest::Client::builder() + .timeout(Duration::from_secs(LLM_TIMEOUT_SECS)) + .connect_timeout(Duration::from_secs(30)) + .build() + .unwrap_or_else(|_| reqwest::Client::new()); + Self { - client: reqwest::Client::new(), + client, base_url: endpoint, deployment_name, is_azure: true, @@ -93,7 +108,6 @@ impl ClaudeClient { fn build_url(&self) -> String { if self.is_azure { - // Azure Claude exposes Anthropic API directly at /v1/messages format!("{}/v1/messages", self.base_url.trim_end_matches('/')) } else { format!("{}/v1/messages", self.base_url.trim_end_matches('/')) @@ -103,8 +117,6 @@ impl ClaudeClient { fn build_headers(&self, api_key: &str) -> reqwest::header::HeaderMap { let mut headers = reqwest::header::HeaderMap::new(); - // Both Azure Claude and direct Anthropic use the same headers - // Azure Claude proxies the Anthropic API format if let Ok(val) = api_key.parse() { headers.insert("x-api-key", val); } @@ -119,17 +131,12 @@ impl ClaudeClient { headers } - /// Normalize role names for Claude API compatibility. - /// Claude only accepts "user" or "assistant" roles in messages. - /// - "episodic" and "compact" roles (conversation summaries) are converted to "user" with a context prefix - /// - "system" roles should be handled separately (not in messages array) - /// - Unknown roles default to "user" fn normalize_role(role: &str) -> Option<(String, bool)> { match role { "user" => Some(("user".to_string(), false)), "assistant" => Some(("assistant".to_string(), false)), - "system" => None, // System messages handled separately - "episodic" | "compact" => Some(("user".to_string(), true)), // Mark as context + "system" => None, + "episodic" | "compact" => Some(("user".to_string(), true)), _ => Some(("user".to_string(), false)), } } @@ -148,10 +155,9 @@ impl ClaudeClient { system_parts.push(context_data.to_string()); } - // Extract episodic memory content and add to system prompt for (role, content) in history { if role == "episodic" || role == "compact" { - system_parts.push(format!("[Previous conversation summary]: {}", content)); + system_parts.push(format!("[Previous conversation summary]: {content}")); } } @@ -171,7 +177,7 @@ impl ClaudeClient { content: content.clone(), }) } - _ => None, // Skip system, episodic, compact (already in system prompt) + _ => None, } }) .collect(); @@ -221,7 +227,6 @@ impl LLMProvider for ClaudeClient { .filter_map(|m| { let role = m["role"].as_str().unwrap_or("user"); let content = m["content"].as_str().unwrap_or(""); - // Skip system messages (handled separately), episodic/compact (context), and empty content if role == "system" || role == "episodic" || role == "compact" || content.is_empty() { None } else { @@ -244,7 +249,6 @@ impl LLMProvider for ClaudeClient { }] }; - // Ensure at least one user message exists if claude_messages.is_empty() && !prompt.is_empty() { claude_messages.push(ClaudeMessage { role: "user".to_string(), @@ -268,7 +272,6 @@ impl LLMProvider for ClaudeClient { let system = system_prompt.filter(|s| !s.is_empty()); - // Validate we have at least one message with content if claude_messages.is_empty() { return Err("Cannot send request to Claude: no messages with content".into()); } @@ -281,26 +284,58 @@ impl LLMProvider for ClaudeClient { stream: None, }; - info!("Claude request to {}: model={}", url, model_name); + info!("Claude request to {url}: model={model_name}"); trace!("Claude request body: {:?}", serde_json::to_string(&request)); - let response = self + let start_time = std::time::Instant::now(); + + let response = match self .client .post(&url) .headers(headers) .json(&request) .send() - .await?; + .await + { + Ok(resp) => resp, + Err(e) => { + let elapsed = start_time.elapsed(); + if e.is_timeout() { + warn!("Claude request timed out after {elapsed:?} (limit: {LLM_TIMEOUT_SECS}s)"); + return Err(format!("Claude request timed out after {LLM_TIMEOUT_SECS}s").into()); + } + warn!("Claude request failed after {elapsed:?}: {e}"); + return Err(e.into()); + } + }; + let elapsed = start_time.elapsed(); let status = response.status(); + if !status.is_success() { let error_text = response.text().await.unwrap_or_default(); - warn!("Claude API error ({}): {}", status, error_text); - return Err(format!("Claude API error ({}): {}", status, error_text).into()); + warn!("Claude API error ({status}) after {elapsed:?}: {error_text}"); + return Err(format!("Claude API error ({status}): {error_text}").into()); } - let result: ClaudeResponse = response.json().await?; + info!("Claude response received in {elapsed:?}, status={status}"); + + let result: ClaudeResponse = match response.json().await { + Ok(r) => r, + Err(e) => { + warn!("Failed to parse Claude response: {e}"); + return Err(format!("Failed to parse Claude response: {e}").into()); + } + }; + let raw_content = self.extract_text_from_response(&result); + let content_len = raw_content.len(); + + info!( + "Claude response parsed: id={}, stop_reason={:?}, content_length={content_len}", + result.id, + result.stop_reason + ); let handler = get_handler(model_name); let content = handler.process_content(&raw_content); @@ -338,7 +373,6 @@ impl LLMProvider for ClaudeClient { .filter_map(|m| { let role = m["role"].as_str().unwrap_or("user"); let content = m["content"].as_str().unwrap_or(""); - // Skip system messages (handled separately), episodic/compact (context), and empty content if role == "system" || role == "episodic" || role == "compact" || content.is_empty() { None } else { @@ -361,7 +395,6 @@ impl LLMProvider for ClaudeClient { }] }; - // Ensure at least one user message exists if claude_messages.is_empty() && !prompt.is_empty() { claude_messages.push(ClaudeMessage { role: "user".to_string(), @@ -385,7 +418,6 @@ impl LLMProvider for ClaudeClient { let system = system_prompt.filter(|s| !s.is_empty()); - // Validate we have at least one message with content if claude_messages.is_empty() { return Err("Cannot send streaming request to Claude: no messages with content".into()); } @@ -398,25 +430,42 @@ impl LLMProvider for ClaudeClient { stream: Some(true), }; - info!("Claude streaming request to {}: model={}", url, model_name); + info!("Claude streaming request to {url}: model={model_name}"); - let response = self + let start_time = std::time::Instant::now(); + + let response = match self .client .post(&url) .headers(headers) .json(&request) .send() - .await?; + .await + { + Ok(resp) => resp, + Err(e) => { + let elapsed = start_time.elapsed(); + if e.is_timeout() { + warn!("Claude streaming request timed out after {elapsed:?}"); + return Err(format!("Claude streaming request timed out after {LLM_TIMEOUT_SECS}s").into()); + } + warn!("Claude streaming request failed after {elapsed:?}: {e}"); + return Err(e.into()); + } + }; let status = response.status(); if !status.is_success() { let error_text = response.text().await.unwrap_or_default(); - warn!("Claude streaming API error ({}): {}", status, error_text); - return Err(format!("Claude streaming API error ({}): {}", status, error_text).into()); + warn!("Claude streaming API error ({status}): {error_text}"); + return Err(format!("Claude streaming API error ({status}): {error_text}").into()); } + info!("Claude streaming connection established in {:?}", start_time.elapsed()); + let handler = get_handler(model_name); let mut stream = response.bytes_stream(); + let mut total_chunks = 0; while let Some(chunk) = stream.next().await { let chunk = chunk?; @@ -439,6 +488,7 @@ impl LLMProvider for ClaudeClient { let processed = handler.process_content(&delta.text); if !processed.is_empty() { let _ = tx.send(processed).await; + total_chunks += 1; } } } @@ -448,6 +498,12 @@ impl LLMProvider for ClaudeClient { } } + info!( + "Claude streaming completed in {:?}, chunks={}", + start_time.elapsed(), + total_chunks + ); + Ok(()) } @@ -490,7 +546,6 @@ mod tests { "claude-opus-4-5".to_string(), ); let url = client.build_url(); - // Azure Claude uses Anthropic API format directly assert!(url.contains("/v1/messages")); } diff --git a/src/llm/local.rs b/src/llm/local.rs index 0a123bce5..a8b026671 100644 --- a/src/llm/local.rs +++ b/src/llm/local.rs @@ -1,8 +1,10 @@ use crate::config::ConfigManager; +use crate::core::kb::embedding_generator::set_embedding_server_ready; +use crate::core::shared::memory_monitor::{log_jemalloc_stats, MemoryStats}; use crate::shared::models::schema::bots::dsl::*; use crate::shared::state::AppState; use diesel::prelude::*; -use log::{error, info, warn}; +use log::{error, info, trace, warn}; use reqwest; use std::fmt::Write; use std::sync::Arc; @@ -11,7 +13,14 @@ use tokio; pub async fn ensure_llama_servers_running( app_state: Arc, ) -> Result<(), Box> { + trace!("[PROFILE] ensure_llama_servers_running ENTER"); + let start_mem = MemoryStats::current(); + trace!("[LLM_LOCAL] ensure_llama_servers_running START, RSS={}", + MemoryStats::format_bytes(start_mem.rss_bytes)); + log_jemalloc_stats(); + if std::env::var("SKIP_LLM_SERVER").is_ok() { + trace!("[PROFILE] SKIP_LLM_SERVER set, returning early"); info!("SKIP_LLM_SERVER set - skipping local LLM server startup (using mock/external LLM)"); return Ok(()); } @@ -74,6 +83,9 @@ pub async fn ensure_llama_servers_running( info!(" Embedding Model: {embedding_model}"); info!(" LLM Server Path: {llm_server_path}"); info!("Restarting any existing llama-server processes..."); + trace!("[PROFILE] About to pkill llama-server..."); + let before_pkill = MemoryStats::current(); + trace!("[LLM_LOCAL] Before pkill, RSS={}", MemoryStats::format_bytes(before_pkill.rss_bytes)); if let Err(e) = tokio::process::Command::new("sh") .arg("-c") @@ -85,6 +97,12 @@ pub async fn ensure_llama_servers_running( tokio::time::sleep(tokio::time::Duration::from_secs(2)).await; info!("Existing llama-server processes terminated (if any)"); } + trace!("[PROFILE] pkill done"); + + let after_pkill = MemoryStats::current(); + trace!("[LLM_LOCAL] After pkill, RSS={} (delta={})", + MemoryStats::format_bytes(after_pkill.rss_bytes), + MemoryStats::format_bytes(after_pkill.rss_bytes.saturating_sub(before_pkill.rss_bytes))); let llm_running = if llm_url.starts_with("https://") { info!("Using external HTTPS LLM server, skipping local startup"); @@ -135,13 +153,27 @@ pub async fn ensure_llama_servers_running( task.await??; } info!("Waiting for servers to become ready..."); + trace!("[PROFILE] Starting wait loop for servers..."); + let before_wait = MemoryStats::current(); + trace!("[LLM_LOCAL] Before wait loop, RSS={}", MemoryStats::format_bytes(before_wait.rss_bytes)); + let mut llm_ready = llm_running || llm_model.is_empty(); let mut embedding_ready = embedding_running || embedding_model.is_empty(); let mut attempts = 0; let max_attempts = 120; while attempts < max_attempts && (!llm_ready || !embedding_ready) { + trace!("[PROFILE] Wait loop iteration {}", attempts); tokio::time::sleep(tokio::time::Duration::from_secs(2)).await; + if attempts % 5 == 0 { + let loop_mem = MemoryStats::current(); + trace!("[LLM_LOCAL] Wait loop attempt {}, RSS={} (delta from start={})", + attempts, + MemoryStats::format_bytes(loop_mem.rss_bytes), + MemoryStats::format_bytes(loop_mem.rss_bytes.saturating_sub(before_wait.rss_bytes))); + log_jemalloc_stats(); + } + if attempts % 5 == 0 { info!( "Checking server health (attempt {}/{max_attempts})...", @@ -160,6 +192,7 @@ pub async fn ensure_llama_servers_running( if is_server_running(&embedding_url).await { info!("Embedding server ready at {embedding_url}"); embedding_ready = true; + set_embedding_server_ready(true); } else if attempts % 10 == 0 { warn!("Embedding server not ready yet at {embedding_url}"); @@ -185,11 +218,29 @@ pub async fn ensure_llama_servers_running( } if llm_ready && embedding_ready { info!("All llama.cpp servers are ready and responding!"); + if !embedding_model.is_empty() { + set_embedding_server_ready(true); + } + trace!("[PROFILE] Servers ready!"); + + let after_ready = MemoryStats::current(); + trace!("[LLM_LOCAL] Servers ready, RSS={} (delta from start={})", + MemoryStats::format_bytes(after_ready.rss_bytes), + MemoryStats::format_bytes(after_ready.rss_bytes.saturating_sub(start_mem.rss_bytes))); + log_jemalloc_stats(); let _llm_provider1 = Arc::new(crate::llm::OpenAIClient::new( llm_model.clone(), Some(llm_url.clone()), )); + + let end_mem = MemoryStats::current(); + trace!("[LLM_LOCAL] ensure_llama_servers_running END, RSS={} (total delta={})", + MemoryStats::format_bytes(end_mem.rss_bytes), + MemoryStats::format_bytes(end_mem.rss_bytes.saturating_sub(start_mem.rss_bytes))); + log_jemalloc_stats(); + + trace!("[PROFILE] ensure_llama_servers_running EXIT OK"); Ok(()) } else { let mut error_msg = "Servers failed to start within timeout:".to_string(); diff --git a/src/main.rs b/src/main.rs index aa36e201b..27119f327 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3 +1,11 @@ +// Use jemalloc as the global allocator when the feature is enabled +#[cfg(feature = "jemalloc")] +use tikv_jemallocator::Jemalloc; + +#[cfg(feature = "jemalloc")] +#[global_allocator] +static GLOBAL: Jemalloc = Jemalloc; + use axum::extract::{Extension, State}; use axum::http::StatusCode; use axum::middleware; @@ -25,6 +33,10 @@ use botlib::SystemLimits; use botserver::core; use botserver::shared; +use botserver::core::shared::memory_monitor::{ + start_memory_monitor, log_process_memory, MemoryStats, + register_thread, record_thread_activity +}; use botserver::core::automation; use botserver::core::bootstrap; @@ -105,8 +117,7 @@ async fn health_check_simple() -> (StatusCode, Json) { fn print_shutdown_message() { println!(); - println!("\x1b[33m✨ Thank you for using General Bots!\x1b[0m"); - println!("\x1b[36m pragmatismo.com.br\x1b[0m"); + println!("Thank you for using General Bots!"); println!(); } @@ -819,6 +830,10 @@ async fn main() -> std::io::Result<()> { botserver::core::shared::state::AttendantNotification, >(1000); + let (task_progress_tx, _task_progress_rx) = tokio::sync::broadcast::channel::< + botserver::core::shared::state::TaskProgressEvent, + >(1000); + let app_state = Arc::new(AppState { drive: Some(drive.clone()), s3_client: Some(drive), @@ -852,6 +867,7 @@ async fn main() -> std::io::Result<()> { ext }, attendant_broadcast: Some(attendant_tx), + task_progress_broadcast: Some(task_progress_tx), }); let task_scheduler = Arc::new(botserver::tasks::scheduler::TaskScheduler::new( @@ -864,6 +880,11 @@ async fn main() -> std::io::Result<()> { log::warn!("Failed to start website crawler service: {}", e); } + // Start memory monitoring - check every 30 seconds, warn if growth > 50MB + start_memory_monitor(30, 50); + info!("Memory monitor started"); + log_process_memory(); + let _ = state_tx.try_send(app_state.clone()); progress_tx.send(BootstrapProgress::BootstrapComplete).ok(); @@ -876,10 +897,6 @@ async fn main() -> std::io::Result<()> { .map(|n| n.get()) .unwrap_or(4); - let _automation_service = - botserver::core::automation::AutomationService::new(app_state.clone()); - info!("Automation service initialized with episodic memory scheduler"); - let bot_orchestrator = BotOrchestrator::new(app_state.clone()); if let Err(e) = bot_orchestrator.mount_all_bots() { error!("Failed to mount bots: {}", e); @@ -891,37 +908,57 @@ async fn main() -> std::io::Result<()> { let bucket_name = "default.gbai".to_string(); let monitor_bot_id = default_bot_id; tokio::spawn(async move { + register_thread("drive-monitor", "drive"); + trace!("[PROFILE] DriveMonitor::new starting..."); let monitor = botserver::DriveMonitor::new( drive_monitor_state, bucket_name.clone(), monitor_bot_id, ); + trace!("[PROFILE] DriveMonitor::new done, calling start_monitoring..."); info!("Starting DriveMonitor for bucket: {}", bucket_name); if let Err(e) = monitor.start_monitoring().await { error!("DriveMonitor failed: {}", e); } + trace!("[PROFILE] DriveMonitor start_monitoring returned"); }); } let automation_state = app_state.clone(); tokio::spawn(async move { + register_thread("automation-service", "automation"); let automation = AutomationService::new(automation_state); - automation.spawn().await.ok(); + trace!("[TASK] AutomationService starting, RSS={}", + MemoryStats::format_bytes(MemoryStats::current().rss_bytes)); + loop { + record_thread_activity("automation-service"); + if let Err(e) = automation.check_scheduled_tasks().await { + error!("Error checking scheduled tasks: {}", e); + } + tokio::time::sleep(tokio::time::Duration::from_secs(60)).await; + } }); let app_state_for_llm = app_state.clone(); tokio::spawn(async move { + register_thread("llm-server-init", "llm"); + eprintln!("[PROFILE] ensure_llama_servers_running starting..."); if let Err(e) = ensure_llama_servers_running(app_state_for_llm).await { error!("Failed to start LLM servers: {}", e); } + eprintln!("[PROFILE] ensure_llama_servers_running completed"); + record_thread_activity("llm-server-init"); }); trace!("Initial data setup task spawned"); + eprintln!("[PROFILE] All background tasks spawned, starting HTTP server..."); trace!("Starting HTTP server on port {}...", config.server.port); + eprintln!("[PROFILE] run_axum_server starting on port {}...", config.server.port); if let Err(e) = run_axum_server(app_state, config.server.port, worker_count).await { error!("Failed to start HTTP server: {}", e); std::process::exit(1); } + eprintln!("[PROFILE] run_axum_server returned (should not happen normally)"); if let Some(handle) = ui_handle { handle.join().ok(); diff --git a/src/security/integration.rs b/src/security/integration.rs index 260daaa52..f9b1cb941 100644 --- a/src/security/integration.rs +++ b/src/security/integration.rs @@ -95,10 +95,10 @@ impl TlsIntegration { services.insert( "minio".to_string(), ServiceUrls { - original: "http://localhost:9000".to_string(), - secure: "https://localhost:9001".to_string(), + original: "https://localhost:9000".to_string(), + secure: "https://localhost:9000".to_string(), port: 9000, - tls_port: 9001, + tls_port: 9000, }, ); diff --git a/src/tasks/mod.rs b/src/tasks/mod.rs index 8731f23e3..469c7db2d 100644 --- a/src/tasks/mod.rs +++ b/src/tasks/mod.rs @@ -351,15 +351,108 @@ pub async fn handle_task_delete( pub async fn handle_task_get( State(state): State>, - Path(id): Path, -) -> Result, StatusCode> { - let task_engine = &state.task_engine; + Path(id): Path, +) -> impl IntoResponse { + let conn = state.conn.clone(); + let task_id = id.clone(); - match task_engine.get_task(id).await { - Ok(task) => Ok(Json(task.into())), + let result = tokio::task::spawn_blocking(move || { + let mut db_conn = conn + .get() + .map_err(|e| format!("DB connection error: {}", e))?; + + #[derive(Debug, QueryableByName, serde::Serialize)] + struct AutoTaskRow { + #[diesel(sql_type = diesel::sql_types::Uuid)] + pub id: Uuid, + #[diesel(sql_type = diesel::sql_types::Text)] + pub title: String, + #[diesel(sql_type = diesel::sql_types::Text)] + pub status: String, + #[diesel(sql_type = diesel::sql_types::Text)] + pub priority: String, + #[diesel(sql_type = diesel::sql_types::Nullable)] + pub intent: Option, + #[diesel(sql_type = diesel::sql_types::Nullable)] + pub error: Option, + #[diesel(sql_type = diesel::sql_types::Float)] + pub progress: f32, + #[diesel(sql_type = diesel::sql_types::Timestamptz)] + pub created_at: chrono::DateTime, + #[diesel(sql_type = diesel::sql_types::Nullable)] + pub completed_at: Option>, + } + + let parsed_uuid = match Uuid::parse_str(&task_id) { + Ok(u) => u, + Err(_) => return Err(format!("Invalid task ID: {}", task_id)), + }; + + let task: Option = diesel::sql_query( + "SELECT id, title, status, priority, intent, error, progress, created_at, completed_at + FROM auto_tasks WHERE id = $1 LIMIT 1" + ) + .bind::(parsed_uuid) + .get_result(&mut db_conn) + .ok(); + + Ok::<_, String>(task) + }) + .await + .unwrap_or_else(|e| { + log::error!("Task query failed: {}", e); + Err(format!("Task query failed: {}", e)) + }); + + match result { + Ok(Some(task)) => { + let status_class = match task.status.as_str() { + "completed" | "done" => "completed", + "running" | "pending" => "running", + "failed" | "error" => "error", + _ => "pending" + }; + let progress_percent = (task.progress * 100.0) as u8; + let created = task.created_at.format("%Y-%m-%d %H:%M").to_string(); + let completed = task.completed_at.map(|d| d.format("%Y-%m-%d %H:%M").to_string()).unwrap_or_default(); + + let html = format!(r#" +
+

{}

+ {} +
+
+
Priority: {}
+
Created: {}
+ {} +
+
+
Progress: {}%
+
+
+
+
+ {} + {} + "#, + task.title, + status_class, + task.status, + task.priority, + created, + if !completed.is_empty() { format!(r#"
Completed: {}
"#, completed) } else { String::new() }, + progress_percent, + progress_percent, + task.intent.map(|i| format!(r#"

Intent

{}

"#, i)).unwrap_or_default(), + task.error.map(|e| format!(r#"

Error

{}

"#, e)).unwrap_or_default() + ); + (StatusCode::OK, axum::response::Html(html)).into_response() + } + Ok(None) => { + (StatusCode::NOT_FOUND, axum::response::Html("
Task not found
".to_string())).into_response() + } Err(e) => { - log::error!("Failed to get task: {}", e); - Err(StatusCode::NOT_FOUND) + (StatusCode::INTERNAL_SERVER_ERROR, axum::response::Html(format!("
{}
", e))).into_response() } } } @@ -1209,6 +1302,7 @@ pub fn configure_task_routes() -> Router> { ) .route("/api/tasks/stats", get(handle_task_stats_htmx)) .route("/api/tasks/stats/json", get(handle_task_stats)) + .route("/api/tasks/time-saved", get(handle_time_saved)) .route("/api/tasks/completed", delete(handle_clear_completed)) .route( &ApiUrls::TASK_BY_ID.replace(":id", "{id}"), @@ -1270,12 +1364,15 @@ pub async fn handle_task_list_htmx( .map_err(|e| format!("DB connection error: {}", e))?; let mut query = String::from( - "SELECT id, title, status, priority, description, due_date FROM tasks WHERE 1=1", + "SELECT id, title, status, priority, intent as description, NULL::timestamp as due_date FROM auto_tasks WHERE 1=1", ); match filter_clone.as_str() { - "active" => query.push_str(" AND status NOT IN ('done', 'completed', 'cancelled')"), - "completed" => query.push_str(" AND status IN ('done', 'completed')"), + "complete" | "completed" => query.push_str(" AND status IN ('done', 'completed')"), + "active" => query.push_str(" AND status IN ('running', 'pending', 'in_progress')"), + "awaiting" => query.push_str(" AND status IN ('awaiting_decision', 'awaiting', 'waiting')"), + "paused" => query.push_str(" AND status = 'paused'"), + "blocked" => query.push_str(" AND status IN ('blocked', 'failed', 'error')"), "priority" => query.push_str(" AND priority IN ('high', 'urgent')"), _ => {} } @@ -1297,15 +1394,8 @@ pub async fn handle_task_list_htmx( for task in tasks { let is_completed = task.status == "done" || task.status == "completed"; - let is_high_priority = task.priority == "high" || task.priority == "urgent"; let completed_class = if is_completed { "completed" } else { "" }; - let priority_class = if is_high_priority { "active" } else { "" }; - let checked = if is_completed { "checked" } else { "" }; - let status_html = format!( - r#"{}"#, - task.status, task.status - ); let due_date_html = if let Some(due) = &task.due_date { format!( r#" {}"#, @@ -1314,52 +1404,47 @@ pub async fn handle_task_list_htmx( } else { String::new() }; + let status_class = match task.status.as_str() { + "completed" | "done" => "status-complete", + "running" | "pending" | "in_progress" => "status-running", + "failed" | "error" | "blocked" => "status-error", + "paused" => "status-paused", + "awaiting" | "awaiting_decision" => "status-awaiting", + _ => "status-pending" + }; + let _ = write!( html, r#" -
- -
-
- {} -
- {status_html} - {} - {due_date_html} -
-
+
+
+ {} + {}
-
- - -
"#, task.id, task.id, - task.id, task.title, + status_class, + task.status, task.priority, task.priority, task.id, - task.id, task.id ); } @@ -1396,34 +1481,58 @@ pub async fn handle_task_stats_htmx(State(state): State>) -> impl .get() .map_err(|e| format!("DB connection error: {}", e))?; - let total: i64 = diesel::sql_query("SELECT COUNT(*) as count FROM tasks") + let total: i64 = diesel::sql_query("SELECT COUNT(*) as count FROM auto_tasks") .get_result::(&mut db_conn) .map(|r| r.count) .unwrap_or(0); let active: i64 = - diesel::sql_query("SELECT COUNT(*) as count FROM tasks WHERE status NOT IN ('done', 'completed', 'cancelled')") + diesel::sql_query("SELECT COUNT(*) as count FROM auto_tasks WHERE status IN ('running', 'pending', 'in_progress')") .get_result::(&mut db_conn) .map(|r| r.count) .unwrap_or(0); let completed: i64 = - diesel::sql_query("SELECT COUNT(*) as count FROM tasks WHERE status IN ('done', 'completed')") + diesel::sql_query("SELECT COUNT(*) as count FROM auto_tasks WHERE status IN ('done', 'completed')") + .get_result::(&mut db_conn) + .map(|r| r.count) + .unwrap_or(0); + + let awaiting: i64 = + diesel::sql_query("SELECT COUNT(*) as count FROM auto_tasks WHERE status IN ('awaiting_decision', 'awaiting', 'waiting')") + .get_result::(&mut db_conn) + .map(|r| r.count) + .unwrap_or(0); + + let paused: i64 = + diesel::sql_query("SELECT COUNT(*) as count FROM auto_tasks WHERE status = 'paused'") + .get_result::(&mut db_conn) + .map(|r| r.count) + .unwrap_or(0); + + let blocked: i64 = + diesel::sql_query("SELECT COUNT(*) as count FROM auto_tasks WHERE status IN ('blocked', 'failed', 'error')") .get_result::(&mut db_conn) .map(|r| r.count) .unwrap_or(0); let priority: i64 = - diesel::sql_query("SELECT COUNT(*) as count FROM tasks WHERE priority IN ('high', 'urgent')") + diesel::sql_query("SELECT COUNT(*) as count FROM auto_tasks WHERE priority IN ('high', 'urgent')") .get_result::(&mut db_conn) .map(|r| r.count) .unwrap_or(0); + let time_saved = format!("{}h", completed * 2); + Ok::<_, String>(TaskStats { total: total as usize, active: active as usize, completed: completed as usize, + awaiting: awaiting as usize, + paused: paused as usize, + blocked: blocked as usize, priority: priority as usize, + time_saved, }) }) .await @@ -1435,18 +1544,25 @@ pub async fn handle_task_stats_htmx(State(state): State>) -> impl total: 0, active: 0, completed: 0, + awaiting: 0, + paused: 0, + blocked: 0, priority: 0, + time_saved: "0h".to_string(), }); let html = format!( "{} tasks ", - stats.total, stats.total, stats.active, stats.completed, stats.priority + stats.total, stats.total, stats.completed, stats.active, stats.awaiting, stats.paused, stats.blocked, stats.time_saved ); axum::response::Html(html) @@ -1460,34 +1576,58 @@ pub async fn handle_task_stats(State(state): State>) -> Json(&mut db_conn) .map(|r| r.count) .unwrap_or(0); let active: i64 = - diesel::sql_query("SELECT COUNT(*) as count FROM tasks WHERE status NOT IN ('done', 'completed', 'cancelled')") + diesel::sql_query("SELECT COUNT(*) as count FROM auto_tasks WHERE status IN ('running', 'pending', 'in_progress')") .get_result::(&mut db_conn) .map(|r| r.count) .unwrap_or(0); let completed: i64 = - diesel::sql_query("SELECT COUNT(*) as count FROM tasks WHERE status IN ('done', 'completed')") + diesel::sql_query("SELECT COUNT(*) as count FROM auto_tasks WHERE status IN ('done', 'completed')") + .get_result::(&mut db_conn) + .map(|r| r.count) + .unwrap_or(0); + + let awaiting: i64 = + diesel::sql_query("SELECT COUNT(*) as count FROM auto_tasks WHERE status IN ('awaiting_decision', 'awaiting', 'waiting')") + .get_result::(&mut db_conn) + .map(|r| r.count) + .unwrap_or(0); + + let paused: i64 = + diesel::sql_query("SELECT COUNT(*) as count FROM auto_tasks WHERE status = 'paused'") + .get_result::(&mut db_conn) + .map(|r| r.count) + .unwrap_or(0); + + let blocked: i64 = + diesel::sql_query("SELECT COUNT(*) as count FROM auto_tasks WHERE status IN ('blocked', 'failed', 'error')") .get_result::(&mut db_conn) .map(|r| r.count) .unwrap_or(0); let priority: i64 = - diesel::sql_query("SELECT COUNT(*) as count FROM tasks WHERE priority IN ('high', 'urgent')") + diesel::sql_query("SELECT COUNT(*) as count FROM auto_tasks WHERE priority IN ('high', 'urgent')") .get_result::(&mut db_conn) .map(|r| r.count) .unwrap_or(0); + let time_saved = format!("{}h", completed * 2); + Ok::<_, String>(TaskStats { total: total as usize, active: active as usize, completed: completed as usize, + awaiting: awaiting as usize, + paused: paused as usize, + blocked: blocked as usize, priority: priority as usize, + time_saved, }) }) .await @@ -1499,12 +1639,43 @@ pub async fn handle_task_stats(State(state): State>) -> Json>) -> impl IntoResponse { + let conn = state.conn.clone(); + + let time_saved = tokio::task::spawn_blocking(move || { + let mut db_conn = match conn.get() { + Ok(c) => c, + Err(_) => return "0h".to_string(), + }; + + let completed: i64 = + diesel::sql_query("SELECT COUNT(*) as count FROM auto_tasks WHERE status IN ('done', 'completed')") + .get_result::(&mut db_conn) + .map(|r| r.count) + .unwrap_or(0); + + format!("{}h", completed * 2) + }) + .await + .unwrap_or_else(|_| "0h".to_string()); + + axum::response::Html(format!( + r#"Active Time Saved: + {}"#, + time_saved + )) +} + pub async fn handle_clear_completed(State(state): State>) -> impl IntoResponse { let conn = state.conn.clone(); @@ -1513,7 +1684,7 @@ pub async fn handle_clear_completed(State(state): State>) -> impl .get() .map_err(|e| format!("DB connection error: {}", e))?; - diesel::sql_query("DELETE FROM tasks WHERE status IN ('done', 'completed')") + diesel::sql_query("DELETE FROM auto_tasks WHERE status IN ('done', 'completed')") .execute(&mut db_conn) .map_err(|e| format!("Delete failed: {}", e))?; @@ -1595,7 +1766,11 @@ pub struct TaskStats { pub total: usize, pub active: usize, pub completed: usize, + pub awaiting: usize, + pub paused: usize, + pub blocked: usize, pub priority: usize, + pub time_saved: String, } #[derive(Debug, Serialize, Deserialize)]