From 7a1ec157f10f6aba6e7fa329fad30219da4049d4 Mon Sep 17 00:00:00 2001 From: "Rodrigo Rodriguez (Pragmatismo)" Date: Sat, 11 Apr 2026 21:26:02 -0300 Subject: [PATCH] Fix KB indexing: upsert kb_collections, consistent collection names, preserve indexed flag - Bug 1: check_gbkb_changes now preserves indexed=true from previous state when etag matches, preventing redundant re-indexing every cycle - Bug 2: USE KB fallback uses bot_id_short (8 chars) instead of random UUID, matching the collection name convention used by DriveMonitor - Bug 3: handle_gbkb_change now upserts into kb_collections table after successful indexing, so USE KB can find the collection at runtime - Changed ON CONFLICT DO NOTHING to DO UPDATE for kb_collections inserts - Changed process_gbkb_folder return type to Result<IndexingResult> --- src/basic/keywords/use_kb.rs | 11 +++++--- src/core/kb/kb_indexer.rs | 4 +-- src/core/kb/mod.rs | 48 ++++++++++++++++++++++++++++++++-- src/drive/drive_monitor/mod.rs | 7 ++++- 4 files changed, 61 insertions(+), 9 deletions(-) diff --git a/src/basic/keywords/use_kb.rs b/src/basic/keywords/use_kb.rs index 01e74e71..f11e8629 100644 --- a/src/basic/keywords/use_kb.rs +++ b/src/basic/keywords/use_kb.rs @@ -220,18 +220,21 @@ fn add_kb_to_session( (kb_result.folder_path, kb_result.qdrant_collection) } else { let default_path = format!("work/{}/{}.gbkb/{}", bot_name, bot_name, kb_name); + let bot_id_short: String = bot_id.to_string().chars().take(8).collect(); + let default_collection = format!("{}_{}_{}", bot_name, bot_id_short, kb_name); let kb_id = Uuid::new_v4(); - let default_collection = format!("{}_{}_{}", bot_name, kb_id, kb_name); warn!( - "KB '{}' not found in kb_collections for bot {}. Using default path: {}", - kb_name, bot_name, default_path + "KB '{}' not found in kb_collections for bot {}. 
Using default path: {}, collection: {}", + kb_name, bot_name, default_path, default_collection ); diesel::sql_query( "INSERT INTO kb_collections (id, bot_id, name, folder_path, qdrant_collection, document_count) VALUES ($1, $2, $3, $4, $5, 0) - ON CONFLICT (bot_id, name) DO NOTHING" + ON CONFLICT (bot_id, name) DO UPDATE SET + folder_path = EXCLUDED.folder_path, + qdrant_collection = EXCLUDED.qdrant_collection" ) .bind::(kb_id) .bind::(bot_id) diff --git a/src/core/kb/kb_indexer.rs b/src/core/kb/kb_indexer.rs index 4372ac3c..cfe0af65 100644 --- a/src/core/kb/kb_indexer.rs +++ b/src/core/kb/kb_indexer.rs @@ -941,7 +941,7 @@ impl KbFolderMonitor { Self { indexer, work_root } } - pub async fn process_gbkb_folder(&self, bot_id: Uuid, bot_name: &str, kb_folder: &Path) -> Result<()> { + pub async fn process_gbkb_folder(&self, bot_id: Uuid, bot_name: &str, kb_folder: &Path) -> Result { let kb_name = kb_folder .file_name() .and_then(|n| n.to_str()) @@ -965,6 +965,6 @@ impl KbFolderMonitor { result.documents_processed, result.chunks_indexed, result.collection_name ); - Ok(()) + Ok(result) } } diff --git a/src/core/kb/mod.rs b/src/core/kb/mod.rs index ddac2ad0..0e99772d 100644 --- a/src/core/kb/mod.rs +++ b/src/core/kb/mod.rs @@ -9,11 +9,12 @@ pub use document_processor::{DocumentFormat, DocumentProcessor, TextChunk}; pub use embedding_generator::{ EmailEmbeddingGenerator, EmbeddingConfig, EmbeddingGenerator, KbEmbeddingGenerator, }; -pub use kb_indexer::{CollectionInfo, KbFolderMonitor, KbIndexer, QdrantConfig, SearchResult}; +pub use kb_indexer::{CollectionInfo, IndexingResult, KbFolderMonitor, KbIndexer, QdrantConfig, SearchResult}; pub use web_crawler::{WebCrawler, WebPage, WebsiteCrawlConfig}; pub use website_crawler_service::{ensure_crawler_service_running, WebsiteCrawlerService}; use anyhow::Result; +use diesel::prelude::*; use log::{error, info, warn}; use std::path::Path; use std::sync::Arc; @@ -172,7 +173,50 @@ impl KnowledgeBaseManager { ); let monitor = 
self.monitor.read().await; - monitor.process_gbkb_folder(bot_id, bot_name, kb_folder).await + let result = monitor.process_gbkb_folder(bot_id, bot_name, kb_folder).await?; + + let kb_name = kb_folder + .file_name() + .and_then(|n| n.to_str()) + .unwrap_or("unknown"); + let collection_name = result.collection_name.clone(); + let folder_path = kb_folder.to_string_lossy().to_string(); + let doc_count = result.documents_processed; + + if let Some(pool) = self.indexer.get_db_pool() { + if let Ok(mut conn) = pool.get() { + diesel::sql_query( + "INSERT INTO kb_collections (id, bot_id, name, folder_path, qdrant_collection, document_count) + VALUES ($1, $2, $3, $4, $5, $6) + ON CONFLICT (bot_id, name) DO UPDATE SET + folder_path = EXCLUDED.folder_path, + qdrant_collection = EXCLUDED.qdrant_collection, + document_count = EXCLUDED.document_count, + updated_at = NOW()" + ) + .bind::(Uuid::new_v4()) + .bind::(bot_id) + .bind::(kb_name) + .bind::(&folder_path) + .bind::(&collection_name) + .bind::(doc_count as i32) + .execute(&mut conn) + .map_err(|e| { + error!("Failed to upsert kb_collections for {}/{}: {}", bot_name, kb_name, e); + e + })?; + info!( + "Upserted kb_collections: bot={}/{}, kb={}, collection={}, docs={}", + bot_name, bot_id, kb_name, collection_name, doc_count + ); + } else { + warn!("No DB connection available to upsert kb_collections for {}/{}", bot_name, kb_name); + } + } else { + warn!("No DB pool available to upsert kb_collections for {}/{}", bot_name, kb_name); + } + + Ok(()) } pub async fn clear_kb(&self, bot_id: Uuid, bot_name: &str, kb_name: &str) -> Result<()> { diff --git a/src/drive/drive_monitor/mod.rs b/src/drive/drive_monitor/mod.rs index 44634a23..7b3303cb 100644 --- a/src/drive/drive_monitor/mod.rs +++ b/src/drive/drive_monitor/mod.rs @@ -1373,7 +1373,12 @@ impl DriveMonitor { files_processed, pdf_files_found ); } - for (path, state) in current_files { + for (path, mut state) in current_files { + if let Some(previous) = file_states.get(&path) { 
+ if previous.indexed && state.etag == previous.etag { + state.indexed = true; + } + } file_states.insert(path, state); }