//! Knowledge base module (`botserver/src/core/kb/mod.rs`).
//!
//! Last recorded change (Rodrigo Rodriguez, 2026-04-15):
//! fix — use broadcast channel for LLM streaming cancellation:
//! - broadcast channel allows multiple subscribers for cancellation
//! - aborts the LLM task when the user sends a new message
//! - properly stops LLM generation when cancelled
pub mod document_processor;
pub mod embedding_generator;
pub mod kb_indexer;
pub mod permissions;
pub mod web_crawler;
pub mod website_crawler_service;
pub use document_processor::{DocumentFormat, DocumentProcessor, TextChunk};
pub use embedding_generator::{
EmailEmbeddingGenerator, EmbeddingConfig, EmbeddingGenerator, KbEmbeddingGenerator,
};
pub use kb_indexer::{CollectionInfo, IndexingResult, KbFolderMonitor, KbIndexer, QdrantConfig, SearchResult};
pub use web_crawler::{WebCrawler, WebPage, WebsiteCrawlConfig};
pub use website_crawler_service::{ensure_crawler_service_running, WebsiteCrawlerService};
use anyhow::Result;
use diesel::prelude::*;
use log::{error, info, warn};
use std::path::Path;
use std::sync::Arc;
use tokio::sync::RwLock;
use uuid::Uuid;
use crate::core::shared::utils::DbPool;
#[derive(Debug)]
pub struct KnowledgeBaseManager {
indexer: Arc<KbIndexer>,
processor: Arc<DocumentProcessor>,
monitor: Arc<RwLock<KbFolderMonitor>>,
}
impl KnowledgeBaseManager {
    /// Creates a manager using configuration from environment variables.
    /// Equivalent to [`Self::with_default_config`].
    pub fn new(work_root: impl Into<std::path::PathBuf>) -> Self {
        Self::with_default_config(work_root)
    }

    /// Builds the Qdrant collection name for a bot's knowledge base:
    /// `{bot_name}_{first 8 chars of bot_id}_{kb_name}`.
    ///
    /// Centralizes the naming scheme previously duplicated across
    /// `search`, `clear_kb`, `delete_file_from_kb` and `get_kb_stats`.
    fn collection_name_for(bot_id: &Uuid, bot_name: &str, kb_name: &str) -> String {
        let bot_id_short: String = bot_id.to_string().chars().take(8).collect();
        format!("{}_{}_{}", bot_name, bot_id_short, kb_name)
    }

    /// Creates a manager with `EmbeddingConfig` read from the environment
    /// and the default Qdrant configuration. No database pool is attached,
    /// so pool-dependent paths (per-bot config lookup, kb_collections
    /// bookkeeping) are skipped at runtime.
    pub fn with_default_config(work_root: impl Into<std::path::PathBuf>) -> Self {
        let work_root = work_root.into();
        let embedding_config = EmbeddingConfig::from_env();
        let qdrant_config = QdrantConfig::default();
        Self {
            indexer: Arc::new(KbIndexer::new(embedding_config.clone(), qdrant_config)),
            processor: Arc::new(DocumentProcessor::default()),
            monitor: Arc::new(RwLock::new(KbFolderMonitor::new(
                work_root,
                embedding_config,
            ))),
        }
    }

    /// Creates a manager whose embedding and Qdrant configuration come from
    /// the bot's stored configuration. The pool is handed to the indexer so
    /// later calls (e.g. `search`, `handle_gbkb_change`) can consult the DB.
    pub fn with_bot_config(work_root: impl Into<std::path::PathBuf>, pool: DbPool, bot_id: Uuid) -> Self {
        let work_root = work_root.into();
        let embedding_config = EmbeddingConfig::from_bot_config(&pool, &bot_id);
        info!("KB Manager using embedding config from bot {}: url={}, model={}",
            bot_id, embedding_config.embedding_url, embedding_config.embedding_model);
        let qdrant_config = QdrantConfig::from_config(pool.clone(), &bot_id);
        Self {
            indexer: Arc::new(KbIndexer::new_with_pool(embedding_config.clone(), qdrant_config, pool)),
            processor: Arc::new(DocumentProcessor::default()),
            monitor: Arc::new(RwLock::new(KbFolderMonitor::new(
                work_root,
                embedding_config,
            ))),
        }
    }

    /// Indexes every document under `kb_path` into the bot's KB collection.
    ///
    /// # Errors
    /// Propagates any failure from the underlying indexer.
    pub async fn index_kb_folder(
        &self,
        bot_id: Uuid,
        bot_name: &str,
        kb_name: &str,
        kb_path: &Path,
    ) -> Result<()> {
        info!(
            "Indexing knowledge base: {} for bot {} from path: {}",
            kb_name,
            bot_name,
            kb_path.display()
        );
        let result = self
            .indexer
            .index_kb_folder(bot_id, bot_name, kb_name, kb_path)
            .await?;
        info!(
            "Successfully indexed {} documents with {} chunks into collection {}",
            result.documents_processed, result.chunks_indexed, result.collection_name
        );
        Ok(())
    }

    /// Indexes a single file into the named KB and returns the indexing
    /// result (chunk counts, collection name, ...).
    ///
    /// # Errors
    /// Propagates any failure from the underlying indexer.
    pub async fn index_single_file(
        &self,
        bot_id: Uuid,
        bot_name: &str,
        kb_name: &str,
        file_path: &Path,
    ) -> Result<kb_indexer::IndexingResult> {
        info!(
            "Indexing single file: {} into KB {} for bot {}",
            file_path.display(),
            kb_name,
            bot_name
        );
        let result = self
            .indexer
            .index_single_file(bot_id, bot_name, kb_name, file_path)
            .await?;
        info!(
            "Successfully indexed {} chunks from {} into collection {}",
            result.chunks_indexed,
            file_path.display(),
            result.collection_name
        );
        Ok(result)
    }

    /// Searches the bot's KB collection for `query`, returning up to `limit`
    /// results. When the indexer carries a DB pool, the bot's stored
    /// embedding configuration is used; otherwise the default config applies.
    pub async fn search(
        &self,
        bot_id: Uuid,
        bot_name: &str,
        kb_name: &str,
        query: &str,
        limit: usize,
    ) -> Result<Vec<SearchResult>> {
        let collection_name = Self::collection_name_for(&bot_id, bot_name, kb_name);
        // Use from_bot_config with state connection if available
        if let Some(pool) = self.indexer.get_db_pool() {
            let embedding_config = EmbeddingConfig::from_bot_config(pool, &bot_id);
            self.indexer.search_with_config(&collection_name, query, limit, &embedding_config).await
        } else {
            // Fallback to default config
            self.indexer.search(&collection_name, query, limit).await
        }
    }

    /// Searches an explicitly named collection with the default embedding
    /// configuration.
    pub async fn search_collection(
        &self,
        collection_name: &str,
        query: &str,
        limit: usize,
    ) -> Result<Vec<SearchResult>> {
        self.indexer.search(collection_name, query, limit).await
    }

    /// Splits a document into text chunks without indexing it.
    pub async fn process_document(&self, file_path: &Path) -> Result<Vec<TextChunk>> {
        self.processor.process_document(file_path).await
    }

    /// Reacts to a change in a bot's `.gbkb` folder: re-indexes its content
    /// (unless the DB says it is already indexed) and upserts bookkeeping
    /// into the `kb_collections` table when a pool is available.
    ///
    /// # Errors
    /// Fails if folder processing or the `kb_collections` upsert fails.
    pub async fn handle_gbkb_change(&self, bot_id: Uuid, bot_name: &str, kb_folder: &Path) -> Result<()> {
        info!(
            "Handling .gbkb folder change for bot {} at {}",
            bot_name,
            kb_folder.display()
        );
        // The folder's own name is the KB name; computed once and reused
        // for both the skip-check and the upsert below.
        let kb_name = kb_folder
            .file_name()
            .and_then(|n| n.to_str())
            .unwrap_or("unknown");
        #[derive(diesel::QueryableByName)]
        struct KbDocCount {
            #[diesel(sql_type = diesel::sql_types::Nullable<diesel::sql_types::Integer>)]
            document_count: Option<i32>,
        }
        // Skip re-indexing when the DB already records documents for this KB.
        if let Some(pool) = self.indexer.get_db_pool() {
            if let Ok(mut conn) = pool.get() {
                let existing: Option<KbDocCount> = diesel::sql_query(
                    "SELECT document_count FROM kb_collections WHERE bot_id = $1 AND name = $2"
                )
                .bind::<diesel::sql_types::Uuid, _>(bot_id)
                .bind::<diesel::sql_types::Text, _>(kb_name)
                .get_result(&mut conn)
                .ok();
                if let Some(row) = existing {
                    if let Some(count) = row.document_count {
                        if count > 0 {
                            info!(
                                "KB {} for bot {}/{} already indexed with {} docs, skipping re-index",
                                kb_name, bot_name, bot_id, count
                            );
                            return Ok(());
                        }
                    }
                }
            }
        }
        let monitor = self.monitor.read().await;
        let result = monitor.process_gbkb_folder(bot_id, bot_name, kb_folder).await?;
        let collection_name = result.collection_name.clone();
        let folder_path = kb_folder.to_string_lossy().to_string();
        let doc_count = result.documents_processed;
        // Record (or refresh) the collection metadata so later change events
        // can short-circuit via the skip-check above.
        if let Some(pool) = self.indexer.get_db_pool() {
            if let Ok(mut conn) = pool.get() {
                diesel::sql_query(
                    "INSERT INTO kb_collections (id, bot_id, name, folder_path, qdrant_collection, document_count)
                     VALUES ($1, $2, $3, $4, $5, $6)
                     ON CONFLICT (bot_id, name) DO UPDATE SET
                     folder_path = EXCLUDED.folder_path,
                     qdrant_collection = EXCLUDED.qdrant_collection,
                     document_count = EXCLUDED.document_count,
                     updated_at = NOW()"
                )
                .bind::<diesel::sql_types::Uuid, _>(Uuid::new_v4())
                .bind::<diesel::sql_types::Uuid, _>(bot_id)
                .bind::<diesel::sql_types::Text, _>(kb_name)
                .bind::<diesel::sql_types::Text, _>(&folder_path)
                .bind::<diesel::sql_types::Text, _>(&collection_name)
                .bind::<diesel::sql_types::Integer, _>(doc_count as i32)
                .execute(&mut conn)
                .map_err(|e| {
                    error!("Failed to upsert kb_collections for {}/{}: {}", bot_name, kb_name, e);
                    e
                })?;
                info!(
                    "Upserted kb_collections: bot={}/{}, kb={}, collection={}, docs={}",
                    bot_name, bot_id, kb_name, collection_name, doc_count
                );
            } else {
                warn!("No DB connection available to upsert kb_collections for {}/{}", bot_name, kb_name);
            }
        } else {
            warn!("No DB pool available to upsert kb_collections for {}/{}", bot_name, kb_name);
        }
        Ok(())
    }

    /// Deletes the bot's KB collection from Qdrant.
    ///
    /// # Errors
    /// Propagates the indexer's deletion failure after logging it.
    pub async fn clear_kb(&self, bot_id: Uuid, bot_name: &str, kb_name: &str) -> Result<()> {
        let collection_name = Self::collection_name_for(&bot_id, bot_name, kb_name);
        warn!("Clearing knowledge base collection: {}", collection_name);
        match self.indexer.delete_collection(&collection_name).await {
            Ok(_) => {
                info!("Successfully cleared collection: {}", collection_name);
                Ok(())
            }
            Err(e) => {
                error!("Failed to clear collection {}: {}", collection_name, e);
                Err(e)
            }
        }
    }

    /// Removes all vector points belonging to one file from the bot's KB
    /// collection. `file_path` may be prefixed with `{kb_name}/`; the prefix
    /// is stripped so points are matched by their gbkb-relative path.
    ///
    /// # Errors
    /// Propagates the indexer's point-deletion failure after logging it.
    pub async fn delete_file_from_kb(&self, bot_id: Uuid, bot_name: &str, kb_name: &str, file_path: &str) -> Result<()> {
        let collection_name = Self::collection_name_for(&bot_id, bot_name, kb_name);
        // Use the relative path within the gbkb folder (e.g., "cartas/file.pdf")
        let relative_path = file_path
            .strip_prefix(&format!("{}/", kb_name))
            .unwrap_or(file_path);
        info!("Deleting vectors for file {} from collection {}", relative_path, collection_name);
        match self.indexer.delete_file_points(&collection_name, relative_path).await {
            Ok(_) => {
                info!("Successfully deleted vectors for file {} from {}", relative_path, collection_name);
                Ok(())
            }
            Err(e) => {
                error!("Failed to delete vectors for file {} from {}: {}", relative_path, collection_name, e);
                Err(e)
            }
        }
    }

    /// Returns coarse statistics for the bot's KB collection. Document count
    /// and byte size are rough estimates derived from the point count
    /// (~10 chunks per document, ~1 KiB per chunk), not exact figures.
    pub async fn get_kb_stats(&self, bot_id: Uuid, bot_name: &str, kb_name: &str) -> Result<KbStatistics> {
        let collection_name = Self::collection_name_for(&bot_id, bot_name, kb_name);
        let collection_info = self.indexer.get_collection_info(&collection_name).await?;
        let estimated_doc_count = if collection_info.points_count > 0 {
            // At least 1 document whenever any points exist.
            std::cmp::max(1, collection_info.points_count / 10)
        } else {
            0
        };
        let estimated_size = collection_info.points_count * 1024;
        Ok(KbStatistics {
            collection_name,
            document_count: estimated_doc_count,
            chunk_count: collection_info.points_count,
            total_size_bytes: estimated_size,
            status: collection_info.status,
        })
    }
}
#[derive(Debug, Clone)]
pub struct KbStatistics {
pub collection_name: String,
pub document_count: usize,
pub chunk_count: usize,
pub total_size_bytes: usize,
pub status: String,
}
#[derive(Debug)]
pub struct DriveMonitorIntegration {
kb_manager: Arc<KnowledgeBaseManager>,
}
impl DriveMonitorIntegration {
    /// Wraps the given manager so drive-monitor events can be forwarded to it.
    pub fn new(kb_manager: Arc<KnowledgeBaseManager>) -> Self {
        Self { kb_manager }
    }

    /// Forwards a `.gbkb` folder change to the KB manager.
    ///
    /// Created or modified folders are (re-)indexed; a deleted folder has its
    /// backing collection cleared (the folder name is the KB name).
    ///
    /// # Errors
    /// Fails when the folder path has no UTF-8 file name on deletion, or when
    /// the underlying manager call fails.
    pub async fn on_gbkb_folder_changed(
        &self,
        bot_id: Uuid,
        bot_name: &str,
        folder_path: &Path,
        change_type: ChangeType,
    ) -> Result<()> {
        // Guard clause: deletions clear the collection and return early.
        if matches!(change_type, ChangeType::Deleted) {
            return match folder_path.file_name().and_then(|n| n.to_str()) {
                Some(kb_name) => self.kb_manager.clear_kb(bot_id, bot_name, kb_name).await,
                None => Err(anyhow::anyhow!("Invalid KB folder path")),
            };
        }
        // Created | Modified: re-index the folder contents.
        info!(
            "Drive monitor detected {:?} in .gbkb folder: {}",
            change_type,
            folder_path.display()
        );
        self.kb_manager
            .handle_gbkb_change(bot_id, bot_name, folder_path)
            .await
    }
}
#[derive(Debug, Clone, Copy)]
pub enum ChangeType {
Created,
Modified,
Deleted,
}