botserver/src/core/kb/mod.rs
Rodrigo Rodriguez (Pragmatismo) db06927e24 Looking at the diff, I can see this commit removes the api_router.rs
file and distributes its functionality to individual modules. The
calendar and task modules now have their own route configuration and API
handlers.

Remove centralized API router in favor of module-based routing

Decentralizes API route configuration by moving route definitions and
handlers to their respective modules. Each module now exports its own
`configure_*_routes()` function that is merged in main.rs.

- Delete api_router.rs with its mon
2025-11-27 09:38:50 -03:00

216 lines
6.4 KiB
Rust

pub mod document_processor;
pub mod embedding_generator;
pub mod kb_indexer;
pub mod web_crawler;
pub mod website_crawler_service;
pub use document_processor::{DocumentFormat, DocumentProcessor, TextChunk};
pub use embedding_generator::{
EmailEmbeddingGenerator, EmbeddingConfig, EmbeddingGenerator, KbEmbeddingGenerator,
};
pub use kb_indexer::{KbFolderMonitor, KbIndexer, QdrantConfig, SearchResult};
pub use web_crawler::{WebCrawler, WebPage, WebsiteCrawlConfig};
pub use website_crawler_service::{ensure_crawler_service_running, WebsiteCrawlerService};
use anyhow::Result;
use log::{error, info, warn};
use std::path::Path;
use std::sync::Arc;
use tokio::sync::RwLock;
/// Main Knowledge Base manager
///
/// Bundles the three collaborators used by this module: an indexer, a
/// document processor and a folder monitor. Each is held behind an `Arc`.
#[derive(Debug)]
pub struct KnowledgeBaseManager {
/// Indexer used for folder indexing, searching and collection deletion.
indexer: Arc<KbIndexer>,
/// Processor that `process_document` delegates to.
processor: Arc<DocumentProcessor>,
/// Folder monitor guarded by an async `RwLock`; `handle_gbkb_change`
/// acquires it for reading.
monitor: Arc<RwLock<KbFolderMonitor>>,
}
impl KnowledgeBaseManager {
    /// Builds a manager rooted at `work_root`.
    ///
    /// Embedding settings are read via `EmbeddingConfig::from_env` and the
    /// Qdrant settings via `QdrantConfig::default`. The same embedding
    /// configuration is handed to both the indexer and the folder monitor.
    pub fn new(work_root: impl Into<std::path::PathBuf>) -> Self {
        let root = work_root.into();
        let embeddings = EmbeddingConfig::from_env();
        let qdrant = QdrantConfig::default();

        let kb_indexer = KbIndexer::new(embeddings.clone(), qdrant);
        let doc_processor = DocumentProcessor::default();
        let folder_monitor = KbFolderMonitor::new(root, embeddings);

        Self {
            indexer: Arc::new(kb_indexer),
            processor: Arc::new(doc_processor),
            monitor: Arc::new(RwLock::new(folder_monitor)),
        }
    }

    /// Derives the collection name for a bot / knowledge-base pair,
    /// formatted as `{bot_name}_{kb_name}`.
    fn collection_name_for(bot_name: &str, kb_name: &str) -> String {
        format!("{}_{}", bot_name, kb_name)
    }

    /// Indexes every document under `kb_path` for the given bot and KB.
    ///
    /// The work is delegated to the indexer; on success the number of
    /// processed documents, indexed chunks and the target collection are
    /// logged at `info` level.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the indexer.
    pub async fn index_kb_folder(
        &self,
        bot_name: &str,
        kb_name: &str,
        kb_path: &Path,
    ) -> Result<()> {
        info!(
            "Indexing knowledge base: {} for bot {} from path: {:?}",
            kb_name, bot_name, kb_path
        );

        let indexing = self.indexer.index_kb_folder(bot_name, kb_name, kb_path);
        let summary = indexing.await?;

        info!(
            "Successfully indexed {} documents with {} chunks into collection {}",
            summary.documents_processed, summary.chunks_indexed, summary.collection_name
        );
        Ok(())
    }

    /// Runs `query` against the collection belonging to `bot_name` /
    /// `kb_name`, passing `limit` through to the indexer.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the indexer's search.
    pub async fn search(
        &self,
        bot_name: &str,
        kb_name: &str,
        query: &str,
        limit: usize,
    ) -> Result<Vec<SearchResult>> {
        let target = Self::collection_name_for(bot_name, kb_name);
        self.indexer.search(&target, query, limit).await
    }

    /// Splits the document at `file_path` into text chunks using the
    /// configured document processor.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the processor.
    pub async fn process_document(&self, file_path: &Path) -> Result<Vec<TextChunk>> {
        let chunks = self.processor.process_document(file_path).await;
        chunks
    }

    /// Reacts to a `.gbkb` folder change reported by the drive monitor.
    ///
    /// A read lock on the folder monitor is held while the folder is being
    /// processed.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the folder monitor.
    pub async fn handle_gbkb_change(&self, bot_name: &str, kb_folder: &Path) -> Result<()> {
        info!(
            "Handling .gbkb folder change for bot {} at {:?}",
            bot_name, kb_folder
        );

        let folder_monitor = self.monitor.read().await;
        folder_monitor.process_gbkb_folder(bot_name, kb_folder).await
    }

    /// Deletes the collection backing the given bot / knowledge-base pair.
    ///
    /// The attempt is announced at `warn` level; the outcome is logged at
    /// `info` (success) or `error` (failure).
    ///
    /// # Errors
    ///
    /// Returns the indexer's error unchanged when the deletion fails.
    pub async fn clear_kb(&self, bot_name: &str, kb_name: &str) -> Result<()> {
        let collection_name = Self::collection_name_for(bot_name, kb_name);
        warn!("Clearing knowledge base collection: {}", collection_name);

        let outcome = self.indexer.delete_collection(&collection_name).await;
        if let Err(e) = outcome {
            error!("Failed to clear collection {}: {}", collection_name, e);
            return Err(e);
        }

        info!("Successfully cleared collection: {}", collection_name);
        Ok(())
    }

    /// Reports statistics for the given bot / knowledge-base pair.
    ///
    /// Only `collection_name` is populated for now; every counter is a
    /// placeholder zero until the Qdrant statistics query is implemented.
    pub async fn get_kb_stats(&self, bot_name: &str, kb_name: &str) -> Result<KbStatistics> {
        // Placeholder values: nothing is queried from Qdrant yet.
        let stats = KbStatistics {
            collection_name: Self::collection_name_for(bot_name, kb_name),
            document_count: 0,
            chunk_count: 0,
            total_size_bytes: 0,
        };
        Ok(stats)
    }
}
/// Statistics for a knowledge base
///
/// NOTE: `KnowledgeBaseManager::get_kb_stats` currently fills every counter
/// with a placeholder `0`; only `collection_name` carries real data.
#[derive(Debug, Clone)]
pub struct KbStatistics {
/// Collection name, built as `{bot_name}_{kb_name}`.
pub collection_name: String,
/// Number of documents in the collection (placeholder for now).
pub document_count: usize,
/// Number of chunks in the collection (placeholder for now).
pub chunk_count: usize,
/// Total size in bytes (placeholder for now).
pub total_size_bytes: usize,
}
/// Integration with drive monitor
///
/// Thin adapter that turns `.gbkb` folder change notifications into calls on
/// a shared `KnowledgeBaseManager`.
#[derive(Debug)]
pub struct DriveMonitorIntegration {
/// Manager that receives the re-index and clear requests.
kb_manager: Arc<KnowledgeBaseManager>,
}
impl DriveMonitorIntegration {
pub fn new(kb_manager: Arc<KnowledgeBaseManager>) -> Self {
Self { kb_manager }
}
/// Called when drive monitor detects changes in .gbkb folder
pub async fn on_gbkb_folder_changed(
&self,
bot_name: &str,
folder_path: &Path,
change_type: ChangeType,
) -> Result<()> {
match change_type {
ChangeType::Created | ChangeType::Modified => {
info!(
"Drive monitor detected {:?} in .gbkb folder: {:?}",
change_type, folder_path
);
self.kb_manager
.handle_gbkb_change(bot_name, folder_path)
.await
}
ChangeType::Deleted => {
// Extract KB name from path
if let Some(kb_name) = folder_path.file_name().and_then(|n| n.to_str()) {
self.kb_manager.clear_kb(bot_name, kb_name).await
} else {
Err(anyhow::anyhow!("Invalid KB folder path"))
}
}
}
}
}
/// Types of changes detected by drive monitor
///
/// `DriveMonitorIntegration::on_gbkb_folder_changed` treats `Created` and
/// `Modified` identically and handles `Deleted` separately.
#[derive(Debug, Clone, Copy)]
pub enum ChangeType {
/// A `.gbkb` folder appeared; triggers folder processing.
Created,
/// A `.gbkb` folder's contents changed; triggers folder processing.
Modified,
/// A `.gbkb` folder was removed; triggers clearing of its collection.
Deleted,
}
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// A freshly built manager carries a document processor with the
    /// expected chunking parameters (1000-unit chunks, 200-unit overlap).
    #[tokio::test]
    async fn test_kb_manager_creation() {
        let temp_dir = TempDir::new().unwrap();
        let manager = KnowledgeBaseManager::new(temp_dir.path());
        // `assert_eq!` reports both values on failure, which a bare
        // `assert!(a == b)` does not.
        assert_eq!(manager.processor.chunk_size(), 1000);
        assert_eq!(manager.processor.chunk_overlap(), 200);
    }

    /// Collection names follow the `{bot_name}_{kb_name}` convention.
    #[test]
    fn test_collection_naming() {
        let bot_name = "testbot";
        let kb_name = "docs";
        let collection_name = format!("{}_{}", bot_name, kb_name);
        assert_eq!(collection_name, "testbot_docs");
    }
}