- Fix match arms with identical bodies by consolidating patterns
- Fix case-insensitive file extension comparisons using eq_ignore_ascii_case
- Fix unnecessary Debug formatting in log/format macros
- Use clone_from instead of clone assignment
- Use let...else patterns where appropriate
- Append to String with the write! macro instead of format!
- Replace unwrap_or with unwrap_or_else where the argument is a function call
- Add missing fields to manual Debug implementations
- Fix duplicate code in if blocks
- Add type aliases for complex types
- Rename struct fields to avoid common prefixes
- Various other clippy warning fixes

Note: Some 'unused async' warnings remain for functions that are called with .await but don't contain await internally - these are kept async for API compatibility.
192 lines
5.6 KiB
Rust
192 lines
5.6 KiB
Rust
pub mod document_processor;
|
|
pub mod embedding_generator;
|
|
pub mod kb_indexer;
|
|
pub mod web_crawler;
|
|
pub mod website_crawler_service;
|
|
|
|
pub use document_processor::{DocumentFormat, DocumentProcessor, TextChunk};
|
|
pub use embedding_generator::{
|
|
EmailEmbeddingGenerator, EmbeddingConfig, EmbeddingGenerator, KbEmbeddingGenerator,
|
|
};
|
|
pub use kb_indexer::{CollectionInfo, KbFolderMonitor, KbIndexer, QdrantConfig, SearchResult};
|
|
pub use web_crawler::{WebCrawler, WebPage, WebsiteCrawlConfig};
|
|
pub use website_crawler_service::{ensure_crawler_service_running, WebsiteCrawlerService};
|
|
|
|
use anyhow::Result;
|
|
use log::{error, info, warn};
|
|
use std::path::Path;
|
|
use std::sync::Arc;
|
|
use tokio::sync::RwLock;
|
|
|
|
#[derive(Debug)]
|
|
pub struct KnowledgeBaseManager {
|
|
indexer: Arc<KbIndexer>,
|
|
processor: Arc<DocumentProcessor>,
|
|
monitor: Arc<RwLock<KbFolderMonitor>>,
|
|
}
|
|
|
|
impl KnowledgeBaseManager {
|
|
pub fn new(work_root: impl Into<std::path::PathBuf>) -> Self {
|
|
let work_root = work_root.into();
|
|
let embedding_config = EmbeddingConfig::from_env();
|
|
let qdrant_config = QdrantConfig::default();
|
|
|
|
let indexer = Arc::new(KbIndexer::new(embedding_config.clone(), qdrant_config));
|
|
let processor = Arc::new(DocumentProcessor::default());
|
|
let monitor = Arc::new(RwLock::new(KbFolderMonitor::new(
|
|
work_root,
|
|
embedding_config,
|
|
)));
|
|
|
|
Self {
|
|
indexer,
|
|
processor,
|
|
monitor,
|
|
}
|
|
}
|
|
|
|
pub async fn index_kb_folder(
|
|
&self,
|
|
bot_name: &str,
|
|
kb_name: &str,
|
|
kb_path: &Path,
|
|
) -> Result<()> {
|
|
info!(
|
|
"Indexing knowledge base: {} for bot {} from path: {}",
|
|
kb_name,
|
|
bot_name,
|
|
kb_path.display()
|
|
);
|
|
|
|
let result = self
|
|
.indexer
|
|
.index_kb_folder(bot_name, kb_name, kb_path)
|
|
.await?;
|
|
|
|
info!(
|
|
"Successfully indexed {} documents with {} chunks into collection {}",
|
|
result.documents_processed, result.chunks_indexed, result.collection_name
|
|
);
|
|
|
|
Ok(())
|
|
}
|
|
|
|
pub async fn search(
|
|
&self,
|
|
bot_name: &str,
|
|
kb_name: &str,
|
|
query: &str,
|
|
limit: usize,
|
|
) -> Result<Vec<SearchResult>> {
|
|
let collection_name = format!("{}_{}", bot_name, kb_name);
|
|
self.indexer.search(&collection_name, query, limit).await
|
|
}
|
|
|
|
pub async fn process_document(&self, file_path: &Path) -> Result<Vec<TextChunk>> {
|
|
self.processor.process_document(file_path).await
|
|
}
|
|
|
|
pub async fn handle_gbkb_change(&self, bot_name: &str, kb_folder: &Path) -> Result<()> {
|
|
info!(
|
|
"Handling .gbkb folder change for bot {} at {}",
|
|
bot_name,
|
|
kb_folder.display()
|
|
);
|
|
|
|
let monitor = self.monitor.read().await;
|
|
monitor.process_gbkb_folder(bot_name, kb_folder).await
|
|
}
|
|
|
|
pub async fn clear_kb(&self, bot_name: &str, kb_name: &str) -> Result<()> {
|
|
let collection_name = format!("{}_{}", bot_name, kb_name);
|
|
|
|
warn!("Clearing knowledge base collection: {}", collection_name);
|
|
|
|
match self.indexer.delete_collection(&collection_name).await {
|
|
Ok(_) => {
|
|
info!("Successfully cleared collection: {}", collection_name);
|
|
Ok(())
|
|
}
|
|
Err(e) => {
|
|
error!("Failed to clear collection {}: {}", collection_name, e);
|
|
Err(e)
|
|
}
|
|
}
|
|
}
|
|
|
|
pub async fn get_kb_stats(&self, bot_name: &str, kb_name: &str) -> Result<KbStatistics> {
|
|
let collection_name = format!("{}_{}", bot_name, kb_name);
|
|
|
|
let collection_info = self.indexer.get_collection_info(&collection_name).await?;
|
|
|
|
let estimated_doc_count = if collection_info.points_count > 0 {
|
|
std::cmp::max(1, collection_info.points_count / 10)
|
|
} else {
|
|
0
|
|
};
|
|
|
|
let estimated_size = collection_info.points_count * 1024;
|
|
|
|
Ok(KbStatistics {
|
|
collection_name,
|
|
document_count: estimated_doc_count,
|
|
chunk_count: collection_info.points_count,
|
|
total_size_bytes: estimated_size,
|
|
status: collection_info.status,
|
|
})
|
|
}
|
|
}
|
|
|
|
/// Summary of a knowledge-base collection as produced by
/// `KnowledgeBaseManager::get_kb_stats`.
///
/// `document_count` and `total_size_bytes` are estimates derived from the
/// collection's point count, not measured values.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct KbStatistics {
    /// Name of the collection the statistics describe.
    pub collection_name: String,
    /// Estimated number of documents (derived from the point count).
    pub document_count: usize,
    /// Number of points reported for the collection.
    pub chunk_count: usize,
    /// Estimated total size in bytes (derived from the point count).
    pub total_size_bytes: usize,
    /// Collection status string as reported by the collection info.
    pub status: String,
}
|
|
|
|
#[derive(Debug)]
|
|
pub struct DriveMonitorIntegration {
|
|
kb_manager: Arc<KnowledgeBaseManager>,
|
|
}
|
|
|
|
impl DriveMonitorIntegration {
|
|
pub fn new(kb_manager: Arc<KnowledgeBaseManager>) -> Self {
|
|
Self { kb_manager }
|
|
}
|
|
|
|
pub async fn on_gbkb_folder_changed(
|
|
&self,
|
|
bot_name: &str,
|
|
folder_path: &Path,
|
|
change_type: ChangeType,
|
|
) -> Result<()> {
|
|
match change_type {
|
|
ChangeType::Created | ChangeType::Modified => {
|
|
info!(
|
|
"Drive monitor detected {:?} in .gbkb folder: {}",
|
|
change_type,
|
|
folder_path.display()
|
|
);
|
|
self.kb_manager
|
|
.handle_gbkb_change(bot_name, folder_path)
|
|
.await
|
|
}
|
|
ChangeType::Deleted => {
|
|
if let Some(kb_name) = folder_path.file_name().and_then(|n| n.to_str()) {
|
|
self.kb_manager.clear_kb(bot_name, kb_name).await
|
|
} else {
|
|
Err(anyhow::anyhow!("Invalid KB folder path"))
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Kind of change reported for a `.gbkb` folder.
///
/// Derives equality and hashing so values can be compared directly and used as
/// map or set keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ChangeType {
    /// The folder was created; handled the same way as `Modified`.
    Created,
    /// The folder's contents changed.
    Modified,
    /// The folder was removed; its knowledge base is cleared.
    Deleted,
}
|