use std::collections::HashMap;
use std::path::{Path, PathBuf};

use anyhow::Result;
use log::{debug, info, warn};
use serde::{Deserialize, Serialize};
use uuid::Uuid;

use super::document_processor::{DocumentProcessor, TextChunk};
use super::embedding_generator::{Embedding, EmbeddingConfig, KbEmbeddingGenerator};
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
#[derive(Debug, Clone)]
|
|
|
|
|
pub struct QdrantConfig {
|
|
|
|
|
pub url: String,
|
|
|
|
|
pub api_key: Option<String>,
|
|
|
|
|
pub timeout_secs: u64,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl Default for QdrantConfig {
|
|
|
|
|
fn default() -> Self {
|
|
|
|
|
Self {
|
Add .env.example with comprehensive configuration template
The commit adds a complete example environment configuration file
documenting all available settings for BotServer, including logging,
database, server, drive, LLM, Redis, email, and feature flags.
Also removes hardcoded environment variable usage throughout the
codebase, replacing them with configuration via config.csv or
appropriate defaults. This includes:
- WhatsApp, Teams, Instagram adapter configurations
- Weather API key handling
- Email and directory service configurations
- Console feature conditionally compiles monitoring code
- Improved logging configuration with library suppression
2025-11-28 13:19:03 -03:00
|
|
|
url: "http://localhost:6333".to_string(),
|
|
|
|
|
api_key: None,
|
2025-11-26 22:54:22 -03:00
|
|
|
timeout_secs: 30,
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
#[derive(Debug, Serialize, Deserialize)]
|
|
|
|
|
pub struct QdrantPoint {
|
|
|
|
|
pub id: String,
|
|
|
|
|
pub vector: Vec<f32>,
|
|
|
|
|
pub payload: HashMap<String, serde_json::Value>,
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
#[derive(Debug, Serialize)]
|
|
|
|
|
pub struct CollectionConfig {
|
|
|
|
|
pub vectors: VectorConfig,
|
|
|
|
|
pub replication_factor: u32,
|
|
|
|
|
pub shard_number: u32,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[derive(Debug, Serialize)]
|
|
|
|
|
pub struct VectorConfig {
|
|
|
|
|
pub size: usize,
|
|
|
|
|
pub distance: String,
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
#[derive(Debug, Serialize)]
|
|
|
|
|
pub struct SearchRequest {
|
|
|
|
|
pub vector: Vec<f32>,
|
|
|
|
|
pub limit: usize,
|
|
|
|
|
pub with_payload: bool,
|
|
|
|
|
pub score_threshold: Option<f32>,
|
|
|
|
|
pub filter: Option<serde_json::Value>,
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
pub struct KbIndexer {
|
|
|
|
|
document_processor: DocumentProcessor,
|
|
|
|
|
embedding_generator: KbEmbeddingGenerator,
|
|
|
|
|
qdrant_config: QdrantConfig,
|
|
|
|
|
http_client: reqwest::Client,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl std::fmt::Debug for KbIndexer {
|
|
|
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
|
|
|
f.debug_struct("KbIndexer")
|
|
|
|
|
.field("document_processor", &self.document_processor)
|
|
|
|
|
.field("embedding_generator", &self.embedding_generator)
|
|
|
|
|
.field("qdrant_config", &self.qdrant_config)
|
|
|
|
|
.field("http_client", &"reqwest::Client")
|
|
|
|
|
.finish()
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl KbIndexer {
|
|
|
|
|
pub fn new(embedding_config: EmbeddingConfig, qdrant_config: QdrantConfig) -> Self {
|
|
|
|
|
let document_processor = DocumentProcessor::default();
|
|
|
|
|
let embedding_generator = KbEmbeddingGenerator::new(embedding_config);
|
|
|
|
|
|
|
|
|
|
let http_client = reqwest::Client::builder()
|
|
|
|
|
.timeout(std::time::Duration::from_secs(qdrant_config.timeout_secs))
|
|
|
|
|
.build()
|
|
|
|
|
.expect("Failed to create HTTP client");
|
|
|
|
|
|
|
|
|
|
Self {
|
|
|
|
|
document_processor,
|
|
|
|
|
embedding_generator,
|
|
|
|
|
qdrant_config,
|
|
|
|
|
http_client,
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
pub async fn index_kb_folder(
|
|
|
|
|
&self,
|
|
|
|
|
bot_name: &str,
|
|
|
|
|
kb_name: &str,
|
|
|
|
|
kb_path: &Path,
|
|
|
|
|
) -> Result<IndexingResult> {
|
|
|
|
|
info!("Indexing KB folder: {} for bot {}", kb_name, bot_name);
|
|
|
|
|
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
let collection_name = format!("{}_{}", bot_name, kb_name);
|
|
|
|
|
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
self.ensure_collection_exists(&collection_name).await?;
|
|
|
|
|
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
let documents = self.document_processor.process_kb_folder(kb_path).await?;
|
|
|
|
|
|
|
|
|
|
let mut total_chunks = 0;
|
|
|
|
|
let mut indexed_documents = 0;
|
|
|
|
|
|
|
|
|
|
for (doc_path, chunks) in documents {
|
|
|
|
|
if chunks.is_empty() {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
info!(
|
|
|
|
|
"Processing document: {} ({} chunks)",
|
|
|
|
|
doc_path,
|
|
|
|
|
chunks.len()
|
|
|
|
|
);
|
|
|
|
|
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
let embeddings = self
|
|
|
|
|
.embedding_generator
|
|
|
|
|
.generate_embeddings(&chunks)
|
|
|
|
|
.await?;
|
|
|
|
|
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
let points = self.create_qdrant_points(&doc_path, embeddings)?;
|
|
|
|
|
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
self.upsert_points(&collection_name, points).await?;
|
|
|
|
|
|
|
|
|
|
total_chunks += chunks.len();
|
|
|
|
|
indexed_documents += 1;
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
self.update_collection_metadata(&collection_name, bot_name, kb_name, total_chunks)
|
|
|
|
|
.await?;
|
|
|
|
|
|
|
|
|
|
Ok(IndexingResult {
|
|
|
|
|
collection_name,
|
|
|
|
|
documents_processed: indexed_documents,
|
|
|
|
|
chunks_indexed: total_chunks,
|
|
|
|
|
})
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
async fn ensure_collection_exists(&self, collection_name: &str) -> Result<()> {
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
let check_url = format!("{}/collections/{}", self.qdrant_config.url, collection_name);
|
|
|
|
|
|
|
|
|
|
let response = self.http_client.get(&check_url).send().await?;
|
|
|
|
|
|
|
|
|
|
if response.status().is_success() {
|
|
|
|
|
info!("Collection {} already exists", collection_name);
|
|
|
|
|
return Ok(());
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
info!("Creating collection: {}", collection_name);
|
|
|
|
|
|
|
|
|
|
let config = CollectionConfig {
|
|
|
|
|
vectors: VectorConfig {
|
2025-12-23 18:40:58 -03:00
|
|
|
size: 384,
|
2025-11-26 22:54:22 -03:00
|
|
|
distance: "Cosine".to_string(),
|
|
|
|
|
},
|
|
|
|
|
replication_factor: 1,
|
|
|
|
|
shard_number: 1,
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
let create_url = format!("{}/collections/{}", self.qdrant_config.url, collection_name);
|
|
|
|
|
|
|
|
|
|
let response = self
|
|
|
|
|
.http_client
|
|
|
|
|
.put(&create_url)
|
|
|
|
|
.json(&config)
|
|
|
|
|
.send()
|
|
|
|
|
.await?;
|
|
|
|
|
|
|
|
|
|
if !response.status().is_success() {
|
|
|
|
|
let error_text = response.text().await.unwrap_or_default();
|
|
|
|
|
return Err(anyhow::anyhow!(
|
|
|
|
|
"Failed to create collection: {}",
|
|
|
|
|
error_text
|
|
|
|
|
));
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
self.create_collection_indexes(collection_name).await?;
|
|
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
async fn create_collection_indexes(&self, collection_name: &str) -> Result<()> {
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
let index_config = serde_json::json!({
|
|
|
|
|
"hnsw_config": {
|
|
|
|
|
"m": 16,
|
|
|
|
|
"ef_construct": 200,
|
|
|
|
|
"full_scan_threshold": 10000
|
|
|
|
|
}
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
let index_url = format!(
|
|
|
|
|
"{}/collections/{}/index",
|
|
|
|
|
self.qdrant_config.url, collection_name
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
let response = self
|
|
|
|
|
.http_client
|
|
|
|
|
.put(&index_url)
|
|
|
|
|
.json(&index_config)
|
|
|
|
|
.send()
|
|
|
|
|
.await?;
|
|
|
|
|
|
|
|
|
|
if !response.status().is_success() {
|
|
|
|
|
warn!("Failed to create index, using defaults");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
fn create_qdrant_points(
|
|
|
|
|
&self,
|
|
|
|
|
doc_path: &str,
|
|
|
|
|
embeddings: Vec<(TextChunk, Embedding)>,
|
|
|
|
|
) -> Result<Vec<QdrantPoint>> {
|
|
|
|
|
let mut points = Vec::new();
|
|
|
|
|
|
|
|
|
|
for (chunk, embedding) in embeddings {
|
|
|
|
|
let point_id = Uuid::new_v4().to_string();
|
|
|
|
|
|
|
|
|
|
let mut payload = HashMap::new();
|
|
|
|
|
payload.insert(
|
|
|
|
|
"content".to_string(),
|
|
|
|
|
serde_json::Value::String(chunk.content),
|
|
|
|
|
);
|
|
|
|
|
payload.insert(
|
|
|
|
|
"document_path".to_string(),
|
|
|
|
|
serde_json::Value::String(doc_path.to_string()),
|
|
|
|
|
);
|
|
|
|
|
payload.insert(
|
|
|
|
|
"chunk_index".to_string(),
|
|
|
|
|
serde_json::Value::Number(chunk.metadata.chunk_index.into()),
|
|
|
|
|
);
|
|
|
|
|
payload.insert(
|
|
|
|
|
"total_chunks".to_string(),
|
|
|
|
|
serde_json::Value::Number(chunk.metadata.total_chunks.into()),
|
|
|
|
|
);
|
|
|
|
|
payload.insert(
|
|
|
|
|
"start_char".to_string(),
|
|
|
|
|
serde_json::Value::Number(chunk.metadata.start_char.into()),
|
|
|
|
|
);
|
|
|
|
|
payload.insert(
|
|
|
|
|
"end_char".to_string(),
|
|
|
|
|
serde_json::Value::Number(chunk.metadata.end_char.into()),
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
if let Some(title) = chunk.metadata.document_title {
|
|
|
|
|
payload.insert(
|
|
|
|
|
"document_title".to_string(),
|
|
|
|
|
serde_json::Value::String(title),
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
points.push(QdrantPoint {
|
|
|
|
|
id: point_id,
|
|
|
|
|
vector: embedding.vector,
|
|
|
|
|
payload,
|
|
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Ok(points)
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
async fn upsert_points(&self, collection_name: &str, points: Vec<QdrantPoint>) -> Result<()> {
|
|
|
|
|
if points.is_empty() {
|
|
|
|
|
return Ok(());
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-23 18:40:58 -03:00
|
|
|
let batch_size = 100;
|
2025-11-26 22:54:22 -03:00
|
|
|
|
|
|
|
|
for batch in points.chunks(batch_size) {
|
|
|
|
|
let upsert_request = serde_json::json!({
|
|
|
|
|
"points": batch
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
let upsert_url = format!(
|
|
|
|
|
"{}/collections/{}/points?wait=true",
|
|
|
|
|
self.qdrant_config.url, collection_name
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
let response = self
|
|
|
|
|
.http_client
|
|
|
|
|
.put(&upsert_url)
|
|
|
|
|
.json(&upsert_request)
|
|
|
|
|
.send()
|
|
|
|
|
.await?;
|
|
|
|
|
|
|
|
|
|
if !response.status().is_success() {
|
|
|
|
|
let error_text = response.text().await.unwrap_or_default();
|
|
|
|
|
return Err(anyhow::anyhow!("Failed to upsert points: {}", error_text));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
debug!(
|
|
|
|
|
"Upserted {} points to collection {}",
|
|
|
|
|
points.len(),
|
|
|
|
|
collection_name
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
async fn update_collection_metadata(
|
|
|
|
|
&self,
|
|
|
|
|
collection_name: &str,
|
|
|
|
|
bot_name: &str,
|
|
|
|
|
kb_name: &str,
|
|
|
|
|
document_count: usize,
|
|
|
|
|
) -> Result<()> {
|
2025-12-23 18:40:58 -03:00
|
|
|
|
|
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
info!(
|
|
|
|
|
"Updated collection {} metadata: bot={}, kb={}, docs={}",
|
|
|
|
|
collection_name, bot_name, kb_name, document_count
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
pub async fn search(
|
|
|
|
|
&self,
|
|
|
|
|
collection_name: &str,
|
|
|
|
|
query: &str,
|
|
|
|
|
limit: usize,
|
|
|
|
|
) -> Result<Vec<SearchResult>> {
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
let embedding = self
|
|
|
|
|
.embedding_generator
|
|
|
|
|
.generate_single_embedding(query)
|
|
|
|
|
.await?;
|
|
|
|
|
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
let search_request = SearchRequest {
|
|
|
|
|
vector: embedding.vector,
|
|
|
|
|
limit,
|
|
|
|
|
with_payload: true,
|
2025-12-23 18:40:58 -03:00
|
|
|
score_threshold: Some(0.5),
|
2025-11-26 22:54:22 -03:00
|
|
|
filter: None,
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
let search_url = format!(
|
|
|
|
|
"{}/collections/{}/points/search",
|
|
|
|
|
self.qdrant_config.url, collection_name
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
let response = self
|
|
|
|
|
.http_client
|
|
|
|
|
.post(&search_url)
|
|
|
|
|
.json(&search_request)
|
|
|
|
|
.send()
|
|
|
|
|
.await?;
|
|
|
|
|
|
|
|
|
|
if !response.status().is_success() {
|
|
|
|
|
let error_text = response.text().await.unwrap_or_default();
|
|
|
|
|
return Err(anyhow::anyhow!("Search failed: {}", error_text));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let response_json: serde_json::Value = response.json().await?;
|
|
|
|
|
|
|
|
|
|
let mut results = Vec::new();
|
|
|
|
|
|
|
|
|
|
if let Some(result_array) = response_json["result"].as_array() {
|
|
|
|
|
for item in result_array {
|
|
|
|
|
if let (Some(score), Some(payload)) =
|
|
|
|
|
(item["score"].as_f64(), item["payload"].as_object())
|
|
|
|
|
{
|
|
|
|
|
let content = payload
|
|
|
|
|
.get("content")
|
|
|
|
|
.and_then(|v| v.as_str())
|
|
|
|
|
.unwrap_or("")
|
|
|
|
|
.to_string();
|
|
|
|
|
|
|
|
|
|
let document_path = payload
|
|
|
|
|
.get("document_path")
|
|
|
|
|
.and_then(|v| v.as_str())
|
|
|
|
|
.unwrap_or("")
|
|
|
|
|
.to_string();
|
|
|
|
|
|
|
|
|
|
results.push(SearchResult {
|
|
|
|
|
content,
|
|
|
|
|
document_path,
|
|
|
|
|
score: score as f32,
|
|
|
|
|
metadata: payload.clone(),
|
|
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Ok(results)
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
pub async fn delete_collection(&self, collection_name: &str) -> Result<()> {
|
|
|
|
|
let delete_url = format!("{}/collections/{}", self.qdrant_config.url, collection_name);
|
|
|
|
|
|
|
|
|
|
let response = self.http_client.delete(&delete_url).send().await?;
|
|
|
|
|
|
|
|
|
|
if !response.status().is_success() {
|
|
|
|
|
let error_text = response.text().await.unwrap_or_default();
|
|
|
|
|
warn!(
|
|
|
|
|
"Failed to delete collection {}: {}",
|
|
|
|
|
collection_name, error_text
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
2025-12-03 22:23:30 -03:00
|
|
|
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-12-03 22:23:30 -03:00
|
|
|
pub async fn get_collection_info(&self, collection_name: &str) -> Result<CollectionInfo> {
|
|
|
|
|
let info_url = format!("{}/collections/{}", self.qdrant_config.url, collection_name);
|
|
|
|
|
|
|
|
|
|
let response = self.http_client.get(&info_url).send().await?;
|
|
|
|
|
|
|
|
|
|
if !response.status().is_success() {
|
|
|
|
|
let status = response.status();
|
|
|
|
|
if status.as_u16() == 404 {
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-12-03 22:23:30 -03:00
|
|
|
return Ok(CollectionInfo {
|
|
|
|
|
name: collection_name.to_string(),
|
|
|
|
|
points_count: 0,
|
|
|
|
|
vectors_count: 0,
|
|
|
|
|
indexed_vectors_count: 0,
|
|
|
|
|
segments_count: 0,
|
|
|
|
|
status: "not_found".to_string(),
|
|
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
let error_text = response.text().await.unwrap_or_default();
|
|
|
|
|
return Err(anyhow::anyhow!(
|
|
|
|
|
"Failed to get collection info: {}",
|
|
|
|
|
error_text
|
|
|
|
|
));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let response_json: serde_json::Value = response.json().await?;
|
|
|
|
|
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-12-03 22:23:30 -03:00
|
|
|
let result = &response_json["result"];
|
|
|
|
|
|
|
|
|
|
let points_count = result["points_count"].as_u64().unwrap_or(0) as usize;
|
|
|
|
|
let vectors_count = result["vectors_count"]
|
|
|
|
|
.as_u64()
|
|
|
|
|
.or_else(|| {
|
|
|
|
|
result["vectors_count"]
|
|
|
|
|
.as_object()
|
|
|
|
|
.map(|_| points_count as u64)
|
|
|
|
|
})
|
|
|
|
|
.unwrap_or(0) as usize;
|
|
|
|
|
let indexed_vectors_count = result["indexed_vectors_count"]
|
|
|
|
|
.as_u64()
|
|
|
|
|
.unwrap_or(vectors_count as u64) as usize;
|
|
|
|
|
let segments_count = result["segments_count"].as_u64().unwrap_or(0) as usize;
|
|
|
|
|
let status = result["status"].as_str().unwrap_or("unknown").to_string();
|
|
|
|
|
|
|
|
|
|
Ok(CollectionInfo {
|
|
|
|
|
name: collection_name.to_string(),
|
|
|
|
|
points_count,
|
|
|
|
|
vectors_count,
|
|
|
|
|
indexed_vectors_count,
|
|
|
|
|
segments_count,
|
|
|
|
|
status,
|
|
|
|
|
})
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-12-03 22:23:30 -03:00
|
|
|
#[derive(Debug, Clone)]
|
|
|
|
|
pub struct CollectionInfo {
|
|
|
|
|
pub name: String,
|
|
|
|
|
pub points_count: usize,
|
|
|
|
|
pub vectors_count: usize,
|
|
|
|
|
pub indexed_vectors_count: usize,
|
|
|
|
|
pub segments_count: usize,
|
|
|
|
|
pub status: String,
|
2025-11-26 22:54:22 -03:00
|
|
|
}
|
|
|
|
|
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
#[derive(Debug)]
|
|
|
|
|
pub struct IndexingResult {
|
|
|
|
|
pub collection_name: String,
|
|
|
|
|
pub documents_processed: usize,
|
|
|
|
|
pub chunks_indexed: usize,
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
#[derive(Debug, Clone)]
|
|
|
|
|
pub struct SearchResult {
|
|
|
|
|
pub content: String,
|
|
|
|
|
pub document_path: String,
|
|
|
|
|
pub score: f32,
|
|
|
|
|
pub metadata: serde_json::Map<String, serde_json::Value>,
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
#[derive(Debug)]
|
|
|
|
|
pub struct KbFolderMonitor {
|
|
|
|
|
indexer: KbIndexer,
|
|
|
|
|
work_root: PathBuf,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl KbFolderMonitor {
|
|
|
|
|
pub fn new(work_root: PathBuf, embedding_config: EmbeddingConfig) -> Self {
|
|
|
|
|
let qdrant_config = QdrantConfig::default();
|
|
|
|
|
let indexer = KbIndexer::new(embedding_config, qdrant_config);
|
|
|
|
|
|
|
|
|
|
Self { indexer, work_root }
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
pub async fn process_gbkb_folder(&self, bot_name: &str, kb_folder: &Path) -> Result<()> {
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
let kb_name = kb_folder
|
|
|
|
|
.file_name()
|
|
|
|
|
.and_then(|n| n.to_str())
|
|
|
|
|
.ok_or_else(|| anyhow::anyhow!("Invalid KB folder name"))?;
|
|
|
|
|
|
|
|
|
|
info!("Processing .gbkb folder: {} for bot {}", kb_name, bot_name);
|
|
|
|
|
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
let local_path = self
|
|
|
|
|
.work_root
|
|
|
|
|
.join(bot_name)
|
|
|
|
|
.join(format!("{}.gbkb", bot_name))
|
|
|
|
|
.join(kb_name);
|
|
|
|
|
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
let result = self
|
|
|
|
|
.indexer
|
|
|
|
|
.index_kb_folder(bot_name, kb_name, &local_path)
|
|
|
|
|
.await?;
|
|
|
|
|
|
|
|
|
|
info!(
|
|
|
|
|
"Indexed {} documents ({} chunks) into collection {}",
|
|
|
|
|
result.documents_processed, result.chunks_indexed, result.collection_name
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
}
|