botserver/src/drive/vectordb.rs
Rodrigo Rodriguez (Pragmatismo) 4424fcf548 Fix compiler warnings and improve code consistency
- Remove unused imports and comment them for potential future use
- Add missing .send() to HTTP request chain
- Fix integer type suffixes for JSON values
- Simplify async execution by using tokio::block_in_place
- Remove unused function parameters to eliminate warnings
- Extract temporary variables to avoid borrowing issues
- Add placeholder methods to SessionManager for analytics
- Implement real database operations for admin endpoints
- Remove duplicate or conflicting type definitions

These changes address all compiler warnings while maintaining the
existing functionality and preparing the codebase for future
enhancements in areas like analytics and session management.
2025-11-27 08:34:24 -03:00

587 lines
19 KiB
Rust

use anyhow::Result;
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use std::path::PathBuf;
// use std::sync::Arc; // Unused import
use tokio::fs;
use uuid::Uuid;
#[cfg(feature = "vectordb")]
use qdrant_client::{
prelude::*,
qdrant::{vectors_config::Config, CreateCollection, Distance, VectorParams, VectorsConfig},
};
/// File metadata for vector DB indexing
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileDocument {
pub id: String,
pub file_path: String,
pub file_name: String,
pub file_type: String,
pub file_size: u64,
pub bucket: String,
pub content_text: String,
pub content_summary: Option<String>,
pub created_at: DateTime<Utc>,
pub modified_at: DateTime<Utc>,
pub indexed_at: DateTime<Utc>,
pub mime_type: Option<String>,
pub tags: Vec<String>,
}
/// File search query
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileSearchQuery {
pub query_text: String,
pub bucket: Option<String>,
pub file_type: Option<String>,
pub date_from: Option<DateTime<Utc>>,
pub date_to: Option<DateTime<Utc>>,
pub tags: Vec<String>,
pub limit: usize,
}
/// File search result
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileSearchResult {
pub file: FileDocument,
pub score: f32,
pub snippet: String,
pub highlights: Vec<String>,
}
/// Per-user drive vector DB manager
pub struct UserDriveVectorDB {
user_id: Uuid,
bot_id: Uuid,
collection_name: String,
db_path: PathBuf,
#[cfg(feature = "vectordb")]
client: Option<Arc<QdrantClient>>,
}
impl UserDriveVectorDB {
/// Create new user drive vector DB instance
pub fn new(user_id: Uuid, bot_id: Uuid, db_path: PathBuf) -> Self {
let collection_name = format!("drive_{}_{}", bot_id, user_id);
Self {
user_id,
bot_id,
collection_name,
db_path,
#[cfg(feature = "vectordb")]
client: None,
}
}
/// Initialize vector DB collection
#[cfg(feature = "vectordb")]
pub async fn initialize(&mut self, qdrant_url: &str) -> Result<()> {
let client = QdrantClient::from_url(qdrant_url).build()?;
// Check if collection exists
let collections = client.list_collections().await?;
let exists = collections
.collections
.iter()
.any(|c| c.name == self.collection_name);
if !exists {
// Create collection for file embeddings (1536 dimensions for OpenAI embeddings)
client
.create_collection(&CreateCollection {
collection_name: self.collection_name.clone(),
vectors_config: Some(VectorsConfig {
config: Some(Config::Params(VectorParams {
size: 1536,
distance: Distance::Cosine.into(),
..Default::default()
})),
}),
..Default::default()
})
.await?;
log::info!("Created drive vector collection: {}", self.collection_name);
}
self.client = Some(Arc::new(client));
Ok(())
}
#[cfg(not(feature = "vectordb"))]
pub async fn initialize(&mut self, _qdrant_url: &str) -> Result<()> {
log::warn!("Vector DB feature not enabled, using fallback storage");
fs::create_dir_all(&self.db_path).await?;
Ok(())
}
/// Index a single file (on-demand)
#[cfg(feature = "vectordb")]
pub async fn index_file(&self, file: &FileDocument, embedding: Vec<f32>) -> Result<()> {
let client = self
.client
.as_ref()
.ok_or_else(|| anyhow::anyhow!("Vector DB not initialized"))?;
let point = PointStruct::new(file.id.clone(), embedding, serde_json::to_value(file)?);
client
.upsert_points_blocking(self.collection_name.clone(), vec![point], None)
.await?;
log::debug!("Indexed file: {} - {}", file.id, file.file_name);
Ok(())
}
#[cfg(not(feature = "vectordb"))]
pub async fn index_file(&self, file: &FileDocument, _embedding: Vec<f32>) -> Result<()> {
// Fallback: Store in JSON file
let file_path = self.db_path.join(format!("{}.json", file.id));
let json = serde_json::to_string_pretty(file)?;
fs::write(file_path, json).await?;
Ok(())
}
/// Index multiple files in batch
pub async fn index_files_batch(&self, files: &[(FileDocument, Vec<f32>)]) -> Result<()> {
#[cfg(feature = "vectordb")]
{
let client = self
.client
.as_ref()
.ok_or_else(|| anyhow::anyhow!("Vector DB not initialized"))?;
let points: Vec<PointStruct> = files
.iter()
.filter_map(|(file, embedding)| {
serde_json::to_value(file).ok().map(|payload| {
PointStruct::new(file.id.clone(), embedding.clone(), payload)
})
})
.collect();
if !points.is_empty() {
client
.upsert_points_blocking(self.collection_name.clone(), points, None)
.await?;
}
}
#[cfg(not(feature = "vectordb"))]
{
for (file, embedding) in files {
self.index_file(file, embedding.clone()).await?;
}
}
Ok(())
}
/// Search files using vector similarity
#[cfg(feature = "vectordb")]
pub async fn search(
&self,
query: &FileSearchQuery,
query_embedding: Vec<f32>,
) -> Result<Vec<FileSearchResult>> {
let client = self
.client
.as_ref()
.ok_or_else(|| anyhow::anyhow!("Vector DB not initialized"))?;
// Build filter if specified
let mut filter = None;
if query.bucket.is_some() || query.file_type.is_some() || !query.tags.is_empty() {
let mut conditions = vec![];
if let Some(bucket) = &query.bucket {
conditions.push(qdrant_client::qdrant::Condition::matches(
"bucket",
bucket.clone(),
));
}
if let Some(file_type) = &query.file_type {
conditions.push(qdrant_client::qdrant::Condition::matches(
"file_type",
file_type.clone(),
));
}
for tag in &query.tags {
conditions.push(qdrant_client::qdrant::Condition::matches(
"tags",
tag.clone(),
));
}
if !conditions.is_empty() {
filter = Some(qdrant_client::qdrant::Filter::must(conditions));
}
}
let search_result = client
.search_points(&qdrant_client::qdrant::SearchPoints {
collection_name: self.collection_name.clone(),
vector: query_embedding,
limit: query.limit as u64,
filter,
with_payload: Some(true.into()),
..Default::default()
})
.await?;
let mut results = Vec::new();
for point in search_result.result {
if let Some(payload) = point.payload {
let file: FileDocument = serde_json::from_value(serde_json::to_value(&payload)?)?;
// Create snippet and highlights
let snippet = self.create_snippet(&file.content_text, &query.query_text, 200);
let highlights = self.extract_highlights(&file.content_text, &query.query_text, 3);
results.push(FileSearchResult {
file,
score: point.score,
snippet,
highlights,
});
}
}
Ok(results)
}
#[cfg(not(feature = "vectordb"))]
pub async fn search(
&self,
query: &FileSearchQuery,
_query_embedding: Vec<f32>,
) -> Result<Vec<FileSearchResult>> {
// Fallback: Simple text search in JSON files
let mut results = Vec::new();
let mut entries = fs::read_dir(&self.db_path).await?;
while let Some(entry) = entries.next_entry().await? {
if entry.path().extension().and_then(|s| s.to_str()) == Some("json") {
let content = fs::read_to_string(entry.path()).await?;
if let Ok(file) = serde_json::from_str::<FileDocument>(&content) {
// Apply filters
if let Some(bucket) = &query.bucket {
if &file.bucket != bucket {
continue;
}
}
if let Some(file_type) = &query.file_type {
if &file.file_type != file_type {
continue;
}
}
// Simple text matching
let query_lower = query.query_text.to_lowercase();
if file.file_name.to_lowercase().contains(&query_lower)
|| file.content_text.to_lowercase().contains(&query_lower)
|| file
.content_summary
.as_ref()
.map_or(false, |s| s.to_lowercase().contains(&query_lower))
{
let snippet =
self.create_snippet(&file.content_text, &query.query_text, 200);
let highlights =
self.extract_highlights(&file.content_text, &query.query_text, 3);
results.push(FileSearchResult {
file,
score: 1.0,
snippet,
highlights,
});
}
}
if results.len() >= query.limit {
break;
}
}
}
Ok(results)
}
/// Create a snippet around the query match
fn create_snippet(&self, content: &str, query: &str, max_length: usize) -> String {
let content_lower = content.to_lowercase();
let query_lower = query.to_lowercase();
if let Some(pos) = content_lower.find(&query_lower) {
let start = pos.saturating_sub(max_length / 2);
let end = (pos + query.len() + max_length / 2).min(content.len());
let snippet = &content[start..end];
if start > 0 && end < content.len() {
format!("...{}...", snippet)
} else if start > 0 {
format!("...{}", snippet)
} else if end < content.len() {
format!("{}...", snippet)
} else {
snippet.to_string()
}
} else if content.len() > max_length {
format!("{}...", &content[..max_length])
} else {
content.to_string()
}
}
/// Extract highlighted segments containing the query
fn extract_highlights(&self, content: &str, query: &str, max_highlights: usize) -> Vec<String> {
let content_lower = content.to_lowercase();
let query_lower = query.to_lowercase();
let mut highlights = Vec::new();
let mut pos = 0;
while let Some(found_pos) = content_lower[pos..].find(&query_lower) {
let actual_pos = pos + found_pos;
let start = actual_pos.saturating_sub(40);
let end = (actual_pos + query.len() + 40).min(content.len());
highlights.push(content[start..end].to_string());
if highlights.len() >= max_highlights {
break;
}
pos = actual_pos + query.len();
}
highlights
}
/// Delete file from index
#[cfg(feature = "vectordb")]
pub async fn delete_file(&self, file_id: &str) -> Result<()> {
let client = self
.client
.as_ref()
.ok_or_else(|| anyhow::anyhow!("Vector DB not initialized"))?;
client
.delete_points(
self.collection_name.clone(),
&vec![file_id.into()].into(),
None,
)
.await?;
log::debug!("Deleted file from index: {}", file_id);
Ok(())
}
#[cfg(not(feature = "vectordb"))]
pub async fn delete_file(&self, file_id: &str) -> Result<()> {
let file_path = self.db_path.join(format!("{}.json", file_id));
if file_path.exists() {
fs::remove_file(file_path).await?;
}
Ok(())
}
/// Get indexed file count
#[cfg(feature = "vectordb")]
pub async fn get_count(&self) -> Result<u64> {
let client = self
.client
.as_ref()
.ok_or_else(|| anyhow::anyhow!("Vector DB not initialized"))?;
let info = client.collection_info(self.collection_name.clone()).await?;
Ok(info.result.unwrap().points_count.unwrap_or(0))
}
#[cfg(not(feature = "vectordb"))]
pub async fn get_count(&self) -> Result<u64> {
let mut count = 0;
let mut entries = fs::read_dir(&self.db_path).await?;
while let Some(entry) = entries.next_entry().await? {
if entry.path().extension().and_then(|s| s.to_str()) == Some("json") {
count += 1;
}
}
Ok(count)
}
/// Update file metadata without re-indexing content
pub async fn update_file_metadata(&self, file_id: &str, tags: Vec<String>) -> Result<()> {
// Read existing file
#[cfg(not(feature = "vectordb"))]
{
let file_path = self.db_path.join(format!("{}.json", file_id));
if file_path.exists() {
let content = fs::read_to_string(&file_path).await?;
let mut file: FileDocument = serde_json::from_str(&content)?;
file.tags = tags;
let json = serde_json::to_string_pretty(&file)?;
fs::write(file_path, json).await?;
}
}
#[cfg(feature = "vectordb")]
{
// Update payload in Qdrant
log::warn!("Metadata update not yet implemented for Qdrant backend");
}
Ok(())
}
/// Clear all indexed files
#[cfg(feature = "vectordb")]
pub async fn clear(&self) -> Result<()> {
let client = self
.client
.as_ref()
.ok_or_else(|| anyhow::anyhow!("Vector DB not initialized"))?;
client
.delete_collection(self.collection_name.clone())
.await?;
// Recreate empty collection
client
.create_collection(&CreateCollection {
collection_name: self.collection_name.clone(),
vectors_config: Some(VectorsConfig {
config: Some(Config::Params(VectorParams {
size: 1536,
distance: Distance::Cosine.into(),
..Default::default()
})),
}),
..Default::default()
})
.await?;
log::info!("Cleared drive vector collection: {}", self.collection_name);
Ok(())
}
#[cfg(not(feature = "vectordb"))]
pub async fn clear(&self) -> Result<()> {
if self.db_path.exists() {
fs::remove_dir_all(&self.db_path).await?;
fs::create_dir_all(&self.db_path).await?;
}
Ok(())
}
}
/// File content extractor for different file types
pub struct FileContentExtractor;
impl FileContentExtractor {
/// Extract text content from file based on type
pub async fn extract_text(file_path: &PathBuf, mime_type: &str) -> Result<String> {
match mime_type {
// Plain text files
"text/plain" | "text/markdown" | "text/csv" => {
let content = fs::read_to_string(file_path).await?;
Ok(content)
}
// Code files
t if t.starts_with("text/") => {
let content = fs::read_to_string(file_path).await?;
Ok(content)
}
// TODO: Add support for:
// - PDF extraction
// - Word document extraction
// - Excel/spreadsheet extraction
// - Images (OCR)
// - Audio (transcription)
_ => {
log::warn!("Unsupported file type for indexing: {}", mime_type);
Ok(String::new())
}
}
}
/// Determine if file should be indexed based on type
pub fn should_index(mime_type: &str, file_size: u64) -> bool {
// Skip very large files (> 10MB)
if file_size > 10 * 1024 * 1024 {
return false;
}
// Index text-based files
matches!(
mime_type,
"text/plain"
| "text/markdown"
| "text/csv"
| "text/html"
| "application/json"
| "text/x-python"
| "text/x-rust"
| "text/javascript"
| "text/x-java"
)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_file_document_creation() {
let file = FileDocument {
id: "test-123".to_string(),
file_path: "/test/file.txt".to_string(),
file_name: "file.txt".to_string(),
file_type: "text".to_string(),
file_size: 1024,
bucket: "test-bucket".to_string(),
content_text: "Test file content".to_string(),
content_summary: Some("Summary".to_string()),
created_at: Utc::now(),
modified_at: Utc::now(),
indexed_at: Utc::now(),
mime_type: Some("text/plain".to_string()),
tags: vec!["test".to_string()],
};
assert_eq!(file.id, "test-123");
assert_eq!(file.file_name, "file.txt");
}
#[test]
fn test_should_index() {
assert!(FileContentExtractor::should_index("text/plain", 1024));
assert!(FileContentExtractor::should_index("text/markdown", 5000));
assert!(!FileContentExtractor::should_index(
"text/plain",
20 * 1024 * 1024
));
assert!(!FileContentExtractor::should_index("video/mp4", 1024));
}
#[tokio::test]
async fn test_user_drive_vectordb_creation() {
let temp_dir = std::env::temp_dir().join("test_drive_vectordb");
let db = UserDriveVectorDB::new(Uuid::new_v4(), Uuid::new_v4(), temp_dir);
assert!(db.collection_name.starts_with("drive_"));
}
}