Fix more clippy warnings: unused imports, raw string hashes, conditional imports

2025-12-24 09:42:16 -03:00 · 2025-12-24 09:42:16 -03:00 · 3a260a5703
commit 3a260a5703
parent 3a8510d191
3 changed files with 37 additions and 162 deletions
--- a/src/email/vectordb.rs
+++ b/src/email/vectordb.rs
@ -3,6 +3,7 @@ use chrono::{DateTime, Utc};
 use serde::{Deserialize, Serialize};
 use std::path::PathBuf;
 use std::sync::Arc;
+#[cfg(not(feature = "vectordb"))]
 use tokio::fs;
 use uuid::Uuid;

--- a/src/vector-db/hybrid_search.rs
+++ b/src/vector-db/hybrid_search.rs
@ -1,54 +1,12 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-use log::{debug, error, info, trace, warn};
+use log::{debug, info, warn};
 use serde::{Deserialize, Serialize};
 use std::collections::HashMap;
-use std::sync::Arc;
 use uuid::Uuid;

 use crate::shared::state::AppState;

-
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct HybridSearchConfig {
-
    pub dense_weight: f32,

    pub sparse_weight: f32,
@ -82,7 +40,6 @@ impl Default for HybridSearchConfig {
 }

 impl HybridSearchConfig {
-
    pub fn from_bot_config(state: &AppState, bot_id: Uuid) -> Self {
        use diesel::prelude::*;

@ -136,7 +93,6 @@ impl HybridSearchConfig {
            }
        }

-
        let total = config.dense_weight + config.sparse_weight;
        if total > 0.0 {
            config.dense_weight /= total;
@ -151,21 +107,17 @@ impl HybridSearchConfig {
        config
    }

-
    pub fn use_sparse_search(&self) -> bool {
        self.bm25_enabled && self.sparse_weight > 0.0
    }

-
    pub fn use_dense_search(&self) -> bool {
        self.dense_weight > 0.0
    }
 }

-
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct SearchResult {
-
    pub doc_id: String,

    pub content: String,
@ -179,7 +131,6 @@ pub struct SearchResult {
    pub search_method: SearchMethod,
 }

-
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
 pub enum SearchMethod {
    Dense,
@ -188,8 +139,6 @@ pub enum SearchMethod {
    Reranked,
 }

-
-
 pub struct BM25Index {
    doc_freq: HashMap<String, usize>,
    doc_count: usize,
@ -341,7 +290,6 @@ impl Default for BM25Index {
    }
 }

-
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct BM25Stats {
    pub doc_count: usize,
@ -350,7 +298,6 @@ pub struct BM25Stats {
    pub enabled: bool,
 }

-
 #[derive(Debug, Clone)]
 struct DocumentEntry {
    pub content: String,
@ -358,9 +305,7 @@ struct DocumentEntry {
    pub metadata: HashMap<String, String>,
 }

-
 pub struct HybridSearchEngine {
-
    bm25_index: BM25Index,

    documents: HashMap<String, DocumentEntry>,
@ -391,7 +336,6 @@ impl HybridSearchEngine {
        }
    }

-
    pub async fn index_document(
        &mut self,
        doc_id: &str,
@ -400,10 +344,8 @@ impl HybridSearchEngine {
        metadata: HashMap<String, String>,
        embedding: Option<Vec<f32>>,
    ) -> Result<(), String> {
-
        self.bm25_index.add_document(doc_id, content, source);

-
        self.documents.insert(
            doc_id.to_string(),
            DocumentEntry {
@ -413,7 +355,6 @@ impl HybridSearchEngine {
            },
        );

-
        if let Some(emb) = embedding {
            self.upsert_to_qdrant(doc_id, &emb).await?;
        }
@ -421,12 +362,10 @@ impl HybridSearchEngine {
        Ok(())
    }

-
    pub fn commit(&mut self) -> Result<(), String> {
        Ok(())
    }

-
    pub async fn remove_document(&mut self, doc_id: &str) -> Result<(), String> {
        self.bm25_index.remove_document(doc_id);
        self.documents.remove(doc_id);
@ -434,7 +373,6 @@ impl HybridSearchEngine {
        Ok(())
    }

-
    pub async fn search(
        &self,
        query: &str,
@ -442,7 +380,6 @@ impl HybridSearchEngine {
    ) -> Result<Vec<SearchResult>, String> {
        let fetch_count = self.config.max_results * 3;

-
        let sparse_results: Vec<(String, f32)> = if self.config.use_sparse_search() {
            self.bm25_index
                .search(query, fetch_count)
@ -453,7 +390,6 @@ impl HybridSearchEngine {
            Vec::new()
        };

-
        let dense_results = if self.config.use_dense_search() {
            if let Some(embedding) = query_embedding {
                self.search_qdrant(&embedding, fetch_count).await?
@ -464,7 +400,6 @@ impl HybridSearchEngine {
            Vec::new()
        };

-
        let (results, method) = if sparse_results.is_empty() && dense_results.is_empty() {
            (Vec::new(), SearchMethod::Hybrid)
        } else if sparse_results.is_empty() {
@ -478,7 +413,6 @@ impl HybridSearchEngine {
            )
        };

-
        let mut search_results: Vec<SearchResult> = results
            .into_iter()
            .filter_map(|(doc_id, score)| {
@ -495,7 +429,6 @@ impl HybridSearchEngine {
            .take(self.config.max_results)
            .collect();

-
        if self.config.reranker_enabled && !search_results.is_empty() {
            search_results = self.rerank(query, search_results).await?;
        }
@ -503,7 +436,6 @@ impl HybridSearchEngine {
        Ok(search_results)
    }

-
    pub fn sparse_search(&self, query: &str) -> Vec<SearchResult> {
        let results = self.bm25_index.search(query, self.config.max_results);

@ -522,7 +454,6 @@ impl HybridSearchEngine {
            .collect()
    }

-
    pub async fn dense_search(
        &self,
        query_embedding: Vec<f32>,
@ -548,7 +479,6 @@ impl HybridSearchEngine {
        Ok(search_results)
    }

-
    fn reciprocal_rank_fusion(
        &self,
        sparse: &[(String, f32)],
@ -557,23 +487,19 @@ impl HybridSearchEngine {
        let k = self.config.rrf_k as f32;
        let mut scores: HashMap<String, f32> = HashMap::new();

-
        for (rank, (doc_id, _)) in sparse.iter().enumerate() {
            let rrf_score = self.config.sparse_weight / (k + rank as f32 + 1.0);
            *scores.entry(doc_id.clone()).or_insert(0.0) += rrf_score;
        }

-
        for (rank, (doc_id, _)) in dense.iter().enumerate() {
            let rrf_score = self.config.dense_weight / (k + rank as f32 + 1.0);
            *scores.entry(doc_id.clone()).or_insert(0.0) += rrf_score;
        }

-
        let mut results: Vec<(String, f32)> = scores.into_iter().collect();
        results.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));

-
        let max_score = results.first().map(|(_, s)| *s).unwrap_or(0.0);
        if max_score > 0.0 {
            for (_, score) in &mut results {
@ -584,19 +510,18 @@ impl HybridSearchEngine {
        results
    }

-
    async fn rerank(
        &self,
        query: &str,
        results: Vec<SearchResult>,
    ) -> Result<Vec<SearchResult>, String> {
-
-
        let mut reranked = results;

        let query_lower = query.to_lowercase();
-        let query_terms: std::collections::HashSet<String> =
-            query_lower.split_whitespace().map(|s| s.to_string()).collect();
+        let query_terms: std::collections::HashSet<String> = query_lower
+            .split_whitespace()
+            .map(|s| s.to_string())
+            .collect();
        let query_terms_len = query_terms.len();

        for result in &mut reranked {
@ -623,7 +548,6 @@ impl HybridSearchEngine {
        Ok(reranked)
    }

-
    async fn search_qdrant(
        &self,
        embedding: &[f32],
@ -673,7 +597,6 @@ impl HybridSearchEngine {
        Ok(results)
    }

-
    async fn upsert_to_qdrant(&self, doc_id: &str, embedding: &[f32]) -> Result<(), String> {
        let client = reqwest::Client::new();

@ -702,7 +625,6 @@ impl HybridSearchEngine {
        Ok(())
    }

-
    async fn delete_from_qdrant(&self, doc_id: &str) -> Result<(), String> {
        let client = reqwest::Client::new();

@ -731,7 +653,6 @@ impl HybridSearchEngine {
        Ok(())
    }

-
    pub fn stats(&self) -> HybridSearchStats {
        let bm25_stats = self.bm25_index.stats();

@ -746,7 +667,6 @@ impl HybridSearchEngine {
    }
 }

-
 #[derive(Debug, Clone)]
 pub struct HybridSearchStats {
    pub total_documents: usize,
@ -757,7 +677,6 @@ pub struct HybridSearchStats {
    pub config: HybridSearchConfig,
 }

-
 pub struct QueryDecomposer {
    llm_endpoint: String,
    api_key: String,
@ -771,11 +690,9 @@ impl QueryDecomposer {
        }
    }

-
    pub async fn decompose(&self, query: &str) -> Result<Vec<String>, String> {
        let mut sub_queries = Vec::new();

-
        let conjunctions = ["and", "also", "as well as", "in addition to"];
        let mut parts: Vec<&str> = vec![query];

@ -821,7 +738,6 @@ impl QueryDecomposer {
        Ok(sub_queries)
    }

-
    pub fn synthesize(&self, query: &str, sub_answers: &[String]) -> String {
        if sub_answers.len() == 1 {
            return sub_answers[0].clone();
--- a/src/vector-db/vectordb_indexer.rs
+++ b/src/vector-db/vectordb_indexer.rs
@ -1,6 +1,5 @@
 use anyhow::Result;
 use chrono::{DateTime, Utc};
-use diesel::RunQueryDsl;
 use log::{error, info, warn};
 use std::collections::HashMap;
 use std::path::PathBuf;
@ -50,7 +49,6 @@ impl UserWorkspace {
    }
 }

-
 #[derive(Debug, Clone, PartialEq)]
 pub enum IndexingStatus {
    Idle,
@ -59,7 +57,6 @@ pub enum IndexingStatus {
    Failed(String),
 }

-
 #[derive(Debug, Clone)]
 pub struct IndexingStats {
    pub emails_indexed: u64,
@ -70,7 +67,6 @@ pub struct IndexingStats {
    pub errors: u64,
 }

-
 struct UserIndexingJob {
    user_id: Uuid,
    bot_id: Uuid,
@ -83,7 +79,6 @@ struct UserIndexingJob {
    status: IndexingStatus,
 }

-
 pub struct VectorDBIndexer {
    db_pool: DbPool,
    work_root: PathBuf,
@ -96,7 +91,6 @@ pub struct VectorDBIndexer {
 }

 impl VectorDBIndexer {
-
    pub fn new(
        db_pool: DbPool,
        work_root: PathBuf,
@ -115,7 +109,6 @@ impl VectorDBIndexer {
        }
    }

-
    pub async fn start(self: Arc<Self>) -> Result<()> {
        let mut running = self.running.write().await;
        if *running {
@ -135,17 +128,14 @@ impl VectorDBIndexer {
        Ok(())
    }

-
    pub async fn stop(&self) {
        let mut running = self.running.write().await;
        *running = false;
        info!("🛑 Stopping Vector DB Indexer");
    }

-
    async fn run_indexing_loop(self: Arc<Self>) {
        loop {
-
            {
                let running = self.running.read().await;
                if !*running {
@ -155,7 +145,6 @@ impl VectorDBIndexer {

            info!(" Running vector DB indexing cycle...");

-
            match self.get_active_users().await {
                Ok(users) => {
                    info!("Found {} active users to index", users.len());
@ -173,14 +162,12 @@ impl VectorDBIndexer {

            info!(" Indexing cycle complete");

-
            sleep(Duration::from_secs(self.interval_seconds)).await;
        }

        info!("Vector DB Indexer stopped");
    }

-
    async fn get_active_users(&self) -> Result<Vec<(Uuid, Uuid)>> {
        let conn = self.db_pool.clone();

@ -190,7 +177,6 @@ impl VectorDBIndexer {

            let mut db_conn = conn.get()?;

-
            let results: Vec<(Uuid, Uuid)> = user_sessions
                .select((user_id, bot_id))
                .distinct()
@ -201,11 +187,9 @@ impl VectorDBIndexer {
        .await?
    }

-
    async fn index_user_data(&self, user_id: Uuid, bot_id: Uuid) -> Result<()> {
        info!("Indexing user: {} (bot: {})", user_id, bot_id);

-
        let mut jobs = self.jobs.write().await;
        let job = jobs.entry(user_id).or_insert_with(|| {
            let workspace = UserWorkspace::new(self.work_root.clone(), &bot_id, &user_id);
@ -235,7 +219,6 @@ impl VectorDBIndexer {

        job.status = IndexingStatus::Running;

-
        if job.email_db.is_none() {
            let mut email_db =
                UserEmailVectorDB::new(user_id, bot_id, job.workspace.email_vectordb().into());
@ -264,17 +247,14 @@ impl VectorDBIndexer {

        drop(jobs);

-
        if let Err(e) = self.index_user_emails(user_id).await {
            error!("Failed to index emails for user {}: {}", user_id, e);
        }

-
        if let Err(e) = self.index_user_files(user_id).await {
            error!("Failed to index files for user {}: {}", user_id, e);
        }

-
        let mut jobs = self.jobs.write().await;
        if let Some(job) = jobs.get_mut(&user_id) {
            job.status = IndexingStatus::Idle;
@ -284,7 +264,6 @@ impl VectorDBIndexer {
        Ok(())
    }

-
    async fn index_user_emails(&self, user_id: Uuid) -> Result<()> {
        let jobs = self.jobs.read().await;
        let job = jobs
@ -299,7 +278,6 @@ impl VectorDBIndexer {
            }
        };

-
        let accounts = self.get_user_email_accounts(user_id).await?;

        info!(
@ -309,7 +287,6 @@ impl VectorDBIndexer {
        );

        for account_id in accounts {
-
            match self.get_unindexed_emails(user_id, &account_id).await {
                Ok(emails) => {
                    if emails.is_empty() {
@ -322,7 +299,6 @@ impl VectorDBIndexer {
                        account_id
                    );

-
                    for chunk in emails.chunks(self.batch_size) {
                        for email in chunk {
                            match self.embedding_generator.generate_embedding(&email).await {
@ -342,7 +318,6 @@ impl VectorDBIndexer {
                            }
                        }

-
                        sleep(Duration::from_millis(100)).await;
                    }
                }
@ -358,7 +333,6 @@ impl VectorDBIndexer {
        Ok(())
    }

-
    async fn index_user_files(&self, user_id: Uuid) -> Result<()> {
        let jobs = self.jobs.read().await;
        let job = jobs
@ -373,7 +347,6 @@ impl VectorDBIndexer {
            }
        };

-
        match self.get_unindexed_files(user_id).await {
            Ok(files) => {
                if files.is_empty() {
@ -382,16 +355,13 @@ impl VectorDBIndexer {

                info!("Indexing {} files for user {}", files.len(), user_id);

-
                for chunk in files.chunks(self.batch_size) {
                    for file in chunk {
-
                        let mime_type = file.mime_type.as_ref().map(|s| s.as_str()).unwrap_or("");
                        if !FileContentExtractor::should_index(&mime_type, file.file_size) {
                            continue;
                        }

-
                        let text = format!(
                            "File: {}\nType: {}\n\n{}",
                            file.file_name, file.file_type, file.content_text
@ -415,7 +385,6 @@ impl VectorDBIndexer {
                        }
                    }

-
                    sleep(Duration::from_millis(100)).await;
                }
            }
@ -427,7 +396,6 @@ impl VectorDBIndexer {
        Ok(())
    }

-
    async fn get_user_email_accounts(&self, user_id: Uuid) -> Result<Vec<String>> {
        let conn = self.db_pool.clone();

@ -490,7 +458,7 @@ impl VectorDBIndexer {
                folder: String,
            }

-            let query = r#"
+            let query = r"
                SELECT e.id, e.message_id, e.subject, e.from_address, e.to_addresses,
                       e.body_text, e.body_html, e.received_at, e.folder
                FROM emails e
@ -500,7 +468,7 @@ impl VectorDBIndexer {
                  AND (eis.indexed_at IS NULL OR eis.needs_reindex = true)
                ORDER BY e.received_at DESC
                LIMIT 100
-            "#;
+            ";

            let rows: Vec<EmailRow> = diesel::sql_query(query)
                .bind::<diesel::sql_types::Uuid, _>(user_id)
@ -510,20 +478,18 @@ impl VectorDBIndexer {

            let emails: Vec<EmailDocument> = rows
                .into_iter()
-                .map(|row| {
-                    EmailDocument {
-                        id: row.id.to_string(),
-                        account_id: account_id.clone(),
-                        from_email: row.from_address.clone(),
-                        from_name: row.from_address,
-                        to_email: row.to_addresses,
-                        subject: row.subject,
-                        body_text: row.body_text.unwrap_or_default(),
-                        date: row.received_at,
-                        folder: row.folder,
-                        has_attachments: false,
-                        thread_id: None,
-                    }
+                .map(|row| EmailDocument {
+                    id: row.id.to_string(),
+                    account_id: account_id.clone(),
+                    from_email: row.from_address.clone(),
+                    from_name: row.from_address,
+                    to_email: row.to_addresses,
+                    subject: row.subject,
+                    body_text: row.body_text.unwrap_or_default(),
+                    date: row.received_at,
+                    folder: row.folder,
+                    has_attachments: false,
+                    thread_id: None,
                })
                .collect();

@ -566,17 +532,16 @@ impl VectorDBIndexer {
                modified_at: DateTime<Utc>,
            }

-            let query = r#"
+            let query = r"
                SELECT f.id, f.file_path, f.file_name, f.file_type, f.file_size,
                       f.bucket, f.mime_type, f.created_at, f.modified_at
-                FROM files f
+                FROM user_files f
                LEFT JOIN file_index_status fis ON f.id = fis.file_id
                WHERE f.user_id = $1
                  AND (fis.indexed_at IS NULL OR fis.needs_reindex = true)
-                  AND f.file_size < 10485760
                ORDER BY f.modified_at DESC
                LIMIT 100
-            "#;
+            ";

            let rows: Vec<FileRow> = diesel::sql_query(query)
                .bind::<diesel::sql_types::Uuid, _>(user_id)
@ -585,22 +550,20 @@ impl VectorDBIndexer {

            let files: Vec<FileDocument> = rows
                .into_iter()
-                .map(|row| {
-                    FileDocument {
-                        id: row.id.to_string(),
-                        file_path: row.file_path,
-                        file_name: row.file_name,
-                        file_type: row.file_type,
-                        file_size: row.file_size as u64,
-                        bucket: row.bucket,
-                        content_text: String::new(),
-                        content_summary: None,
-                        created_at: row.created_at,
-                        modified_at: row.modified_at,
-                        indexed_at: Utc::now(),
-                        mime_type: row.mime_type,
-                        tags: Vec::new(),
-                    }
+                .map(|row| FileDocument {
+                    id: row.id.to_string(),
+                    file_path: row.file_path,
+                    file_name: row.file_name,
+                    file_type: row.file_type,
+                    file_size: row.file_size as u64,
+                    bucket: row.bucket,
+                    content_text: String::new(),
+                    content_summary: None,
+                    created_at: row.created_at,
+                    modified_at: row.modified_at,
+                    indexed_at: Utc::now(),
+                    mime_type: row.mime_type,
+                    tags: Vec::new(),
                })
                .collect();

@ -611,13 +574,11 @@ impl VectorDBIndexer {
        Ok(results)
    }

-
    pub async fn get_user_stats(&self, user_id: Uuid) -> Option<IndexingStats> {
        let jobs = self.jobs.read().await;
        jobs.get(&user_id).map(|job| job.stats.clone())
    }

-
    pub async fn get_overall_stats(&self) -> IndexingStats {
        let jobs = self.jobs.read().await;

@ -647,7 +608,6 @@ impl VectorDBIndexer {
        total_stats
    }

-
    pub async fn pause_user_indexing(&self, user_id: Uuid) -> Result<()> {
        let mut jobs = self.jobs.write().await;
        if let Some(job) = jobs.get_mut(&user_id) {
@ -657,7 +617,6 @@ impl VectorDBIndexer {
        Ok(())
    }

-
    pub async fn resume_user_indexing(&self, user_id: Uuid) -> Result<()> {
        let mut jobs = self.jobs.write().await;
        if let Some(job) = jobs.get_mut(&user_id) {
@ -667,7 +626,6 @@ impl VectorDBIndexer {
        Ok(())
    }

-
    pub async fn trigger_user_indexing(&self, user_id: Uuid, bot_id: Uuid) -> Result<()> {
        info!(" Triggering immediate indexing for user {}", user_id);
        self.index_user_data(user_id, bot_id).await