//! BM25 Configuration for Tantivy-based sparse retrieval //! //! This module provides configuration for BM25 text search powered by Tantivy. //! Tantivy is a full-text search engine library (like Lucene) that implements //! the BM25 ranking algorithm. //! //! # Config.csv Parameters //! //! | Parameter | Default | Description | //! |-----------|---------|-------------| //! | `bm25-enabled` | `true` | Enable/disable BM25 sparse search | //! | `bm25-k1` | `1.2` | Term frequency saturation (0.5-3.0 typical) | //! | `bm25-b` | `0.75` | Document length normalization (0.0-1.0) | //! | `bm25-stemming` | `true` | Apply stemming to terms | //! | `bm25-stopwords` | `true` | Filter common stopwords | //! //! # Example config.csv //! //! ```csv //! bm25-enabled,true //! bm25-k1,1.2 //! bm25-b,0.75 //! bm25-stemming,true //! bm25-stopwords,true //! ``` //! //! # Switching BM25 On/Off //! //! To **disable** BM25 sparse search (use only dense/embedding search): //! ```csv //! bm25-enabled,false //! ``` //! //! To **enable** BM25 with custom tuning: //! ```csv //! bm25-enabled,true //! bm25-k1,1.5 //! bm25-b,0.5 //! ``` //! //! # How It Works //! //! When `bm25-enabled=true`: //! - Hybrid search uses BOTH BM25 (keyword) + Qdrant (embedding) results //! - Results are merged using Reciprocal Rank Fusion (RRF) //! - Good for queries where exact keyword matches matter //! //! When `bm25-enabled=false`: //! - Only dense (embedding) search via Qdrant is used //! - Faster but may miss exact keyword matches //! - Better for semantic/conceptual queries use diesel::prelude::*; use log::{debug, warn}; use serde::{Deserialize, Serialize}; use uuid::Uuid; use crate::shared::utils::DbPool; /// Configuration for BM25 sparse retrieval (powered by Tantivy) /// /// BM25 (Best Matching 25) is a ranking function used for information retrieval. /// This configuration controls the Tantivy-based BM25 implementation. #[derive(Clone, Debug, Serialize, Deserialize)] pub struct Bm25Config { /// Whether BM25 sparse search is enabled /// When false, only dense (embedding) search is used pub enabled: bool, /// Term frequency saturation parameter (typically 1.2-2.0) /// - Higher values: more weight to term frequency /// - Lower values: diminishing returns for repeated terms /// - Tantivy default: 1.2 pub k1: f32, /// Document length normalization parameter (0.0-1.0) /// - 0.0: no length normalization /// - 1.0: full length normalization (penalizes long documents) /// - Tantivy default: 0.75 pub b: f32, /// Whether to apply stemming to terms before indexing/searching /// Stemming reduces words to their root form (e.g., "running" → "run") pub stemming: bool, /// Whether to filter out common stopwords (e.g., "the", "a", "is") pub stopwords: bool, } impl Default for Bm25Config { fn default() -> Self { Self { enabled: true, k1: 1.2, b: 0.75, stemming: true, stopwords: true, } } } impl Bm25Config { /// Load BM25 configuration from bot_configuration table /// /// Reads parameters: `bm25-enabled`, `bm25-k1`, `bm25-b`, `bm25-stemming`, `bm25-stopwords` pub fn from_bot_config(pool: &DbPool, target_bot_id: &Uuid) -> Self { let mut config = Self::default(); let mut conn = match pool.get() { Ok(c) => c, Err(e) => { warn!("Failed to get database connection for BM25 config: {}", e); return config; } }; #[derive(QueryableByName)] struct ConfigRow { #[diesel(sql_type = diesel::sql_types::Text)] config_key: String, #[diesel(sql_type = diesel::sql_types::Text)] config_value: String, } let configs: Vec = diesel::sql_query( "SELECT config_key, config_value FROM bot_configuration \ WHERE bot_id = $1 AND config_key LIKE 'bm25-%'", ) .bind::(target_bot_id) .load(&mut conn) .unwrap_or_default(); for row in configs { match row.config_key.as_str() { "bm25-enabled" => { config.enabled = row.config_value.to_lowercase() == "true"; debug!("BM25 enabled: {}", config.enabled); } "bm25-k1" => { config.k1 = row.config_value.parse().unwrap_or(1.2); debug!("BM25 k1: {}", config.k1); } "bm25-b" => { config.b = row.config_value.parse().unwrap_or(0.75); debug!("BM25 b: {}", config.b); } "bm25-stemming" => { config.stemming = row.config_value.to_lowercase() == "true"; debug!("BM25 stemming: {}", config.stemming); } "bm25-stopwords" => { config.stopwords = row.config_value.to_lowercase() == "true"; debug!("BM25 stopwords: {}", config.stopwords); } _ => {} } } // Validate and clamp values config.validate(); config } /// Create config with BM25 disabled (dense-only search) pub fn disabled() -> Self { Self { enabled: false, ..Default::default() } } /// Create config with custom k1 and b parameters pub fn with_params(k1: f32, b: f32) -> Self { let mut config = Self { k1, b, ..Default::default() }; config.validate(); config } /// Validate and clamp configuration values to sensible ranges fn validate(&mut self) { // k1 should be positive, typically between 0.5 and 3.0 if self.k1 < 0.0 { warn!("BM25 k1 cannot be negative, setting to default 1.2"); self.k1 = 1.2; } else if self.k1 > 10.0 { warn!("BM25 k1 {} is unusually high, capping at 10.0", self.k1); self.k1 = 10.0; } // b should be between 0.0 and 1.0 if self.b < 0.0 { warn!("BM25 b cannot be negative, setting to 0.0"); self.b = 0.0; } else if self.b > 1.0 { warn!("BM25 b cannot exceed 1.0, capping at 1.0"); self.b = 1.0; } } /// Check if BM25 should be used in hybrid search pub fn is_enabled(&self) -> bool { self.enabled } /// Check if text preprocessing is enabled pub fn has_preprocessing(&self) -> bool { self.stemming || self.stopwords } /// Get a description of the current configuration pub fn describe(&self) -> String { if self.enabled { format!( "BM25(k1={}, b={}, stemming={}, stopwords={})", self.k1, self.b, self.stemming, self.stopwords ) } else { "BM25(disabled)".to_string() } } } /// Common English stopwords for filtering /// Used when `bm25-stopwords=true` pub const DEFAULT_STOPWORDS: &[&str] = &[ "a", "an", "and", "are", "as", "at", "be", "by", "for", "from", "has", "he", "in", "is", "it", "its", "of", "on", "or", "that", "the", "to", "was", "were", "will", "with", "this", "but", "they", "have", "had", "what", "when", "where", "who", "which", "why", "how", "all", "each", "every", "both", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "just", "can", "should", "now", "do", "does", "did", "done", "been", "being", "would", "could", "might", "must", "shall", "may", "am", "your", "our", "their", "his", "her", "my", "me", "him", "them", "us", "you", "i", "we", "she", "if", "then", "else", "about", "into", "over", "after", "before", "between", "under", "again", "further", "once", ]; /// Check if a word is a common stopword pub fn is_stopword(word: &str) -> bool { DEFAULT_STOPWORDS.contains(&word.to_lowercase().as_str()) }