2025-11-26 22:54:22 -03:00
|
|
|
use anyhow::{Context, Result};
|
|
|
|
|
use log::{debug, info, warn};
|
|
|
|
|
use reqwest::Client;
|
|
|
|
|
use serde::{Deserialize, Serialize};
|
|
|
|
|
|
|
|
|
|
use std::sync::Arc;
|
|
|
|
|
use tokio::sync::Semaphore;
|
|
|
|
|
|
|
|
|
|
use super::document_processor::TextChunk;
|
|
|
|
|
|
|
|
|
|
/// Embedding model configuration
|
|
|
|
|
#[derive(Debug, Clone)]
|
|
|
|
|
pub struct EmbeddingConfig {
|
|
|
|
|
/// URL for the embedding service (e.g., http://localhost:8082)
|
|
|
|
|
pub embedding_url: String,
|
|
|
|
|
/// Model name/path for embeddings (e.g., bge-small-en-v1.5)
|
|
|
|
|
pub embedding_model: String,
|
|
|
|
|
/// Dimension of embeddings (e.g., 384, 768, 1536)
|
|
|
|
|
pub dimensions: usize,
|
|
|
|
|
/// Maximum batch size for embedding generation
|
|
|
|
|
pub batch_size: usize,
|
|
|
|
|
/// Request timeout in seconds
|
|
|
|
|
pub timeout_seconds: u64,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl Default for EmbeddingConfig {
|
|
|
|
|
fn default() -> Self {
|
|
|
|
|
Self {
|
|
|
|
|
embedding_url: "http://localhost:8082".to_string(),
|
|
|
|
|
embedding_model: "bge-small-en-v1.5".to_string(),
|
|
|
|
|
dimensions: 384, // Default for bge-small
|
|
|
|
|
batch_size: 32,
|
|
|
|
|
timeout_seconds: 30,
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl EmbeddingConfig {
|
|
|
|
|
/// Create config from environment or config.csv values
|
|
|
|
|
pub fn from_env() -> Self {
|
Add .env.example with comprehensive configuration template
The commit adds a complete example environment configuration file
documenting all available settings for BotServer, including logging,
database, server, drive, LLM, Redis, email, and feature flags.
Also removes hardcoded environment variable usage throughout the
codebase, replacing them with configuration via config.csv or
appropriate defaults. This includes:
- WhatsApp, Teams, Instagram adapter configurations
- Weather API key handling
- Email and directory service configurations
- Console feature conditionally compiles monitoring code
- Improved logging configuration with library suppression
2025-11-28 13:19:03 -03:00
|
|
|
// Use defaults - can be configured via config.csv if needed
|
|
|
|
|
let embedding_url = "http://localhost:8082".to_string();
|
2025-11-26 22:54:22 -03:00
|
|
|
|
Add .env.example with comprehensive configuration template
The commit adds a complete example environment configuration file
documenting all available settings for BotServer, including logging,
database, server, drive, LLM, Redis, email, and feature flags.
Also removes hardcoded environment variable usage throughout the
codebase, replacing them with configuration via config.csv or
appropriate defaults. This includes:
- WhatsApp, Teams, Instagram adapter configurations
- Weather API key handling
- Email and directory service configurations
- Console feature conditionally compiles monitoring code
- Improved logging configuration with library suppression
2025-11-28 13:19:03 -03:00
|
|
|
let embedding_model = "bge-small-en-v1.5".to_string();
|
2025-11-26 22:54:22 -03:00
|
|
|
|
|
|
|
|
// Detect dimensions based on model name
|
|
|
|
|
let dimensions = Self::detect_dimensions(&embedding_model);
|
|
|
|
|
|
|
|
|
|
Self {
|
|
|
|
|
embedding_url,
|
|
|
|
|
embedding_model,
|
|
|
|
|
dimensions,
|
|
|
|
|
batch_size: 32,
|
|
|
|
|
timeout_seconds: 30,
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Detect embedding dimensions based on model name
|
|
|
|
|
fn detect_dimensions(model: &str) -> usize {
|
|
|
|
|
if model.contains("small") || model.contains("MiniLM") {
|
|
|
|
|
384
|
|
|
|
|
} else if model.contains("base") || model.contains("mpnet") {
|
|
|
|
|
768
|
|
|
|
|
} else if model.contains("large") || model.contains("ada") {
|
|
|
|
|
1536
|
|
|
|
|
} else {
|
|
|
|
|
384 // Default
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Request payload for embedding generation
|
|
|
|
|
#[derive(Debug, Serialize)]
|
|
|
|
|
struct EmbeddingRequest {
|
|
|
|
|
input: Vec<String>,
|
|
|
|
|
model: String,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Response from embedding service
|
|
|
|
|
#[derive(Debug, Deserialize)]
|
|
|
|
|
struct EmbeddingResponse {
|
|
|
|
|
data: Vec<EmbeddingData>,
|
|
|
|
|
model: String,
|
|
|
|
|
usage: Option<EmbeddingUsage>,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[derive(Debug, Deserialize)]
|
|
|
|
|
struct EmbeddingData {
|
|
|
|
|
embedding: Vec<f32>,
|
2025-11-27 23:10:43 -03:00
|
|
|
_index: usize,
|
2025-11-26 22:54:22 -03:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[derive(Debug, Deserialize)]
|
|
|
|
|
struct EmbeddingUsage {
|
2025-11-27 23:10:43 -03:00
|
|
|
_prompt_tokens: usize,
|
2025-11-26 22:54:22 -03:00
|
|
|
total_tokens: usize,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Generated embedding with metadata
|
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
|
|
|
pub struct Embedding {
|
|
|
|
|
pub vector: Vec<f32>,
|
|
|
|
|
pub dimensions: usize,
|
|
|
|
|
pub model: String,
|
|
|
|
|
pub tokens_used: Option<usize>,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Knowledge base embedding generator
|
|
|
|
|
pub struct KbEmbeddingGenerator {
|
|
|
|
|
config: EmbeddingConfig,
|
|
|
|
|
client: Client,
|
|
|
|
|
semaphore: Arc<Semaphore>,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl std::fmt::Debug for KbEmbeddingGenerator {
|
|
|
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
|
|
|
f.debug_struct("KbEmbeddingGenerator")
|
|
|
|
|
.field("config", &self.config)
|
|
|
|
|
.field("client", &"Client")
|
|
|
|
|
.field("semaphore", &"Semaphore")
|
|
|
|
|
.finish()
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl KbEmbeddingGenerator {
|
|
|
|
|
pub fn new(config: EmbeddingConfig) -> Self {
|
|
|
|
|
let client = Client::builder()
|
|
|
|
|
.timeout(std::time::Duration::from_secs(config.timeout_seconds))
|
|
|
|
|
.build()
|
|
|
|
|
.expect("Failed to create HTTP client");
|
|
|
|
|
|
|
|
|
|
// Limit concurrent requests
|
|
|
|
|
let semaphore = Arc::new(Semaphore::new(4));
|
|
|
|
|
|
|
|
|
|
Self {
|
|
|
|
|
config,
|
|
|
|
|
client,
|
|
|
|
|
semaphore,
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Generate embeddings for text chunks
|
|
|
|
|
pub async fn generate_embeddings(
|
|
|
|
|
&self,
|
|
|
|
|
chunks: &[TextChunk],
|
|
|
|
|
) -> Result<Vec<(TextChunk, Embedding)>> {
|
|
|
|
|
if chunks.is_empty() {
|
|
|
|
|
return Ok(Vec::new());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
info!("Generating embeddings for {} chunks", chunks.len());
|
|
|
|
|
|
|
|
|
|
let mut results = Vec::new();
|
|
|
|
|
|
|
|
|
|
// Process in batches
|
|
|
|
|
for batch in chunks.chunks(self.config.batch_size) {
|
|
|
|
|
let batch_embeddings = self.generate_batch_embeddings(batch).await?;
|
|
|
|
|
|
|
|
|
|
// Pair chunks with their embeddings
|
|
|
|
|
for (chunk, embedding) in batch.iter().zip(batch_embeddings.iter()) {
|
|
|
|
|
results.push((chunk.clone(), embedding.clone()));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
info!("Generated {} embeddings", results.len());
|
|
|
|
|
|
|
|
|
|
Ok(results)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Generate embeddings for a batch of chunks
|
|
|
|
|
async fn generate_batch_embeddings(&self, chunks: &[TextChunk]) -> Result<Vec<Embedding>> {
|
|
|
|
|
let _permit = self.semaphore.acquire().await?;
|
|
|
|
|
|
|
|
|
|
let texts: Vec<String> = chunks.iter().map(|c| c.content.clone()).collect();
|
|
|
|
|
|
|
|
|
|
debug!("Generating embeddings for batch of {} texts", texts.len());
|
|
|
|
|
|
|
|
|
|
// Try local embedding service first
|
|
|
|
|
match self.generate_local_embeddings(&texts).await {
|
|
|
|
|
Ok(embeddings) => Ok(embeddings),
|
|
|
|
|
Err(e) => {
|
|
|
|
|
warn!("Local embedding service failed: {}, trying OpenAI API", e);
|
|
|
|
|
self.generate_openai_embeddings(&texts).await
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Generate embeddings using local service
|
|
|
|
|
async fn generate_local_embeddings(&self, texts: &[String]) -> Result<Vec<Embedding>> {
|
|
|
|
|
let request = EmbeddingRequest {
|
|
|
|
|
input: texts.to_vec(),
|
|
|
|
|
model: self.config.embedding_model.clone(),
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
let response = self
|
|
|
|
|
.client
|
|
|
|
|
.post(&format!("{}/embeddings", self.config.embedding_url))
|
|
|
|
|
.json(&request)
|
|
|
|
|
.send()
|
|
|
|
|
.await
|
|
|
|
|
.context("Failed to send request to embedding service")?;
|
|
|
|
|
|
|
|
|
|
if !response.status().is_success() {
|
|
|
|
|
let status = response.status();
|
|
|
|
|
let error_text = response.text().await.unwrap_or_default();
|
|
|
|
|
return Err(anyhow::anyhow!(
|
|
|
|
|
"Embedding service error {}: {}",
|
|
|
|
|
status,
|
|
|
|
|
error_text
|
|
|
|
|
));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let embedding_response: EmbeddingResponse = response
|
|
|
|
|
.json()
|
|
|
|
|
.await
|
|
|
|
|
.context("Failed to parse embedding response")?;
|
|
|
|
|
|
|
|
|
|
let mut embeddings = Vec::new();
|
|
|
|
|
for data in embedding_response.data {
|
|
|
|
|
embeddings.push(Embedding {
|
|
|
|
|
vector: data.embedding,
|
|
|
|
|
dimensions: self.config.dimensions,
|
|
|
|
|
model: embedding_response.model.clone(),
|
|
|
|
|
tokens_used: embedding_response.usage.as_ref().map(|u| u.total_tokens),
|
|
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Ok(embeddings)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Generate embeddings using OpenAI API (fallback)
|
Add .env.example with comprehensive configuration template
The commit adds a complete example environment configuration file
documenting all available settings for BotServer, including logging,
database, server, drive, LLM, Redis, email, and feature flags.
Also removes hardcoded environment variable usage throughout the
codebase, replacing them with configuration via config.csv or
appropriate defaults. This includes:
- WhatsApp, Teams, Instagram adapter configurations
- Weather API key handling
- Email and directory service configurations
- Console feature conditionally compiles monitoring code
- Improved logging configuration with library suppression
2025-11-28 13:19:03 -03:00
|
|
|
async fn generate_openai_embeddings(&self, _texts: &[String]) -> Result<Vec<Embedding>> {
|
|
|
|
|
// OpenAI embeddings disabled - use local embedding service instead
|
|
|
|
|
Err(anyhow::anyhow!(
|
|
|
|
|
"OpenAI embeddings not configured - use local embedding service"
|
|
|
|
|
))
|
2025-11-26 22:54:22 -03:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Generate embedding for a single text
|
|
|
|
|
pub async fn generate_single_embedding(&self, text: &str) -> Result<Embedding> {
|
|
|
|
|
let embeddings = self
|
|
|
|
|
.generate_batch_embeddings(&[TextChunk {
|
|
|
|
|
content: text.to_string(),
|
|
|
|
|
metadata: super::document_processor::ChunkMetadata {
|
|
|
|
|
document_path: "query".to_string(),
|
|
|
|
|
document_title: None,
|
|
|
|
|
chunk_index: 0,
|
|
|
|
|
total_chunks: 1,
|
|
|
|
|
start_char: 0,
|
|
|
|
|
end_char: text.len(),
|
|
|
|
|
page_number: None,
|
|
|
|
|
},
|
|
|
|
|
}])
|
|
|
|
|
.await?;
|
|
|
|
|
|
|
|
|
|
embeddings
|
|
|
|
|
.into_iter()
|
|
|
|
|
.next()
|
|
|
|
|
.ok_or_else(|| anyhow::anyhow!("No embedding generated"))
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Generic embedding generator for other uses (email, etc.)
|
|
|
|
|
pub struct EmbeddingGenerator {
|
|
|
|
|
kb_generator: KbEmbeddingGenerator,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl std::fmt::Debug for EmbeddingGenerator {
|
|
|
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
|
|
|
f.debug_struct("EmbeddingGenerator")
|
|
|
|
|
.field("kb_generator", &self.kb_generator)
|
|
|
|
|
.finish()
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl EmbeddingGenerator {
|
|
|
|
|
pub fn new(llm_endpoint: String) -> Self {
|
|
|
|
|
let config = EmbeddingConfig {
|
|
|
|
|
embedding_url: llm_endpoint,
|
|
|
|
|
..Default::default()
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
Self {
|
|
|
|
|
kb_generator: KbEmbeddingGenerator::new(config),
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Generate embedding for arbitrary text
|
|
|
|
|
pub async fn generate_text_embedding(&self, text: &str) -> Result<Vec<f32>> {
|
|
|
|
|
let embedding = self.kb_generator.generate_single_embedding(text).await?;
|
|
|
|
|
Ok(embedding.vector)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Email-specific embedding generator (for compatibility)
|
|
|
|
|
pub struct EmailEmbeddingGenerator {
|
|
|
|
|
generator: EmbeddingGenerator,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl std::fmt::Debug for EmailEmbeddingGenerator {
|
|
|
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
|
|
|
f.debug_struct("EmailEmbeddingGenerator")
|
|
|
|
|
.field("generator", &self.generator)
|
|
|
|
|
.finish()
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl EmailEmbeddingGenerator {
|
|
|
|
|
pub fn new(llm_endpoint: String) -> Self {
|
|
|
|
|
Self {
|
|
|
|
|
generator: EmbeddingGenerator::new(llm_endpoint),
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Generate embedding for email content
|
|
|
|
|
pub async fn generate_embedding(&self, email: &impl EmailLike) -> Result<Vec<f32>> {
|
|
|
|
|
let text = format!(
|
|
|
|
|
"Subject: {}\nFrom: {}\nTo: {}\n\n{}",
|
|
|
|
|
email.subject(),
|
|
|
|
|
email.from(),
|
|
|
|
|
email.to(),
|
|
|
|
|
email.body()
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
self.generator.generate_text_embedding(&text).await
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Generate embedding for text
|
|
|
|
|
pub async fn generate_text_embedding(&self, text: &str) -> Result<Vec<f32>> {
|
|
|
|
|
self.generator.generate_text_embedding(text).await
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Trait for email-like objects
|
|
|
|
|
pub trait EmailLike {
|
|
|
|
|
fn subject(&self) -> &str;
|
|
|
|
|
fn from(&self) -> &str;
|
|
|
|
|
fn to(&self) -> &str;
|
|
|
|
|
fn body(&self) -> &str;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Simple email struct for testing
|
|
|
|
|
#[derive(Debug)]
|
|
|
|
|
pub struct SimpleEmail {
|
|
|
|
|
pub id: String,
|
|
|
|
|
pub subject: String,
|
|
|
|
|
pub from: String,
|
|
|
|
|
pub to: String,
|
|
|
|
|
pub body: String,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl EmailLike for SimpleEmail {
|
|
|
|
|
fn subject(&self) -> &str {
|
|
|
|
|
&self.subject
|
|
|
|
|
}
|
|
|
|
|
fn from(&self) -> &str {
|
|
|
|
|
&self.from
|
|
|
|
|
}
|
|
|
|
|
fn to(&self) -> &str {
|
|
|
|
|
&self.to
|
|
|
|
|
}
|
|
|
|
|
fn body(&self) -> &str {
|
|
|
|
|
&self.body
|
|
|
|
|
}
|
|
|
|
|
}
|