From ed5caa64b69e792d0e24e1619ebc687fbfa46e0d Mon Sep 17 00:00:00 2001
From: "Rodrigo Rodriguez (Pragmatismo)"
Date: Mon, 8 Sep 2025 14:58:22 -0300
Subject: [PATCH] - Local LLM embeddings.

---
 src/main.rs                      |  15 +-
 src/scripts/containers/bot.sh    |   3 +-
 src/scripts/containers/drive.sh  |   5 +-
 src/scripts/containers/system.sh |  15 +-
 src/scripts/containers/tables.sh | 117 ++++-----
 src/services/llm_local.rs        | 421 ++++++++++++++++++++-----------
 6 files changed, 365 insertions(+), 211 deletions(-)
 mode change 100644 => 100755 src/scripts/containers/tables.sh

diff --git a/src/main.rs b/src/main.rs
index 0665e3b..7da0941 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -10,7 +10,8 @@ use sqlx::PgPool;
 use crate::services::automation::AutomationService;
 use crate::services::email::{get_emails, list_emails, save_click, send_email};
 use crate::services::llm::{chat, chat_stream};
-use crate::services::llm_local::chat_completions_local;
+use crate::services::llm_local::ensure_llama_servers_running;
+use crate::services::llm_local::{chat_completions_local, embeddings_local};
 use crate::services::llm_provider::chat_completions;
 use crate::services::web_automation::{initialize_browser_pool, BrowserPool};
 
@@ -38,14 +39,9 @@ async fn main() -> std::io::Result<()> {
         "/usr/bin/brave-browser-beta".to_string(),
     ));
 
-    #[cfg(feature = "local_llm")]
-    {
-        use crate::services::llm_local::ensure_llama_server_running;
-
-        ensure_llama_server_running()
-            .await
-            .expect("Failed to initialize LLM local server.");
-    }
+    ensure_llama_servers_running()
+        .await
+        .expect("Failed to initialize LLM local server.");
 
     initialize_browser_pool()
         .await
@@ -89,6 +85,7 @@ async fn main() -> std::io::Result<()> {
             .service(chat_completions)
             .service(chat_completions_local)
             .service(chat)
+            .service(embeddings_local)
     })
     .bind((config.server.host.clone(), config.server.port))?
     .run()
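Note: the route registered above is handled by embeddings_local (added later in this patch) and follows the OpenAI embeddings wire format, so it can be smoke-tested with a plain HTTP call. A minimal sketch, assuming the API binds to localhost:8080 and no auth middleware sits in front (both are assumptions, not part of the patch); the model field is simply echoed back by the handler:

    # Illustrative smoke test; host and port are assumptions, not defined by this patch
    curl -s http://localhost:8080/v1/embeddings \
      -H "Content-Type: application/json" \
      -d '{"model": "bge-small-en-v1.5", "input": ["hello world"]}'
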
diff --git a/src/scripts/containers/bot.sh b/src/scripts/containers/bot.sh
index f879522..da226d5 100644
--- a/src/scripts/containers/bot.sh
+++ b/src/scripts/containers/bot.sh
@@ -13,7 +13,6 @@ sleep 15
 
 lxc exec "$PARAM_TENANT"-bot -- bash -c "
-
 apt-get update && apt-get install -y \
     build-essential cmake git pkg-config libjpeg-dev libtiff-dev \
     libpng-dev libavcodec-dev libavformat-dev libswscale-dev \
@@ -111,4 +110,4 @@ sudo systemctl start bot.service
 lxc config device remove "$PARAM_TENANT"-bot bot-proxy 2>/dev/null || true
 lxc config device add "$PARAM_TENANT"-bot bot-proxy proxy \
   listen=tcp:0.0.0.0:"$PARAM_BOT_PORT" \
-  connect=tcp:127.0.0.1:"$PARAM_BOT_PORT"
\ No newline at end of file
+  connect=tcp:127.0.0.1:"$PARAM_BOT_PORT"
diff --git a/src/scripts/containers/drive.sh b/src/scripts/containers/drive.sh
index ca7b476..5bd7f45 100644
--- a/src/scripts/containers/drive.sh
+++ b/src/scripts/containers/drive.sh
@@ -17,6 +17,9 @@ apt-get update && apt-get install -y wget
 
 wget https://dl.min.io/server/minio/release/linux-amd64/minio -O /usr/local/bin/minio
 chmod +x /usr/local/bin/minio
+wget https://dl.min.io/client/mc/release/linux-amd64/mc -O /usr/local/bin/mc
+chmod +x /usr/local/bin/mc
+
 useradd -r -s /bin/false minio-user || true
 mkdir -p /var/log/minio /data
 chown -R minio-user:minio-user /var/log/minio /data
@@ -53,4 +56,4 @@ lxc config device add "${PARAM_TENANT}-drive" minio-proxy proxy \
 lxc config device remove "${PARAM_TENANT}-drive" console-proxy 2>/dev/null || true
 lxc config device add "${PARAM_TENANT}-drive" console-proxy proxy \
   listen=tcp:0.0.0.0:"${PARAM_DRIVE_PORT}" \
-  connect=tcp:127.0.0.1:"${PARAM_DRIVE_PORT}"
\ No newline at end of file
+  connect=tcp:127.0.0.1:"${PARAM_DRIVE_PORT}"
diff --git a/src/scripts/containers/system.sh b/src/scripts/containers/system.sh
index 6c2d724..94da09c 100644
--- a/src/scripts/containers/system.sh
+++ b/src/scripts/containers/system.sh
@@ -17,12 +17,25 @@ sleep 15
 
 lxc exec $CONTAINER_NAME -- bash -c '
 
-apt-get update && apt-get install -y wget
+apt-get update && apt-get install -y wget unzip
+
 useradd -r -s /bin/false gbuser || true
 mkdir -p /opt/gbo/logs /opt/gbo/bin /opt/gbo/data /opt/gbo/conf
 chown -R gbuser:gbuser /opt/gbo/
+wget https://github.com/ggml-org/llama.cpp/releases/download/b6148/llama-b6148-bin-ubuntu-x64.zip
+mkdir llama.cpp
+mv llama-b6148-bin-ubuntu-x64.zip llama.cpp
+cd llama.cpp
+unzip llama-b6148-bin-ubuntu-x64.zip
+mv build/bin/* .
+rm -r build/bin
+rm llama-b6148-bin-ubuntu-x64.zip
+
+wget https://huggingface.co/bartowski/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf
+wget https://huggingface.co/CompendiumLabs/bge-small-en-v1.5-gguf/resolve/main/bge-small-en-v1.5-f32.gguf
+
 cat > /etc/systemd/system/system.service <
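Note: the llama.cpp build and the two GGUF files fetched above are what the new ensure_llama_servers_running() code further down in this patch points at through environment variables. A minimal .env sketch, assuming the downloads stay in /root/llama.cpp inside the container as this script leaves them; the variable names come from llm_local.rs, while the paths and ports shown are illustrative assumptions:

    # Illustrative values only; adjust paths and ports to the actual deployment
    LLM_LOCAL=true
    LLM_CPP_PATH=/root/llama.cpp
    LLM_MODEL_PATH=/root/llama.cpp/DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf
    EMBEDDING_MODEL_PATH=/root/llama.cpp/bge-small-en-v1.5-f32.gguf
    LLM_URL=http://localhost:8081
    EMBEDDING_URL=http://localhost:8082
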
diff --git a/src/scripts/containers/tables.sh b/src/scripts/containers/tables.sh
old mode 100644
new mode 100755
--- a/src/scripts/containers/tables.sh
+++ b/src/scripts/containers/tables.sh
 /etc/apt/trusted.gpg.d/postgresql.gpg
-apt-get install -y postgresql-14 postgresql-client-14
-if ! id postgres &>/dev/null; then
+sudo apt install -y postgresql-common
+sudo /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh
+apt install -y postgresql
-    exit 1
+systemctl stop postgresql
+
+mkdir -p /etc/systemd/system/postgresql.service.d/
+cat > /etc/systemd/system/postgresql.service.d/override.conf <
 >> \"\$HBA_FILE\"
+    fi
 fi
-systemctl stop postgresql@14-main 2>/dev/null || systemctl stop postgresql 2>/dev/null || true
+
+systemctl daemon-reload
+systemctl start postgresql
+systemctl enable postgresql
+
+until sudo -u postgres psql -p $PARAM_TABLES_PORT -c '\q' 2>/dev/null; do
+    echo \"Waiting for PostgreSQL to start on port $PARAM_TABLES_PORT...\"
+    sleep 3
+done
+
+sudo -u postgres psql -p $PARAM_TABLES_PORT -c \"CREATE USER $PARAM_TENANT WITH PASSWORD '$PARAM_TABLES_PASSWORD';\"
+sudo -u postgres psql -p $PARAM_TABLES_PORT -c \"CREATE DATABASE ${PARAM_TENANT}_db OWNER $PARAM_TENANT;\"
+sudo -u postgres psql -p $PARAM_TABLES_PORT -c \"GRANT ALL PRIVILEGES ON DATABASE ${PARAM_TENANT}_db TO $PARAM_TENANT;\"
+
+systemctl restart postgresql
 "
-POSTGRES_UID=$(lxc exec "$PARAM_TENANT"-tables -- id -u postgres)
-POSTGRES_GID=$(lxc exec "$PARAM_TENANT"-tables -- id -g postgres)
+lxc exec "$PARAM_TENANT"-tables -- systemctl stop postgresql
-HOST_POSTGRES_UID=$((100000 + POSTGRES_UID))
-HOST_POSTGRES_GID=$((100000 + POSTGRES_GID))
+PG_DATA_DIR=$(lxc exec "$PARAM_TENANT"-tables -- bash -c "find /var/lib/postgresql -name main -type d | head -1")
+PG_CONF_DIR=$(lxc exec "$PARAM_TENANT"-tables -- bash -c "find /etc/postgresql -name main -type d | head -1")
+PG_LOGS_DIR=$(lxc exec "$PARAM_TENANT"-tables -- bash -c "find /var/log/postgresql -name postgresql-*.log -o -name postgresql.log | head -1 | xargs dirname 2>/dev/null || echo /var/log/postgresql")
-chown -R "$HOST_POSTGRES_UID:$HOST_POSTGRES_GID" "$HOST_BASE"
-chmod -R 750 "$HOST_BASE"
+lxc config device add "$PARAM_TENANT"-tables pgdata disk source="$HOST_DATA" path="$PG_DATA_DIR"
+lxc config device add "$PARAM_TENANT"-tables pgconf disk source="$HOST_CONF" path="$PG_CONF_DIR"
+lxc config device add "$PARAM_TENANT"-tables pglogs disk source="$HOST_LOGS" path="$PG_LOGS_DIR"
-lxc config device add "$PARAM_TENANT"-tables pgdata disk source="$HOST_DATA" path=/var/lib/postgresql/14/main
-lxc config device add "$PARAM_TENANT"-tables pgconf disk source="$HOST_CONF" path=/etc/postgresql/14/main
-lxc config device add "$PARAM_TENANT"-tables pglogs disk source="$HOST_LOGS" path=/var/log/postgresql
+lxc exec "$PARAM_TENANT"-tables -- chown -R postgres:postgres "$PG_DATA_DIR"
+lxc exec "$PARAM_TENANT"-tables -- chown -R postgres:postgres "$PG_CONF_DIR"
+lxc exec "$PARAM_TENANT"-tables -- chown -R postgres:postgres "$PG_LOGS_DIR"
-mkdir -p /var/lib/postgresql/14/main
-mkdir -p /etc/postgresql/14/main
-mkdir -p /var/log/postgresql
-chown -R postgres:postgres /var/lib/postgresql/14/main
-chown -R postgres:postgres /etc/postgresql/14/main
-chown -R postgres:postgres /var/log/postgresql
-chmod 700 /var/lib/postgresql/14/main
-
-sudo -u postgres /usr/lib/postgresql/14/bin/initdb -D /var/lib/postgresql/14/main
-
-cat > /etc/postgresql/14/main/postgresql.conf <
-cat > /etc/postgresql/14/main/pg_hba.conf <
 lxc config device remove "$PARAM_TENANT"-tables postgres-proxy 2>/dev/null || true
 lxc config device add "$PARAM_TENANT"-tables postgres-proxy proxy \
   listen=tcp:0.0.0.0:"$PARAM_TABLES_PORT" \
   connect=tcp:127.0.0.1:"$PARAM_TABLES_PORT"
-cd /var/lib/postgresql
-until sudo -u postgres psql -p $PARAM_TABLES_PORT -c '\q' 2>/dev/null; do
-
-sleep 3
-sudo -u "$PARAM_TABLES_USER" psql -p $PARAM_TABLES_PORT -c \"CREATE USER $PARAM_TENANT WITH PASSWORD '$PARAM_TABLES_PASSWORD';\" 2>/dev/null
-sudo -u "$PARAM_TABLES_USER" psql -p $PARAM_TABLES_PORT -c \"CREATE DATABASE ${PARAM_TENANT}_db OWNER $PARAM_TENANT;\" 2>/dev/null
-sudo -u "$PARAM_TABLES_USER" psql -p $PARAM_TABLES_PORT -c \"GRANT ALL PRIVILEGES ON DATABASE ${PARAM_TENANT}_db TO $PARAM_TENANT;\" 2>/dev/null
+echo "PostgreSQL setup completed successfully!"
+echo "Database: ${PARAM_TENANT}_db"
+echo "User: $PARAM_TENANT"
+echo "Password: $PARAM_TABLES_PASSWORD"
+echo "Port: $PARAM_TABLES_PORT"
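Note: the credentials echoed above can be checked from the LXD host through the postgres-proxy device configured earlier. A minimal sketch, assuming a psql client is installed on the host and the pg_hba rule added inside the container permits the connection (neither is set up by this script):

    # Illustrative check from the LXD host; not part of the patch
    PGPASSWORD="$PARAM_TABLES_PASSWORD" psql -h 127.0.0.1 -p "$PARAM_TABLES_PORT" \
      -U "$PARAM_TENANT" -d "${PARAM_TENANT}_db" -c '\conninfo'
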
diff --git a/src/services/llm_local.rs b/src/services/llm_local.rs
index bef0973..8f56d13 100644
--- a/src/services/llm_local.rs
+++ b/src/services/llm_local.rs
@@ -59,157 +59,170 @@ struct LlamaCppResponse {
     generation_settings: Option,
 }
 
-// Function to check if server is running
-async fn is_server_running(url: &str) -> bool {
-    let client = Client::builder()
-        .timeout(Duration::from_secs(3))
-        .build()
-        .unwrap();
+pub async fn ensure_llama_servers_running() -> Result<(), Box<dyn std::error::Error + Send + Sync>>
+{
+    let llm_local = env::var("LLM_LOCAL").unwrap_or_else(|_| "false".to_string());
 
-    match client.get(&format!("{}/health", url)).send().await {
-        Ok(response) => {
-            let is_ok = response.status().is_success();
-            if is_ok {
-                println!("🟢 Server health check: OK");
-            } else {
-                println!(
-                    "🔴 Server health check: Failed with status {}",
-                    response.status()
-                );
-            }
-            is_ok
-        }
-        Err(e) => {
-            println!("🔴 Server health check: Connection failed - {}", e);
-            false
-        }
-    }
-}
-
-// Function to start llama.cpp server
-async fn start_llama_server() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
-    println!("🚀 Starting llama.cpp server...");
-
-    // Get environment variables for llama.cpp configuration
-    let llama_path = env::var("LLM_CPP_PATH").unwrap_or_else(|_| "llama-server".to_string());
-    let model_path = env::var("LLM_MODEL_PATH")
-        .unwrap_or_else(|_| "./tinyllama-1.1b-chat-v1.0.Q4_0.gguf".to_string());
-    let cpu_limit = env::var("CPU_LIMIT").unwrap_or_else(|_| "50".to_string());
-    let port = env::var("LLM_PORT").unwrap_or_else(|_| "8080".to_string());
-
-    println!("🔧 Configuration:");
-    println!(" - Llama path: {}", llama_path);
-    println!(" - Model path: {}", model_path);
-    println!(" - CPU limit: {}%", cpu_limit);
-    println!(" - Port: {}", port);
-
-    // Kill any existing llama processes
-    println!("🧹 Cleaning up existing processes...");
-    let _ = Command::new("pkill").arg("-f").arg("llama-cli").output();
-
-    // Wait a bit for cleanup
-    sleep(Duration::from_secs(2)).await;
-
-    // Build the command
-    let full_command = format!(
-        "{}/llama-server -m {} --mlock --port {} --host 127.0.0.1",
-        llama_path, model_path, port
-    );
-
-    println!("📝 Executing command: {}", full_command);
-
-    // Start llama.cpp server with cpulimit using tokio
-    let mut cmd = TokioCommand::new("sh");
-    cmd.arg("-c");
-    cmd.arg(&full_command);
-    cmd.stdout(Stdio::piped());
-    cmd.stderr(Stdio::piped());
-    cmd.kill_on_drop(true);
-
-    let mut child = cmd
-        .spawn()
-        .map_err(|e| format!("Failed to start llama.cpp server: {}", e))?;
-
-    println!("🔄 Process spawned with PID: {:?}", child.id());
-
-    // Capture stdout and stderr for real-time logging
-    if let Some(stdout) = child.stdout.take() {
-        let stdout_reader = BufReader::new(stdout);
-        tokio::spawn(async move {
-            let mut lines = stdout_reader.lines();
-            while let Ok(Some(line)) = lines.next_line().await {
-                println!("🦙📤 STDOUT: {}", line);
-            }
-            println!("🦙📤 STDOUT stream ended");
-        });
-    }
-
-    if let Some(stderr) = child.stderr.take() {
-        let stderr_reader = BufReader::new(stderr);
-        tokio::spawn(async move {
-            let mut lines = stderr_reader.lines();
-            while let Ok(Some(line)) = lines.next_line().await {
-                println!("🦙📥 STDERR: {}", line);
-            }
-            println!("🦙📥 STDERR stream ended");
-        });
-    }
-
-    // Store the process handle
-    unsafe {
-        LLAMA_PROCESS = Some(Arc::new(Mutex::new(Some(child))));
-    }
-
-    println!("✅ Llama.cpp server process started!");
-    Ok(())
-}
-
-// Function to ensure llama.cpp server is running
-pub async fn ensure_llama_server_running() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
-    let llama_url = env::var("LLM_URL").unwrap_or_else(|_| "http://localhost:8080".to_string());
-
-    // Check if server is already running
-    if is_server_running(&llama_url).await {
-        println!("✅ Llama.cpp server is already running");
+    if llm_local.to_lowercase() != "true" {
+        println!("ℹ️ LLM_LOCAL is not enabled, skipping local server startup");
         return Ok(());
     }
 
-    // Start the server
-    start_llama_server().await?;
+    // Get configuration from environment variables
+    let llm_url = env::var("LLM_URL").unwrap_or_else(|_| "http://localhost:8081".to_string());
+    let embedding_url =
+        env::var("EMBEDDING_URL").unwrap_or_else(|_| "http://localhost:8082".to_string());
+    let llama_cpp_path = env::var("LLM_CPP_PATH").unwrap_or_else(|_| "~/llama.cpp".to_string());
+    let llm_model_path = env::var("LLM_MODEL_PATH").unwrap_or_else(|_| "".to_string());
+    let embedding_model_path = env::var("EMBEDDING_MODEL_PATH").unwrap_or_else(|_| "".to_string());
+
+    println!("🚀 Starting local llama.cpp servers...");
+    println!("📋 Configuration:");
+    println!(" LLM URL: {}", llm_url);
+    println!(" Embedding URL: {}", embedding_url);
+    println!(" LLM Model: {}", llm_model_path);
+    println!(" Embedding Model: {}", embedding_model_path);
+
+    // Check if servers are already running
+    let llm_running = is_server_running(&llm_url).await;
+    let embedding_running = is_server_running(&embedding_url).await;
+
+    if llm_running && embedding_running {
+        println!("✅ Both LLM and Embedding servers are already running");
+        return Ok(());
+    }
+
+    // Start servers that aren't running
+    let mut tasks = vec![];
+
+    if !llm_running && !llm_model_path.is_empty() {
+        println!("🔄 Starting LLM server...");
+        tasks.push(tokio::spawn(start_llm_server(
+            llama_cpp_path.clone(),
+            llm_model_path.clone(),
+            llm_url.clone(),
+        )));
+    } else if llm_model_path.is_empty() {
+        println!("⚠️ LLM_MODEL_PATH not set, skipping LLM server");
+    }
+
+    if !embedding_running && !embedding_model_path.is_empty() {
+        println!("🔄 Starting Embedding server...");
+        tasks.push(tokio::spawn(start_embedding_server(
+            llama_cpp_path.clone(),
+            embedding_model_path.clone(),
+            embedding_url.clone(),
+        )));
+    } else if embedding_model_path.is_empty() {
+        println!("⚠️ EMBEDDING_MODEL_PATH not set, skipping Embedding server");
+    }
+
+    // Wait for all server startup tasks
+    for task in tasks {
+        task.await??;
+    }
+
+    // Wait for servers to be ready with verbose logging
+    println!("⏳ Waiting for servers to become ready...");
+
+    let mut llm_ready = llm_running || llm_model_path.is_empty();
+    let mut embedding_ready = embedding_running || embedding_model_path.is_empty();
 
-    // Wait for server to be ready with verbose logging
-    println!("⏳ Waiting for llama.cpp server to become ready...");
     let mut attempts = 0;
     let max_attempts = 60; // 2 minutes total
 
-    while attempts < max_attempts {
+    while attempts < max_attempts && (!llm_ready || !embedding_ready) {
         sleep(Duration::from_secs(2)).await;
-        print!(
-            "🔍 Checking server health (attempt {}/{})... ",
+        println!(
+            "🔍 Checking server health (attempt {}/{})...",
             attempts + 1,
             max_attempts
         );
 
-        if is_server_running(&llama_url).await {
-            println!("✅ SUCCESS!");
-            println!("🎉 Llama.cpp server is ready and responding!");
-            return Ok(());
-        } else {
-            println!("❌ Not ready yet");
+        if !llm_ready && !llm_model_path.is_empty() {
+            if is_server_running(&llm_url).await {
+                println!(" ✅ LLM server ready at {}", llm_url);
+                llm_ready = true;
+            } else {
+                println!(" ❌ LLM server not ready yet");
+            }
+        }
+
+        if !embedding_ready && !embedding_model_path.is_empty() {
+            if is_server_running(&embedding_url).await {
+                println!(" ✅ Embedding server ready at {}", embedding_url);
+                embedding_ready = true;
+            } else {
+                println!(" ❌ Embedding server not ready yet");
+            }
         }
 
         attempts += 1;
+
         if attempts % 10 == 0 {
             println!(
-                "⏰ Still waiting for llama.cpp server... (attempt {}/{})",
+                "⏰ Still waiting for servers... (attempt {}/{})",
                 attempts, max_attempts
             );
-            println!("💡 Check the logs above for any errors from the llama server");
         }
     }
 
-    Err("❌ Llama.cpp server failed to start within timeout (2 minutes)".into())
+    if llm_ready && embedding_ready {
+        println!("🎉 All llama.cpp servers are ready and responding!");
+        Ok(())
+    } else {
+        let mut error_msg = "❌ Servers failed to start within timeout:".to_string();
+        if !llm_ready && !llm_model_path.is_empty() {
+            error_msg.push_str(&format!("\n - LLM server at {}", llm_url));
+        }
+        if !embedding_ready && !embedding_model_path.is_empty() {
+            error_msg.push_str(&format!("\n - Embedding server at {}", embedding_url));
+        }
+        Err(error_msg.into())
+    }
+}
+
+async fn start_llm_server(
+    llama_cpp_path: String,
+    model_path: String,
+    url: String,
+) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
+    let port = url.split(':').last().unwrap_or("8081");
+
+    let mut cmd = tokio::process::Command::new("sh");
+    cmd.arg("-c").arg(format!(
+        "cd {} && ./llama-server -m {} --host 0.0.0.0 --port {} --n-gpu-layers 99 &",
+        llama_cpp_path, model_path, port
+    ));
+
+    cmd.spawn()?;
+    Ok(())
+}
+
+async fn start_embedding_server(
+    llama_cpp_path: String,
+    model_path: String,
+    url: String,
+) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
+    let port = url.split(':').last().unwrap_or("8082");
+
+    let mut cmd = tokio::process::Command::new("sh");
+    cmd.arg("-c").arg(format!(
+        "cd {} && ./llama-server -m {} --host 0.0.0.0 --port {} --embedding --n-gpu-layers 99 &",
+        llama_cpp_path, model_path, port
+    ));
+
+    cmd.spawn()?;
+    Ok(())
+}
+
+async fn is_server_running(url: &str) -> bool {
+    let client = reqwest::Client::new();
+    match client.get(&format!("{}/health", url)).send().await {
+        Ok(response) => response.status().is_success(),
+        Err(_) => false,
+    }
 }
 
 // Convert OpenAI chat messages to a single prompt
@@ -238,26 +251,15 @@ fn messages_to_prompt(messages: &[ChatMessage]) -> String {
 }
 
 // Proxy endpoint
-#[post("/v1/chat/completions1")]
+#[post("/v1/chat/completions")]
 pub async fn chat_completions_local(
     req_body: web::Json,
     _req: HttpRequest,
 ) -> Result<HttpResponse> {
     dotenv().ok().unwrap();
 
-    // Ensure llama.cpp server is running
-    if let Err(e) = ensure_llama_server_running().await {
-        eprintln!("Failed to start llama.cpp server: {}", e);
-        return Ok(HttpResponse::InternalServerError().json(serde_json::json!({
-            "error": {
-                "message": format!("Failed to start llama.cpp server: {}", e),
-                "type": "server_error"
-            }
-        })));
-    }
-
     // Get llama.cpp server URL
-    let llama_url = env::var("LLM_URL").unwrap_or_else(|_| "http://localhost:8080".to_string());
env::var("LLM_URL").unwrap_or_else(|_| "http://localhost:8081".to_string()); // Convert OpenAI format to llama.cpp format let prompt = messages_to_prompt(&req_body.messages); @@ -342,10 +344,147 @@ pub async fn chat_completions_local( } } +// OpenAI Embedding Request +#[derive(Debug, Deserialize)] +pub struct EmbeddingRequest { + pub input: Vec, + pub model: String, + #[serde(default)] + pub encoding_format: Option, +} + +// OpenAI Embedding Response +#[derive(Debug, Serialize)] +pub struct EmbeddingResponse { + pub object: String, + pub data: Vec, + pub model: String, + pub usage: Usage, +} + +#[derive(Debug, Serialize)] +pub struct EmbeddingData { + pub object: String, + pub embedding: Vec, + pub index: usize, +} + +#[derive(Debug, Serialize)] +pub struct Usage { + pub prompt_tokens: u32, + pub total_tokens: u32, +} + +// Llama.cpp Embedding Request +#[derive(Debug, Serialize)] +struct LlamaCppEmbeddingRequest { + pub content: String, +} + +// Llama.cpp Embedding Response +#[derive(Debug, Deserialize)] +struct LlamaCppEmbeddingResponse { + pub embedding: Vec, +} + +// Proxy endpoint for embeddings +#[post("/v1/embeddings")] +pub async fn embeddings_local( + req_body: web::Json, + _req: HttpRequest, +) -> Result { + dotenv().ok(); + + // Get llama.cpp server URL + let llama_url = env::var("LLM_URL").unwrap_or_else(|_| "http://localhost:8082".to_string()); + + let client = Client::builder() + .timeout(Duration::from_secs(120)) + .build() + .map_err(|e| { + eprintln!("Error creating HTTP client: {}", e); + actix_web::error::ErrorInternalServerError("Failed to create HTTP client") + })?; + + // Process each input text and get embeddings + let mut embeddings_data = Vec::new(); + let mut total_tokens = 0; + + for (index, input_text) in req_body.input.iter().enumerate() { + let llama_request = LlamaCppEmbeddingRequest { + content: input_text.clone(), + }; + + let response = client + .post(&format!("{}/embedding", llama_url)) + .header("Content-Type", "application/json") + .json(&llama_request) + .send() + .await + .map_err(|e| { + eprintln!("Error calling llama.cpp server for embedding: {}", e); + actix_web::error::ErrorInternalServerError( + "Failed to call llama.cpp server for embedding", + ) + })?; + + let status = response.status(); + + if status.is_success() { + let llama_response: LlamaCppEmbeddingResponse = response.json().await.map_err(|e| { + eprintln!("Error parsing llama.cpp embedding response: {}", e); + actix_web::error::ErrorInternalServerError( + "Failed to parse llama.cpp embedding response", + ) + })?; + + // Estimate token count (this is approximate since llama.cpp doesn't return token count for embeddings) + let estimated_tokens = (input_text.len() as f32 / 4.0).ceil() as u32; + total_tokens += estimated_tokens; + + embeddings_data.push(EmbeddingData { + object: "embedding".to_string(), + embedding: llama_response.embedding, + index, + }); + } else { + let error_text = response + .text() + .await + .unwrap_or_else(|_| "Unknown error".to_string()); + + eprintln!("Llama.cpp server error ({}): {}", status, error_text); + + let actix_status = actix_web::http::StatusCode::from_u16(status.as_u16()) + .unwrap_or(actix_web::http::StatusCode::INTERNAL_SERVER_ERROR); + + return Ok(HttpResponse::build(actix_status).json(serde_json::json!({ + "error": { + "message": format!("Failed to get embedding for input {}: {}", index, error_text), + "type": "server_error" + } + }))); + } + } + + // Build OpenAI-compatible response + let openai_response = EmbeddingResponse { + object: 
"list".to_string(), + data: embeddings_data, + model: req_body.model.clone(), + usage: Usage { + prompt_tokens: total_tokens, + total_tokens, + }, + }; + + Ok(HttpResponse::Ok().json(openai_response)) +} + // Health check endpoint #[actix_web::get("/health")] pub async fn health() -> Result { - let llama_url = env::var("LLM_URL").unwrap_or_else(|_| "http://localhost:8080".to_string()); + let llama_url = env::var("LLM_URL").unwrap_or_else(|_| "http://localhost:8081".to_string()); if is_server_running(&llama_url).await { Ok(HttpResponse::Ok().json(serde_json::json!({