use actix_web::{post, web, HttpRequest, HttpResponse, Result};
use dotenv::dotenv;
use reqwest::Client;
use serde::{Deserialize, Serialize};
use std::env;
use std::process::{Command, Stdio};
use std::sync::{Arc, Mutex};
use tokio::io::{AsyncBufReadExt, BufReader};
use tokio::process::Command as TokioCommand;
use tokio::time::{sleep, Duration};

// Global process handle
static mut LLAMA_PROCESS: Option<Arc<Mutex<Option<tokio::process::Child>>>> = None;
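// NOTE: LLAMA_PROCESS (and a few of the imports above, e.g. std::process and
// tokio::io) are not referenced anywhere else in this module. The servers are
// detached via `sh -c "... &"` below, so no child handle is currently stored here.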
// OpenAI-compatible request/response structures
#[derive(Debug, Serialize, Deserialize)]
struct ChatMessage {
    role: String,
    content: String,
}

#[derive(Debug, Serialize, Deserialize)]
struct ChatCompletionRequest {
    model: String,
    messages: Vec<ChatMessage>,
    stream: Option<bool>,
}

#[derive(Debug, Serialize, Deserialize)]
struct ChatCompletionResponse {
    id: String,
    object: String,
    created: u64,
    model: String,
    choices: Vec<Choice>,
}

#[derive(Debug, Serialize, Deserialize)]
struct Choice {
    message: ChatMessage,
    finish_reason: String,
}
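
// Illustrative request body these structs deserialize (field names follow the
// structs above; extra OpenAI fields such as `temperature` are simply ignored
// by serde's default handling of unknown fields):
//
//   {
//     "model": "local-model",
//     "messages": [
//       { "role": "system", "content": "You are a helpful assistant." },
//       { "role": "user", "content": "Hello!" }
//     ],
//     "stream": false
//   }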

// Llama.cpp server request/response structures
#[derive(Debug, Serialize, Deserialize)]
struct LlamaCppRequest {
    prompt: String,
    n_predict: Option<i32>,
    temperature: Option<f32>,
    top_k: Option<i32>,
    top_p: Option<f32>,
    stream: Option<bool>,
}

#[derive(Debug, Serialize, Deserialize)]
struct LlamaCppResponse {
    content: String,
    stop: bool,
    generation_settings: Option<serde_json::Value>,
}
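
// Illustrative wire format for the llama.cpp `/completion` endpoint as used by
// these structs (the real server response may carry additional fields, e.g.
// timing information, which deserialization simply ignores):
//
//   request:  { "prompt": "User: Hello!\n\nAssistant: ", "n_predict": 500,
//               "temperature": 0.7, "top_k": 40, "top_p": 0.9, "stream": false }
//   response: { "content": "Hi there!", "stop": true, "generation_settings": { ... } }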
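/// Starts the local llama.cpp servers (LLM and embedding) when `LLM_LOCAL=true`.
///
/// Reads `LLM_URL`, `EMBEDDING_URL`, `LLM_CPP_PATH`, `LLM_MODEL_PATH` and
/// `EMBEDDING_MODEL_PATH` from the environment, skips any server already
/// responding on its `/health` endpoint, spawns the missing ones, and then
/// polls for up to two minutes until both report healthy.
///
/// A minimal sketch of how this module might be wired into an actix-web
/// application (the module path `llm_proxy` and the bind address are
/// assumptions, not part of this file):
///
/// ```ignore
/// #[actix_web::main]
/// async fn main() -> std::io::Result<()> {
///     // Bring up the local llama.cpp servers first (no-op unless LLM_LOCAL=true).
///     if let Err(e) = llm_proxy::ensure_llama_servers_running().await {
///         eprintln!("Failed to start llama.cpp servers: {}", e);
///     }
///
///     actix_web::HttpServer::new(|| {
///         actix_web::App::new()
///             .service(llm_proxy::chat_completions_local)
///             .service(llm_proxy::embeddings_local)
///             .service(llm_proxy::health)
///     })
///     .bind(("0.0.0.0", 8080))?
///     .run()
///     .await
/// }
/// ```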
pub async fn ensure_llama_servers_running() -> Result<(), Box<dyn std::error::Error + Send + Sync>>
{
    let llm_local = env::var("LLM_LOCAL").unwrap_or_else(|_| "false".to_string());

    if llm_local.to_lowercase() != "true" {
        println!("ℹ️ LLM_LOCAL is not enabled, skipping local server startup");
        return Ok(());
    }

    // Get configuration from environment variables
    let llm_url = env::var("LLM_URL").unwrap_or_else(|_| "http://localhost:8081".to_string());
    let embedding_url =
        env::var("EMBEDDING_URL").unwrap_or_else(|_| "http://localhost:8082".to_string());
    let llama_cpp_path = env::var("LLM_CPP_PATH").unwrap_or_else(|_| "~/llama.cpp".to_string());
    let llm_model_path = env::var("LLM_MODEL_PATH").unwrap_or_else(|_| "".to_string());
    let embedding_model_path = env::var("EMBEDDING_MODEL_PATH").unwrap_or_else(|_| "".to_string());

    println!("🚀 Starting local llama.cpp servers...");
    println!("📋 Configuration:");
    println!(" LLM URL: {}", llm_url);
    println!(" Embedding URL: {}", embedding_url);
    println!(" LLM Model: {}", llm_model_path);
    println!(" Embedding Model: {}", embedding_model_path);

    // Check if servers are already running
    let llm_running = is_server_running(&llm_url).await;
    let embedding_running = is_server_running(&embedding_url).await;

    if llm_running && embedding_running {
        println!("✅ Both LLM and Embedding servers are already running");
        return Ok(());
    }

    // Start servers that aren't running
    let mut tasks = vec![];

    if !llm_running && !llm_model_path.is_empty() {
        println!("🔄 Starting LLM server...");
        tasks.push(tokio::spawn(start_llm_server(
            llama_cpp_path.clone(),
            llm_model_path.clone(),
            llm_url.clone(),
        )));
    } else if llm_model_path.is_empty() {
        println!("⚠️ LLM_MODEL_PATH not set, skipping LLM server");
    }

    if !embedding_running && !embedding_model_path.is_empty() {
        println!("🔄 Starting Embedding server...");
        tasks.push(tokio::spawn(start_embedding_server(
            llama_cpp_path.clone(),
            embedding_model_path.clone(),
            embedding_url.clone(),
        )));
    } else if embedding_model_path.is_empty() {
        println!("⚠️ EMBEDDING_MODEL_PATH not set, skipping Embedding server");
    }

    // Wait for all server startup tasks
    for task in tasks {
        task.await??;
    }

    // Wait for servers to be ready with verbose logging
    println!("⏳ Waiting for servers to become ready...");

    let mut llm_ready = llm_running || llm_model_path.is_empty();
    let mut embedding_ready = embedding_running || embedding_model_path.is_empty();

    let mut attempts = 0;
    let max_attempts = 60; // 2 minutes total

    while attempts < max_attempts && (!llm_ready || !embedding_ready) {
        sleep(Duration::from_secs(2)).await;

        println!(
            "🔍 Checking server health (attempt {}/{})...",
            attempts + 1,
            max_attempts
        );

        if !llm_ready && !llm_model_path.is_empty() {
            if is_server_running(&llm_url).await {
                println!(" ✅ LLM server ready at {}", llm_url);
                llm_ready = true;
            } else {
                println!(" ❌ LLM server not ready yet");
            }
        }

        if !embedding_ready && !embedding_model_path.is_empty() {
            if is_server_running(&embedding_url).await {
                println!(" ✅ Embedding server ready at {}", embedding_url);
                embedding_ready = true;
            } else {
                println!(" ❌ Embedding server not ready yet");
            }
        }

        attempts += 1;

        if attempts % 10 == 0 {
            println!(
                "⏰ Still waiting for servers... (attempt {}/{})",
                attempts, max_attempts
            );
        }
    }

    if llm_ready && embedding_ready {
        println!("🎉 All llama.cpp servers are ready and responding!");
        Ok(())
    } else {
        let mut error_msg = "❌ Servers failed to start within timeout:".to_string();
        if !llm_ready && !llm_model_path.is_empty() {
            error_msg.push_str(&format!("\n - LLM server at {}", llm_url));
        }
        if !embedding_ready && !embedding_model_path.is_empty() {
            error_msg.push_str(&format!("\n - Embedding server at {}", embedding_url));
        }
        Err(error_msg.into())
    }
}
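
/// Launches the text-generation llama.cpp server in the background.
///
/// The command runs through `sh -c` with a trailing `&`, so the spawned child
/// is the shell, not `llama-server` itself, and no handle to the server
/// process is kept. With the defaults above the resulting command line looks
/// roughly like (paths are whatever the environment provides):
///
/// ```text
/// cd ~/llama.cpp && ./llama-server -m /path/to/model.gguf --host 0.0.0.0 --port 8081 --n-gpu-layers 99 &
/// ```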
async fn start_llm_server(
    llama_cpp_path: String,
    model_path: String,
    url: String,
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
    let port = url.split(':').last().unwrap_or("8081");

    let mut cmd = tokio::process::Command::new("sh");
    cmd.arg("-c").arg(format!(
        "cd {} && ./llama-server -m {} --host 0.0.0.0 --port {} --n-gpu-layers 99 &",
        llama_cpp_path, model_path, port
    ));

    cmd.spawn()?;
    Ok(())
}
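
/// Launches the embedding llama.cpp server in the background.
///
/// Identical to [`start_llm_server`] except that it passes `--embedding` and
/// falls back to port 8082 when the URL carries no explicit port.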
async fn start_embedding_server(
    llama_cpp_path: String,
    model_path: String,
    url: String,
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
    let port = url.split(':').last().unwrap_or("8082");

    let mut cmd = tokio::process::Command::new("sh");
    cmd.arg("-c").arg(format!(
        "cd {} && ./llama-server -m {} --host 0.0.0.0 --port {} --embedding --n-gpu-layers 99 &",
        llama_cpp_path, model_path, port
    ));

    cmd.spawn()?;
    Ok(())
}
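
/// Returns `true` when `GET {url}/health` answers with a 2xx status.
/// Any connection error or non-success status is treated as "not running".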
async fn is_server_running(url: &str) -> bool {
    let client = reqwest::Client::new();
    match client.get(&format!("{}/health", url)).send().await {
        Ok(response) => response.status().is_success(),
        Err(_) => false,
    }
}
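
/// Flattens an OpenAI-style message list into a single plain-text prompt.
///
/// For example, the conversation `[system: "Be brief.", user: "Hi"]` becomes:
///
/// ```text
/// System: Be brief.
///
/// User: Hi
///
/// Assistant:
/// ```
///
/// (the final line ends with a trailing space, ready for the model to continue).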
// Convert OpenAI chat messages to a single prompt
fn messages_to_prompt(messages: &[ChatMessage]) -> String {
    let mut prompt = String::new();

    for message in messages {
        match message.role.as_str() {
            "system" => {
                prompt.push_str(&format!("System: {}\n\n", message.content));
            }
            "user" => {
                prompt.push_str(&format!("User: {}\n\n", message.content));
            }
            "assistant" => {
                prompt.push_str(&format!("Assistant: {}\n\n", message.content));
            }
            _ => {
                prompt.push_str(&format!("{}: {}\n\n", message.role, message.content));
            }
        }
    }

    prompt.push_str("Assistant: ");
    prompt
}
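
/// OpenAI-compatible chat endpoint that proxies to the local llama.cpp server.
///
/// The request is translated into a single `/completion` call with fixed
/// sampling defaults (500 tokens, temperature 0.7, top_k 40, top_p 0.9) and the
/// result is wrapped back into an OpenAI `chat.completion` payload. An
/// illustrative exchange (all values are examples only):
///
/// ```text
/// POST /v1/chat/completions
/// { "model": "local-model", "messages": [{ "role": "user", "content": "Hi" }] }
///
/// -> { "id": "chatcmpl-<uuid>", "object": "chat.completion", "created": 1700000000,
///      "model": "local-model",
///      "choices": [{ "message": { "role": "assistant", "content": "Hello!" },
///                    "finish_reason": "stop" }] }
/// ```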
// Proxy endpoint
#[post("/v1/chat/completions")]
pub async fn chat_completions_local(
    req_body: web::Json<ChatCompletionRequest>,
    _req: HttpRequest,
) -> Result<HttpResponse> {
    // Load .env if present; a missing file is not an error.
    dotenv().ok();

    // Get llama.cpp server URL
    let llama_url = env::var("LLM_URL").unwrap_or_else(|_| "http://localhost:8081".to_string());

    // Convert OpenAI format to llama.cpp format
    let prompt = messages_to_prompt(&req_body.messages);

    let llama_request = LlamaCppRequest {
        prompt,
        n_predict: Some(500), // Adjust as needed
        temperature: Some(0.7),
        top_k: Some(40),
        top_p: Some(0.9),
        // The flag is forwarded, but the code below always parses the body as a
        // single JSON object, so streamed responses are not actually handled.
        stream: req_body.stream,
    };

    // Send request to llama.cpp server
    let client = Client::builder()
        .timeout(Duration::from_secs(120)) // 2 minute timeout
        .build()
        .map_err(|e| {
            eprintln!("Error creating HTTP client: {}", e);
            actix_web::error::ErrorInternalServerError("Failed to create HTTP client")
        })?;

    let response = client
        .post(&format!("{}/completion", llama_url))
        .header("Content-Type", "application/json")
        .json(&llama_request)
        .send()
        .await
        .map_err(|e| {
            eprintln!("Error calling llama.cpp server: {}", e);
            actix_web::error::ErrorInternalServerError("Failed to call llama.cpp server")
        })?;

    let status = response.status();

    if status.is_success() {
        let llama_response: LlamaCppResponse = response.json().await.map_err(|e| {
            eprintln!("Error parsing llama.cpp response: {}", e);
            actix_web::error::ErrorInternalServerError("Failed to parse llama.cpp response")
        })?;

        // Convert llama.cpp response to OpenAI format
        let openai_response = ChatCompletionResponse {
            id: format!("chatcmpl-{}", uuid::Uuid::new_v4()),
            object: "chat.completion".to_string(),
            created: std::time::SystemTime::now()
                .duration_since(std::time::UNIX_EPOCH)
                .unwrap()
                .as_secs(),
            model: req_body.model.clone(),
            choices: vec![Choice {
                message: ChatMessage {
                    role: "assistant".to_string(),
                    content: llama_response.content.trim().to_string(),
                },
                finish_reason: if llama_response.stop {
                    "stop".to_string()
                } else {
                    "length".to_string()
                },
            }],
        };

        Ok(HttpResponse::Ok().json(openai_response))
    } else {
        let error_text = response
            .text()
            .await
            .unwrap_or_else(|_| "Unknown error".to_string());

        eprintln!("Llama.cpp server error ({}): {}", status, error_text);

        let actix_status = actix_web::http::StatusCode::from_u16(status.as_u16())
            .unwrap_or(actix_web::http::StatusCode::INTERNAL_SERVER_ERROR);

        Ok(HttpResponse::build(actix_status).json(serde_json::json!({
            "error": {
                "message": error_text,
                "type": "server_error"
            }
        })))
    }
}

// OpenAI Embedding Request
#[derive(Debug, Deserialize)]
pub struct EmbeddingRequest {
    pub input: Vec<String>,
    pub model: String,
    #[serde(default)]
    pub encoding_format: Option<String>,
}

// OpenAI Embedding Response
#[derive(Debug, Serialize)]
pub struct EmbeddingResponse {
    pub object: String,
    pub data: Vec<EmbeddingData>,
    pub model: String,
    pub usage: Usage,
}

#[derive(Debug, Serialize)]
pub struct EmbeddingData {
    pub object: String,
    pub embedding: Vec<f32>,
    pub index: usize,
}

#[derive(Debug, Serialize)]
pub struct Usage {
    pub prompt_tokens: u32,
    pub total_tokens: u32,
}

// Llama.cpp Embedding Request
#[derive(Debug, Serialize)]
struct LlamaCppEmbeddingRequest {
    pub content: String,
}

// Llama.cpp Embedding Response
#[derive(Debug, Deserialize)]
struct LlamaCppEmbeddingResponse {
    pub embedding: Vec<f32>,
}
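
/// OpenAI-compatible embeddings endpoint that proxies to the local llama.cpp
/// embedding server, issuing one `/embedding` call per input string.
///
/// Token usage is only estimated (roughly 4 characters per token) because the
/// llama.cpp embedding endpoint does not report token counts. An illustrative
/// request body (field names follow `EmbeddingRequest` above):
///
/// ```text
/// POST /v1/embeddings
/// { "input": ["hello world"], "model": "local-embedding-model" }
/// ```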
// Proxy endpoint for embeddings
#[post("/v1/embeddings")]
pub async fn embeddings_local(
    req_body: web::Json<EmbeddingRequest>,
    _req: HttpRequest,
) -> Result<HttpResponse> {
    dotenv().ok();

    // Get the llama.cpp embedding server URL (the embedding server is configured
    // via EMBEDDING_URL; see ensure_llama_servers_running)
    let llama_url =
        env::var("EMBEDDING_URL").unwrap_or_else(|_| "http://localhost:8082".to_string());

    let client = Client::builder()
        .timeout(Duration::from_secs(120))
        .build()
        .map_err(|e| {
            eprintln!("Error creating HTTP client: {}", e);
            actix_web::error::ErrorInternalServerError("Failed to create HTTP client")
        })?;

    // Process each input text and get embeddings
    let mut embeddings_data = Vec::new();
    let mut total_tokens = 0;

    for (index, input_text) in req_body.input.iter().enumerate() {
        let llama_request = LlamaCppEmbeddingRequest {
            content: input_text.clone(),
        };

        let response = client
            .post(&format!("{}/embedding", llama_url))
            .header("Content-Type", "application/json")
            .json(&llama_request)
            .send()
            .await
            .map_err(|e| {
                eprintln!("Error calling llama.cpp server for embedding: {}", e);
                actix_web::error::ErrorInternalServerError(
                    "Failed to call llama.cpp server for embedding",
                )
            })?;

        let status = response.status();

        if status.is_success() {
            let llama_response: LlamaCppEmbeddingResponse = response.json().await.map_err(|e| {
                eprintln!("Error parsing llama.cpp embedding response: {}", e);
                actix_web::error::ErrorInternalServerError(
                    "Failed to parse llama.cpp embedding response",
                )
            })?;

            // Estimate token count (approximate, since llama.cpp does not return
            // a token count for embeddings)
            let estimated_tokens = (input_text.len() as f32 / 4.0).ceil() as u32;
            total_tokens += estimated_tokens;

            embeddings_data.push(EmbeddingData {
                object: "embedding".to_string(),
                embedding: llama_response.embedding,
                index,
            });
        } else {
            let error_text = response
                .text()
                .await
                .unwrap_or_else(|_| "Unknown error".to_string());

            eprintln!("Llama.cpp server error ({}): {}", status, error_text);

            let actix_status = actix_web::http::StatusCode::from_u16(status.as_u16())
                .unwrap_or(actix_web::http::StatusCode::INTERNAL_SERVER_ERROR);

            return Ok(HttpResponse::build(actix_status).json(serde_json::json!({
                "error": {
                    "message": format!("Failed to get embedding for input {}: {}", index, error_text),
                    "type": "server_error"
                }
            })));
        }
    }

    // Build OpenAI-compatible response
    let openai_response = EmbeddingResponse {
        object: "list".to_string(),
        data: embeddings_data,
        model: req_body.model.clone(),
        usage: Usage {
            prompt_tokens: total_tokens,
            total_tokens,
        },
    };

    Ok(HttpResponse::Ok().json(openai_response))
}
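
/// Liveness probe for this proxy: reports healthy only when the upstream
/// llama.cpp server at `LLM_URL` answers its own `/health` check.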
// Health check endpoint
#[actix_web::get("/health")]
pub async fn health() -> Result<HttpResponse> {
    let llama_url = env::var("LLM_URL").unwrap_or_else(|_| "http://localhost:8081".to_string());

    if is_server_running(&llama_url).await {
        Ok(HttpResponse::Ok().json(serde_json::json!({
            "status": "healthy",
            "llama_server": "running"
        })))
    } else {
        Ok(HttpResponse::ServiceUnavailable().json(serde_json::json!({
            "status": "unhealthy",
            "llama_server": "not running"
        })))
    }
}