botserver/src/llm_legacy/llm_local.rs

136 lines
3.4 KiB
Rust
Raw Normal View History

2025-10-11 12:29:03 -03:00
use dotenvy::dotenv;
use log::{error, info, warn};
use actix_web::{web, HttpResponse, Result};
2025-10-06 10:30:17 -03:00
use serde::{Deserialize, Serialize};
2025-10-11 12:29:03 -03:00
use std::process::{Command, Stdio};
use std::thread;
use std::time::Duration;
2025-10-06 10:30:17 -03:00
2025-10-11 12:29:03 -03:00
#[derive(Debug, Deserialize)]
pub struct LocalChatRequest {
pub model: String,
pub messages: Vec<ChatMessage>,
pub temperature: Option<f32>,
pub max_tokens: Option<u32>,
2025-10-06 10:30:17 -03:00
}
2025-10-11 12:29:03 -03:00
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct ChatMessage {
pub role: String,
pub content: String,
2025-10-06 10:30:17 -03:00
}
#[derive(Debug, Deserialize)]
pub struct EmbeddingRequest {
pub model: String,
2025-10-11 12:29:03 -03:00
pub input: String,
2025-10-06 10:30:17 -03:00
}
2025-10-11 12:29:03 -03:00
#[derive(Debug, Serialize)]
pub struct LocalChatResponse {
pub id: String,
pub object: String,
pub created: u64,
pub model: String,
pub choices: Vec<ChatChoice>,
pub usage: Usage,
}
2025-10-06 10:30:17 -03:00
2025-10-11 12:29:03 -03:00
#[derive(Debug, Serialize)]
pub struct ChatChoice {
pub index: u32,
pub message: ChatMessage,
pub finish_reason: Option<String>,
}
2025-10-06 10:30:17 -03:00
2025-10-11 12:29:03 -03:00
#[derive(Debug, Serialize)]
pub struct Usage {
pub prompt_tokens: u32,
pub completion_tokens: u32,
pub total_tokens: u32,
2025-10-06 10:30:17 -03:00
}
#[derive(Debug, Serialize)]
pub struct EmbeddingResponse {
pub object: String,
pub data: Vec<EmbeddingData>,
pub model: String,
pub usage: Usage,
}
#[derive(Debug, Serialize)]
pub struct EmbeddingData {
pub object: String,
pub embedding: Vec<f32>,
2025-10-11 12:29:03 -03:00
pub index: u32,
2025-10-06 10:30:17 -03:00
}
2025-10-11 12:29:03 -03:00
pub async fn ensure_llama_servers_running() -> Result<(), Box<dyn std::error::Error>> {
info!("Checking if local LLM servers are running...");
// For now, just log that we would start servers
info!("Local LLM servers would be started here");
Ok(())
2025-10-06 10:30:17 -03:00
}
2025-10-11 12:29:03 -03:00
pub async fn chat_completions_local(
payload: web::Json<LocalChatRequest>,
) -> Result<HttpResponse> {
dotenv().ok();
2025-10-06 10:30:17 -03:00
2025-10-11 12:29:03 -03:00
info!("Received local chat request for model: {}", payload.model);
// Mock response for local LLM
let response = LocalChatResponse {
id: "local-chat-123".to_string(),
object: "chat.completion".to_string(),
created: std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap()
.as_secs(),
model: payload.model.clone(),
choices: vec![ChatChoice {
index: 0,
message: ChatMessage {
role: "assistant".to_string(),
content: "This is a mock response from the local LLM. In a real implementation, this would connect to a local model like Llama or Mistral.".to_string(),
},
finish_reason: Some("stop".to_string()),
}],
usage: Usage {
prompt_tokens: 15,
completion_tokens: 25,
total_tokens: 40,
},
};
Ok(HttpResponse::Ok().json(response))
2025-10-06 10:30:17 -03:00
}
pub async fn embeddings_local(
2025-10-11 12:29:03 -03:00
payload: web::Json<EmbeddingRequest>,
2025-10-06 10:30:17 -03:00
) -> Result<HttpResponse> {
dotenv().ok();
2025-10-11 12:29:03 -03:00
info!("Received local embedding request for model: {}", payload.model);
2025-10-06 10:30:17 -03:00
2025-10-11 12:29:03 -03:00
// Mock embedding response
let response = EmbeddingResponse {
2025-10-06 10:30:17 -03:00
object: "list".to_string(),
2025-10-11 12:29:03 -03:00
data: vec![EmbeddingData {
object: "embedding".to_string(),
embedding: vec![0.1; 768], // Mock embedding vector
index: 0,
}],
model: payload.model.clone(),
2025-10-06 10:30:17 -03:00
usage: Usage {
2025-10-11 12:29:03 -03:00
prompt_tokens: 10,
completion_tokens: 0,
total_tokens: 10,
2025-10-06 10:30:17 -03:00
},
};
2025-10-11 12:29:03 -03:00
Ok(HttpResponse::Ok().json(response))
2025-10-06 10:30:17 -03:00
}