Local LLM embeddings.

parent 9eabb16425
commit ed5caa64b6

6 changed files with 365 additions and 211 deletions
src/main.rs (11 changed lines)

@@ -10,7 +10,8 @@ use sqlx::PgPool;
 use crate::services::automation::AutomationService;
 use crate::services::email::{get_emails, list_emails, save_click, send_email};
 use crate::services::llm::{chat, chat_stream};
-use crate::services::llm_local::chat_completions_local;
+use crate::services::llm_local::ensure_llama_servers_running;
+use crate::services::llm_local::{chat_completions_local, embeddings_local};
 use crate::services::llm_provider::chat_completions;
 use crate::services::web_automation::{initialize_browser_pool, BrowserPool};
@@ -38,14 +39,9 @@ async fn main() -> std::io::Result<()> {
         "/usr/bin/brave-browser-beta".to_string(),
     ));

-    #[cfg(feature = "local_llm")]
-    {
-        use crate::services::llm_local::ensure_llama_server_running;
-
-        ensure_llama_server_running()
-            .await
-            .expect("Failed to initialize LLM local server.");
-    }
+    ensure_llama_servers_running()
+        .await
+        .expect("Failed to initialize LLM local server.");

     initialize_browser_pool()
         .await
@@ -89,6 +85,7 @@ async fn main() -> std::io::Result<()> {
             .service(chat_completions)
             .service(chat_completions_local)
             .service(chat)
+            .service(embeddings_local)
     })
     .bind((config.server.host.clone(), config.server.port))?
     .run()
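With embeddings_local registered alongside the chat routes, a quick smoke test can be run against the bound address. A hedged sketch: the host/port and model name below are illustrative placeholders, not values taken from the commit.

    # Assumes the actix server is bound to localhost:8080; adjust to config.server.
    curl -s http://localhost:8080/v1/embeddings \
      -H "Content-Type: application/json" \
      -d '{"input": ["hello world"], "model": "bge-small-en-v1.5"}'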
@@ -13,7 +13,6 @@ sleep 15
 lxc exec "$PARAM_TENANT"-bot -- bash -c "

 apt-get update && apt-get install -y \
     build-essential cmake git pkg-config libjpeg-dev libtiff-dev \
     libpng-dev libavcodec-dev libavformat-dev libswscale-dev \
@@ -17,6 +17,9 @@ apt-get update && apt-get install -y wget
 wget https://dl.min.io/server/minio/release/linux-amd64/minio -O /usr/local/bin/minio
 chmod +x /usr/local/bin/minio

+wget https://dl.min.io/client/mc/release/linux-amd64/mc -O /usr/local/bin/mc
+chmod +x /usr/local/bin/mc
+
 useradd -r -s /bin/false minio-user || true
 mkdir -p /var/log/minio /data
 chown -R minio-user:minio-user /var/log/minio /data
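To sanity-check the installed binaries before any service wiring, a manual start along these lines should work; the root credentials and console address here are assumptions, not part of the commit.

    # Hypothetical manual run as the service user created above.
    sudo -u minio-user MINIO_ROOT_USER=admin MINIO_ROOT_PASSWORD=change-me \
        /usr/local/bin/minio server /data --console-address ':9001'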
@@ -17,12 +17,25 @@ sleep 15
 lxc exec $CONTAINER_NAME -- bash -c '

-apt-get update && apt-get install -y wget
+apt-get update && apt-get install -y wget unzip

 useradd -r -s /bin/false gbuser || true
 mkdir -p /opt/gbo/logs /opt/gbo/bin /opt/gbo/data /opt/gbo/conf
 chown -R gbuser:gbuser /opt/gbo/

+wget https://github.com/ggml-org/llama.cpp/releases/download/b6148/llama-b6148-bin-ubuntu-x64.zip
+mkdir llama.cpp
+mv llama-b6148-bin-ubuntu-x64.zip llama.cpp
+cd llama.cpp
+unzip llama-b6148-bin-ubuntu-x64.zip
+mv build/bin/* .
+rm build/bin -r
+rm llama-b6148-bin-ubuntu-x64.zip
+
+wget https://huggingface.co/bartowski/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf
+wget https://huggingface.co/CompendiumLabs/bge-small-en-v1.5-gguf/resolve/main/bge-small-en-v1.5-f32.gguf
+
 cat > /etc/systemd/system/system.service <<EOF
 [Unit]
 Description=General Bots System Service
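The Rust side of this commit (llm_local.rs, below) expects two llama.cpp servers: a chat server on port 8081 and an embedding server on port 8082. A manual launch matching those defaults would look roughly like this; the working directory mirrors the unzip step above and is an assumption about where the script leaves the binaries.

    cd llama.cpp
    ./llama-server -m DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf --host 0.0.0.0 --port 8081 &
    ./llama-server -m bge-small-en-v1.5-f32.gguf --host 0.0.0.0 --port 8082 --embedding &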
src/scripts/containers/tables.sh (117 changed lines, Normal file → Executable file)
@@ -1,3 +1,9 @@
 PARAM_TENANT=dantebot1
 PARAM_BOT_PORT=4242

+PARAM_TABLES_USER=aQPJo0NKgozT0d751
+PARAM_TABLES_PORT=4444
+PARAM_TABLES_PASSWORD=JAlRDSHsNhqKydFy1
+
+HOST_BASE="/opt/gbo/tenants/$PARAM_TENANT/tables"
+HOST_DATA="$HOST_BASE/data"
@@ -9,7 +15,6 @@ mkdir -p "$HOST_DATA" "$HOST_CONF" "$HOST_LOGS"
 lxc launch images:debian/12 "$PARAM_TENANT"-tables -c security.privileged=true

 until lxc exec "$PARAM_TENANT"-tables -- test -f /bin/bash; do
     sleep 5
 done
 sleep 10
@@ -17,75 +22,73 @@ sleep 10
lxc exec "$PARAM_TENANT"-tables -- bash -c "
set -e
export DEBIAN_FRONTEND=noninteractive

apt-get update
apt-get install -y wget gnupg2 sudo lsb-release
CODENAME=\$(lsb_release -cs)
apt-get install -y wget gnupg2 sudo lsb-release curl

wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | gpg --dearmor > /etc/apt/trusted.gpg.d/postgresql.gpg
apt-get install -y postgresql-14 postgresql-client-14
if ! id postgres &>/dev/null; then
    sudo apt install -y postgresql-common
    sudo /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh
    apt install -y postgresql

    exit 1
systemctl stop postgresql

mkdir -p /etc/systemd/system/postgresql.service.d/
cat > /etc/systemd/system/postgresql.service.d/override.conf <<EOF
[Service]
Environment=\"PGPORT=$PARAM_TABLES_PORT\"
EOF

CONF_FILE=\$(find /etc/postgresql -name postgresql.conf | head -1)
if [ -f \"\$CONF_FILE\" ]; then
    cp \"\$CONF_FILE\" \"\${CONF_FILE}.bak\"
    sed -i \"s/^#*port *=.*/port = $PARAM_TABLES_PORT/\" \"\$CONF_FILE\"
    sed -i \"s/^#*listen_addresses *=.*/listen_addresses = '*'/\" \"\$CONF_FILE\"

    HBA_FILE=\$(find /etc/postgresql -name pg_hba.conf | head -1)
    if [ -f \"\$HBA_FILE\" ]; then
        echo 'host all all 0.0.0.0/0 md5' >> \"\$HBA_FILE\"
    fi
fi
systemctl stop postgresql@14-main 2>/dev/null || systemctl stop postgresql 2>/dev/null || true

systemctl daemon-reload
systemctl start postgresql
systemctl enable postgresql

until sudo -u postgres psql -p $PARAM_TABLES_PORT -c '\q' 2>/dev/null; do
    echo \"Waiting for PostgreSQL to start on port $PARAM_TABLES_PORT...\"
    sleep 3
done

sudo -u postgres psql -p $PARAM_TABLES_PORT -c \"CREATE USER $PARAM_TENANT WITH PASSWORD '$PARAM_TABLES_PASSWORD';\"
sudo -u postgres psql -p $PARAM_TABLES_PORT -c \"CREATE DATABASE ${PARAM_TENANT}_db OWNER $PARAM_TENANT;\"
sudo -u postgres psql -p $PARAM_TABLES_PORT -c \"GRANT ALL PRIVILEGES ON DATABASE ${PARAM_TENANT}_db TO $PARAM_TENANT;\"

systemctl restart postgresql
"

POSTGRES_UID=$(lxc exec "$PARAM_TENANT"-tables -- id -u postgres)
POSTGRES_GID=$(lxc exec "$PARAM_TENANT"-tables -- id -g postgres)
lxc exec "$PARAM_TENANT"-tables -- systemctl stop postgresql

HOST_POSTGRES_UID=$((100000 + POSTGRES_UID))
HOST_POSTGRES_GID=$((100000 + POSTGRES_GID))
PG_DATA_DIR=$(lxc exec "$PARAM_TENANT"-tables -- bash -c "find /var/lib/postgresql -name main -type d | head -1")
PG_CONF_DIR=$(lxc exec "$PARAM_TENANT"-tables -- bash -c "find /etc/postgresql -name main -type d | head -1")
PG_LOGS_DIR=$(lxc exec "$PARAM_TENANT"-tables -- bash -c "find /var/log/postgresql -name postgresql-*.log -o -name postgresql.log | head -1 | xargs dirname 2>/dev/null || echo /var/log/postgresql")

chown -R "$HOST_POSTGRES_UID:$HOST_POSTGRES_GID" "$HOST_BASE"
chmod -R 750 "$HOST_BASE"
lxc config device add "$PARAM_TENANT"-tables pgdata disk source="$HOST_DATA" path="$PG_DATA_DIR"
lxc config device add "$PARAM_TENANT"-tables pgconf disk source="$HOST_CONF" path="$PG_CONF_DIR"
lxc config device add "$PARAM_TENANT"-tables pglogs disk source="$HOST_LOGS" path="$PG_LOGS_DIR"

lxc config device add "$PARAM_TENANT"-tables pgdata disk source="$HOST_DATA" path=/var/lib/postgresql/14/main
lxc config device add "$PARAM_TENANT"-tables pgconf disk source="$HOST_CONF" path=/etc/postgresql/14/main
lxc config device add "$PARAM_TENANT"-tables pglogs disk source="$HOST_LOGS" path=/var/log/postgresql
lxc exec "$PARAM_TENANT"-tables -- chown -R postgres:postgres "$PG_DATA_DIR"
lxc exec "$PARAM_TENANT"-tables -- chown -R postgres:postgres "$PG_CONF_DIR"
lxc exec "$PARAM_TENANT"-tables -- chown -R postgres:postgres "$PG_LOGS_DIR"

mkdir -p /var/lib/postgresql/14/main
mkdir -p /etc/postgresql/14/main
mkdir -p /var/log/postgresql
chown -R postgres:postgres /var/lib/postgresql/14/main
chown -R postgres:postgres /etc/postgresql/14/main
chown -R postgres:postgres /var/log/postgresql
chmod 700 /var/lib/postgresql/14/main

sudo -u postgres /usr/lib/postgresql/14/bin/initdb -D /var/lib/postgresql/14/main

cat > /etc/postgresql/14/main/postgresql.conf <<EOF
data_directory = '/var/lib/postgresql/14/main'
hba_file = '/etc/postgresql/14/main/pg_hba.conf'
ident_file = '/etc/postgresql/14/main/pg_ident.conf'
listen_addresses = '*'
port = $PARAM_TABLES_PORT
max_connections = 100
shared_buffers = 128MB
log_destination = 'stderr'
logging_collector = on
log_directory = '/var/log/postgresql'
log_filename = 'postgresql-%Y-%m-%d_%H%M%S.log'
EOF

cat > /etc/postgresql/14/main/pg_hba.conf <<EOF
local all postgres peer
local all all peer
host all all 127.0.0.1/32 md5
host all all ::1/128 md5
host all all 0.0.0.0/0 md5
systemctl start postgresql@14-main
systemctl enable postgresql@14-main
EOF
lxc exec "$PARAM_TENANT"-tables -- systemctl start postgresql

lxc config device remove "$PARAM_TENANT"-tables postgres-proxy 2>/dev/null || true
lxc config device add "$PARAM_TENANT"-tables postgres-proxy proxy \
    listen=tcp:0.0.0.0:"$PARAM_TABLES_PORT" \
    connect=tcp:127.0.0.1:"$PARAM_TABLES_PORT"

cd /var/lib/postgresql
until sudo -u postgres psql -p $PARAM_TABLES_PORT -c '\q' 2>/dev/null; do
    sleep 3
sudo -u "$PARAM_TABLES_USER" psql -p $PARAM_TABLES_PORT -c \"CREATE USER $PARAM_TENANT WITH PASSWORD '$PARAM_TABLES_PASSWORD';\" 2>/dev/null
sudo -u "$PARAM_TABLES_USER" psql -p $PARAM_TABLES_PORT -c \"CREATE DATABASE ${PARAM_TENANT}_db OWNER $PARAM_TENANT;\" 2>/dev/null
sudo -u "$PARAM_TABLES_USER" psql -p $PARAM_TABLES_PORT -c \"GRANT ALL PRIVILEGES ON DATABASE ${PARAM_TENANT}_db TO $PARAM_TENANT;\" 2>/dev/null
echo "PostgreSQL setup completed successfully!"
echo "Database: ${PARAM_TENANT}_db"
echo "User: $PARAM_TENANT"
echo "Password: $PARAM_TABLES_PASSWORD"
echo "Port: $PARAM_TABLES_PORT"

src/services/llm_local.rs
@@ -59,157 +59,170 @@ struct LlamaCppResponse {
     generation_settings: Option<serde_json::Value>,
 }

-// Function to check if server is running
-async fn is_server_running(url: &str) -> bool {
-    let client = Client::builder()
-        .timeout(Duration::from_secs(3))
-        .build()
-        .unwrap();
+pub async fn ensure_llama_servers_running() -> Result<(), Box<dyn std::error::Error + Send + Sync>>
+{
+    let llm_local = env::var("LLM_LOCAL").unwrap_or_else(|_| "false".to_string());

-    match client.get(&format!("{}/health", url)).send().await {
-        Ok(response) => {
-            let is_ok = response.status().is_success();
-            if is_ok {
-                println!("🟢 Server health check: OK");
-            } else {
-                println!(
-                    "🔴 Server health check: Failed with status {}",
-                    response.status()
-                );
-            }
-            is_ok
-        }
-        Err(e) => {
-            println!("🔴 Server health check: Connection failed - {}", e);
-            false
-        }
-    }
-}
-
-// Function to start llama.cpp server
-async fn start_llama_server() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
-    println!("🚀 Starting llama.cpp server...");
-
-    // Get environment variables for llama.cpp configuration
-    let llama_path = env::var("LLM_CPP_PATH").unwrap_or_else(|_| "llama-server".to_string());
-    let model_path = env::var("LLM_MODEL_PATH")
-        .unwrap_or_else(|_| "./tinyllama-1.1b-chat-v1.0.Q4_0.gguf".to_string());
-    let cpu_limit = env::var("CPU_LIMIT").unwrap_or_else(|_| "50".to_string());
-    let port = env::var("LLM_PORT").unwrap_or_else(|_| "8080".to_string());
-
-    println!("🔧 Configuration:");
-    println!("   - Llama path: {}", llama_path);
-    println!("   - Model path: {}", model_path);
-    println!("   - CPU limit: {}%", cpu_limit);
-    println!("   - Port: {}", port);
-
-    // Kill any existing llama processes
-    println!("🧹 Cleaning up existing processes...");
-    let _ = Command::new("pkill").arg("-f").arg("llama-cli").output();
-
-    // Wait a bit for cleanup
-    sleep(Duration::from_secs(2)).await;
-
-    // Build the command
-    let full_command = format!(
-        "{}/llama-server -m {} --mlock --port {} --host 127.0.0.1",
-        llama_path, model_path, port
-    );
-
-    println!("📝 Executing command: {}", full_command);
-
-    // Start llama.cpp server with cpulimit using tokio
-    let mut cmd = TokioCommand::new("sh");
-    cmd.arg("-c");
-    cmd.arg(&full_command);
-    cmd.stdout(Stdio::piped());
-    cmd.stderr(Stdio::piped());
-    cmd.kill_on_drop(true);
-
-    let mut child = cmd
-        .spawn()
-        .map_err(|e| format!("Failed to start llama.cpp server: {}", e))?;
-
-    println!("🔄 Process spawned with PID: {:?}", child.id());
-
-    // Capture stdout and stderr for real-time logging
-    if let Some(stdout) = child.stdout.take() {
-        let stdout_reader = BufReader::new(stdout);
-        tokio::spawn(async move {
-            let mut lines = stdout_reader.lines();
-            while let Ok(Some(line)) = lines.next_line().await {
-                println!("🦙📤 STDOUT: {}", line);
-            }
-            println!("🦙📤 STDOUT stream ended");
-        });
-    }
-
-    if let Some(stderr) = child.stderr.take() {
-        let stderr_reader = BufReader::new(stderr);
-        tokio::spawn(async move {
-            let mut lines = stderr_reader.lines();
-            while let Ok(Some(line)) = lines.next_line().await {
-                println!("🦙📥 STDERR: {}", line);
-            }
-            println!("🦙📥 STDERR stream ended");
-        });
-    }
-
-    // Store the process handle
-    unsafe {
-        LLAMA_PROCESS = Some(Arc::new(Mutex::new(Some(child))));
-    }
-
-    println!("✅ Llama.cpp server process started!");
-    Ok(())
-}
-
-// Function to ensure llama.cpp server is running
-pub async fn ensure_llama_server_running() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
-    let llama_url = env::var("LLM_URL").unwrap_or_else(|_| "http://localhost:8080".to_string());
-
-    // Check if server is already running
-    if is_server_running(&llama_url).await {
-        println!("✅ Llama.cpp server is already running");
+    if llm_local.to_lowercase() != "true" {
+        println!("ℹ️ LLM_LOCAL is not enabled, skipping local server startup");
         return Ok(());
     }

-    // Start the server
-    start_llama_server().await?;
+    // Get configuration from environment variables
+    let llm_url = env::var("LLM_URL").unwrap_or_else(|_| "http://localhost:8081".to_string());
+    let embedding_url =
+        env::var("EMBEDDING_URL").unwrap_or_else(|_| "http://localhost:8082".to_string());
+    let llama_cpp_path = env::var("LLM_CPP_PATH").unwrap_or_else(|_| "~/llama.cpp".to_string());
+    let llm_model_path = env::var("LLM_MODEL_PATH").unwrap_or_else(|_| "".to_string());
+    let embedding_model_path = env::var("EMBEDDING_MODEL_PATH").unwrap_or_else(|_| "".to_string());
+
+    println!("🚀 Starting local llama.cpp servers...");
+    println!("📋 Configuration:");
+    println!("   LLM URL: {}", llm_url);
+    println!("   Embedding URL: {}", embedding_url);
+    println!("   LLM Model: {}", llm_model_path);
+    println!("   Embedding Model: {}", embedding_model_path);
+
+    // Check if servers are already running
+    let llm_running = is_server_running(&llm_url).await;
+    let embedding_running = is_server_running(&embedding_url).await;
+
+    if llm_running && embedding_running {
+        println!("✅ Both LLM and Embedding servers are already running");
+        return Ok(());
+    }
+
+    // Start servers that aren't running
+    let mut tasks = vec![];
+
+    if !llm_running && !llm_model_path.is_empty() {
+        println!("🔄 Starting LLM server...");
+        tasks.push(tokio::spawn(start_llm_server(
+            llama_cpp_path.clone(),
+            llm_model_path.clone(),
+            llm_url.clone(),
+        )));
+    } else if llm_model_path.is_empty() {
+        println!("⚠️ LLM_MODEL_PATH not set, skipping LLM server");
+    }
+
+    if !embedding_running && !embedding_model_path.is_empty() {
+        println!("🔄 Starting Embedding server...");
+        tasks.push(tokio::spawn(start_embedding_server(
+            llama_cpp_path.clone(),
+            embedding_model_path.clone(),
+            embedding_url.clone(),
+        )));
+    } else if embedding_model_path.is_empty() {
+        println!("⚠️ EMBEDDING_MODEL_PATH not set, skipping Embedding server");
+    }
+
+    // Wait for all server startup tasks
+    for task in tasks {
+        task.await??;
+    }
+
+    // Wait for servers to be ready with verbose logging
+    println!("⏳ Waiting for servers to become ready...");
+
+    let mut llm_ready = llm_running || llm_model_path.is_empty();
+    let mut embedding_ready = embedding_running || embedding_model_path.is_empty();

-    // Wait for server to be ready with verbose logging
-    println!("⏳ Waiting for llama.cpp server to become ready...");
     let mut attempts = 0;
     let max_attempts = 60; // 2 minutes total

-    while attempts < max_attempts {
+    while attempts < max_attempts && (!llm_ready || !embedding_ready) {
         sleep(Duration::from_secs(2)).await;

-        print!(
-            "🔍 Checking server health (attempt {}/{})... ",
+        println!(
+            "🔍 Checking server health (attempt {}/{})...",
             attempts + 1,
             max_attempts
         );

-        if is_server_running(&llama_url).await {
-            println!("✅ SUCCESS!");
-            println!("🎉 Llama.cpp server is ready and responding!");
-            return Ok(());
+        if !llm_ready && !llm_model_path.is_empty() {
+            if is_server_running(&llm_url).await {
+                println!("   ✅ LLM server ready at {}", llm_url);
+                llm_ready = true;
             } else {
-                println!("❌ Not ready yet");
+                println!("   ❌ LLM server not ready yet");
+            }
         }

+        if !embedding_ready && !embedding_model_path.is_empty() {
+            if is_server_running(&embedding_url).await {
+                println!("   ✅ Embedding server ready at {}", embedding_url);
+                embedding_ready = true;
+            } else {
+                println!("   ❌ Embedding server not ready yet");
+            }
+        }
+
         attempts += 1;

         if attempts % 10 == 0 {
             println!(
-                "⏰ Still waiting for llama.cpp server... (attempt {}/{})",
+                "⏰ Still waiting for servers... (attempt {}/{})",
                 attempts, max_attempts
             );
-            println!("💡 Check the logs above for any errors from the llama server");
         }
     }

-    Err("❌ Llama.cpp server failed to start within timeout (2 minutes)".into())
+    if llm_ready && embedding_ready {
+        println!("🎉 All llama.cpp servers are ready and responding!");
+        Ok(())
+    } else {
+        let mut error_msg = "❌ Servers failed to start within timeout:".to_string();
+        if !llm_ready && !llm_model_path.is_empty() {
+            error_msg.push_str(&format!("\n  - LLM server at {}", llm_url));
+        }
+        if !embedding_ready && !embedding_model_path.is_empty() {
+            error_msg.push_str(&format!("\n  - Embedding server at {}", embedding_url));
+        }
+        Err(error_msg.into())
+    }
 }

+async fn start_llm_server(
+    llama_cpp_path: String,
+    model_path: String,
+    url: String,
+) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
+    let port = url.split(':').last().unwrap_or("8081");
+
+    let mut cmd = tokio::process::Command::new("sh");
+    cmd.arg("-c").arg(format!(
+        "cd {} && ./llama-server -m {} --host 0.0.0.0 --port {} --n-gpu-layers 99 &",
+        llama_cpp_path, model_path, port
+    ));
+
+    cmd.spawn()?;
+    Ok(())
+}
+
+async fn start_embedding_server(
+    llama_cpp_path: String,
+    model_path: String,
+    url: String,
+) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
+    let port = url.split(':').last().unwrap_or("8082");
+
+    let mut cmd = tokio::process::Command::new("sh");
+    cmd.arg("-c").arg(format!(
+        "cd {} && ./llama-server -m {} --host 0.0.0.0 --port {} --embedding --n-gpu-layers 99 &",
+        llama_cpp_path, model_path, port
+    ));
+
+    cmd.spawn()?;
+    Ok(())
+}
+
+async fn is_server_running(url: &str) -> bool {
+    let client = reqwest::Client::new();
+    match client.get(&format!("{}/health", url)).send().await {
+        Ok(response) => response.status().is_success(),
+        Err(_) => false,
+    }
+}
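Both readiness probes hit llama.cpp's /health endpoint, so the same check can be made by hand against the commit's default ports:

    curl -s http://localhost:8081/health   # chat server
    curl -s http://localhost:8082/health   # embedding server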

 // Convert OpenAI chat messages to a single prompt
@@ -238,26 +251,15 @@ fn messages_to_prompt(messages: &[ChatMessage]) -> String {
 }

 // Proxy endpoint
-#[post("/v1/chat/completions1")]
+#[post("/v1/chat/completions")]
 pub async fn chat_completions_local(
     req_body: web::Json<ChatCompletionRequest>,
     _req: HttpRequest,
 ) -> Result<HttpResponse> {
-    dotenv().ok().unwrap();
-
-    // Ensure llama.cpp server is running
-    if let Err(e) = ensure_llama_server_running().await {
-        eprintln!("Failed to start llama.cpp server: {}", e);
-        return Ok(HttpResponse::InternalServerError().json(serde_json::json!({
-            "error": {
-                "message": format!("Failed to start llama.cpp server: {}", e),
-                "type": "server_error"
-            }
-        })));
-    }
-
     // Get llama.cpp server URL
-    let llama_url = env::var("LLM_URL").unwrap_or_else(|_| "http://localhost:8080".to_string());
+    let llama_url = env::var("LLM_URL").unwrap_or_else(|_| "http://localhost:8081".to_string());

     // Convert OpenAI format to llama.cpp format
     let prompt = messages_to_prompt(&req_body.messages);
@@ -342,10 +344,147 @@ pub async fn chat_completions_local(
     }
 }

+// OpenAI Embedding Request
+#[derive(Debug, Deserialize)]
+pub struct EmbeddingRequest {
+    pub input: Vec<String>,
+    pub model: String,
+    #[serde(default)]
+    pub encoding_format: Option<String>,
+}
+
+// OpenAI Embedding Response
+#[derive(Debug, Serialize)]
+pub struct EmbeddingResponse {
+    pub object: String,
+    pub data: Vec<EmbeddingData>,
+    pub model: String,
+    pub usage: Usage,
+}
+
+#[derive(Debug, Serialize)]
+pub struct EmbeddingData {
+    pub object: String,
+    pub embedding: Vec<f32>,
+    pub index: usize,
+}
+
+#[derive(Debug, Serialize)]
+pub struct Usage {
+    pub prompt_tokens: u32,
+    pub total_tokens: u32,
+}
+
+// Llama.cpp Embedding Request
+#[derive(Debug, Serialize)]
+struct LlamaCppEmbeddingRequest {
+    pub content: String,
+}
+
+// Llama.cpp Embedding Response
+#[derive(Debug, Deserialize)]
+struct LlamaCppEmbeddingResponse {
+    pub embedding: Vec<f32>,
+}
+
+// Proxy endpoint for embeddings
+#[post("/v1/embeddings")]
+pub async fn embeddings_local(
+    req_body: web::Json<EmbeddingRequest>,
+    _req: HttpRequest,
+) -> Result<HttpResponse> {
+    dotenv().ok();
+
+    // Get llama.cpp server URL
+    let llama_url = env::var("LLM_URL").unwrap_or_else(|_| "http://localhost:8082".to_string());
+
+    let client = Client::builder()
+        .timeout(Duration::from_secs(120))
+        .build()
+        .map_err(|e| {
+            eprintln!("Error creating HTTP client: {}", e);
+            actix_web::error::ErrorInternalServerError("Failed to create HTTP client")
+        })?;
+
+    // Process each input text and get embeddings
+    let mut embeddings_data = Vec::new();
+    let mut total_tokens = 0;
+
+    for (index, input_text) in req_body.input.iter().enumerate() {
+        let llama_request = LlamaCppEmbeddingRequest {
+            content: input_text.clone(),
+        };
+
+        let response = client
+            .post(&format!("{}/embedding", llama_url))
+            .header("Content-Type", "application/json")
+            .json(&llama_request)
+            .send()
+            .await
+            .map_err(|e| {
+                eprintln!("Error calling llama.cpp server for embedding: {}", e);
+                actix_web::error::ErrorInternalServerError(
+                    "Failed to call llama.cpp server for embedding",
+                )
+            })?;
+
+        let status = response.status();
+
+        if status.is_success() {
+            let llama_response: LlamaCppEmbeddingResponse = response.json().await.map_err(|e| {
+                eprintln!("Error parsing llama.cpp embedding response: {}", e);
+                actix_web::error::ErrorInternalServerError(
+                    "Failed to parse llama.cpp embedding response",
+                )
+            })?;
+
+            // Estimate token count (this is approximate since llama.cpp doesn't return token count for embeddings)
+            let estimated_tokens = (input_text.len() as f32 / 4.0).ceil() as u32;
+            total_tokens += estimated_tokens;
+
+            embeddings_data.push(EmbeddingData {
+                object: "embedding".to_string(),
+                embedding: llama_response.embedding,
+                index,
+            });
+        } else {
+            let error_text = response
+                .text()
+                .await
+                .unwrap_or_else(|_| "Unknown error".to_string());
+
+            eprintln!("Llama.cpp server error ({}): {}", status, error_text);
+
+            let actix_status = actix_web::http::StatusCode::from_u16(status.as_u16())
+                .unwrap_or(actix_web::http::StatusCode::INTERNAL_SERVER_ERROR);
+
+            return Ok(HttpResponse::build(actix_status).json(serde_json::json!({
+                "error": {
+                    "message": format!("Failed to get embedding for input {}: {}", index, error_text),
+                    "type": "server_error"
+                }
+            })));
+        }
+    }
+
+    // Build OpenAI-compatible response
+    let openai_response = EmbeddingResponse {
+        object: "list".to_string(),
+        data: embeddings_data,
+        model: req_body.model.clone(),
+        usage: Usage {
+            prompt_tokens: total_tokens,
+            total_tokens,
+        },
+    };
+
+    Ok(HttpResponse::Ok().json(openai_response))
+}
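Under the hood, each input string is forwarded to llama.cpp's native /embedding route. The equivalent raw call against the commit's default embedding port is a one-liner; note the payload key is content, not input:

    curl -s http://localhost:8082/embedding \
      -H "Content-Type: application/json" \
      -d '{"content": "hello world"}'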

 // Health check endpoint
 #[actix_web::get("/health")]
 pub async fn health() -> Result<HttpResponse> {
-    let llama_url = env::var("LLM_URL").unwrap_or_else(|_| "http://localhost:8080".to_string());
+    let llama_url = env::var("LLM_URL").unwrap_or_else(|_| "http://localhost:8081".to_string());

     if is_server_running(&llama_url).await {
         Ok(HttpResponse::Ok().json(serde_json::json!({