From 051c8f720c6c1fcebf1bc392007169ccd55d89fd Mon Sep 17 00:00:00 2001 From: "Rodrigo Rodriguez (Pragmatismo)" Date: Wed, 10 Dec 2025 08:43:27 -0300 Subject: [PATCH] fix(llm): Compile llama.cpp from source for CPU compatibility Instead of downloading pre-built binaries (which may require AVX2), compile llama.cpp from source during installation. This ensures: - Works on older CPUs without AVX2 (Sandy Bridge, Ivy Bridge, etc.) - Uses GGML_NATIVE=ON to optimize for the current CPU - Binary path updated to build/bin/llama-server Reverts the AVX2 detection that was incorrectly disabling LLM. --- src/core/package_manager/installer.rs | 73 +++++---------------------- src/llm/local.rs | 46 ----------------- 2 files changed, 14 insertions(+), 105 deletions(-) diff --git a/src/core/package_manager/installer.rs b/src/core/package_manager/installer.rs index de68c0d83..f0140b270 100644 --- a/src/core/package_manager/installer.rs +++ b/src/core/package_manager/installer.rs @@ -6,24 +6,6 @@ use log::{info, trace, warn}; use std::collections::HashMap; use std::path::PathBuf; -/// Check if the CPU supports AVX2 instructions (required for pre-built llama.cpp binaries) -fn cpu_supports_avx2() -> bool { - #[cfg(target_arch = "x86_64")] - { - // Read /proc/cpuinfo on Linux to check for avx2 flag - if let Ok(cpuinfo) = std::fs::read_to_string("/proc/cpuinfo") { - return cpuinfo.contains(" avx2 ") || cpuinfo.contains(" avx2\n"); - } - // Fallback: assume AVX2 is not available if we can't read cpuinfo - false - } - #[cfg(not(target_arch = "x86_64"))] - { - // Non-x86_64 architectures (ARM, etc.) don't use AVX2 - false - } -} - #[derive(Debug)] pub struct PackageManager { pub mode: InstallMode, @@ -217,43 +199,8 @@ impl PackageManager { } fn register_llm(&mut self) { - // Check CPU capabilities - pre-built llama.cpp binaries require AVX2 - let has_avx2 = cpu_supports_avx2(); - - if !has_avx2 { - warn!("CPU does not support AVX2 instructions. 
Local LLM will not be available."); - warn!("To use local LLM on this CPU, you need to compile llama.cpp from source."); - warn!( - "Alternatively, configure an external LLM API (OpenAI, Anthropic, etc.) in Vault." - ); - // Register a disabled LLM component that won't download or run anything - self.components.insert( - "llm".to_string(), - ComponentConfig { - name: "llm".to_string(), - ports: vec![8081, 8082], - dependencies: vec![], - linux_packages: vec![], - macos_packages: vec![], - windows_packages: vec![], - download_url: None, // Don't download - CPU not compatible - binary_name: None, - pre_install_cmds_linux: vec![], - post_install_cmds_linux: vec![], - pre_install_cmds_macos: vec![], - post_install_cmds_macos: vec![], - pre_install_cmds_windows: vec![], - post_install_cmds_windows: vec![], - env_vars: HashMap::new(), - data_download_list: vec![], // Don't download models - exec_cmd: "echo 'LLM disabled - CPU does not support AVX2'".to_string(), - check_cmd: "false".to_string(), // Always fail check - LLM not available - }, - ); - return; - } - - info!("CPU supports AVX2 - local LLM will be available"); + // llama.cpp is compiled from source for maximum CPU compatibility + // This ensures it works on older CPUs (Sandy Bridge, etc.) 
without AVX2 self.components.insert( "llm".to_string(), ComponentConfig { @@ -265,11 +212,19 @@ impl PackageManager { macos_packages: vec![], windows_packages: vec![], download_url: Some( - "https://github.com/ggml-org/llama.cpp/releases/download/b6148/llama-b6148-bin-ubuntu-x64.zip".to_string(), + "https://github.com/ggml-org/llama.cpp/archive/refs/tags/b4967.zip".to_string(), ), binary_name: Some("llama-server".to_string()), - pre_install_cmds_linux: vec![], - post_install_cmds_linux: vec![], + pre_install_cmds_linux: vec![ + // Install build dependencies + "which cmake >/dev/null 2>&1 || (sudo apt-get update && sudo apt-get install -y cmake build-essential)".to_string(), + ], + post_install_cmds_linux: vec![ + // Compile llama.cpp from source for this CPU's instruction set + "cd {{BIN_PATH}} && if [ -d llama.cpp-b4967 ]; then mv llama.cpp-b4967/* . && rmdir llama.cpp-b4967; fi".to_string(), + "cd {{BIN_PATH}} && mkdir -p build && cd build && cmake .. -DGGML_NATIVE=ON -DGGML_CPU_ALL_VARIANTS=OFF && cmake --build . 
--config Release -j$(nproc)".to_string(), + "echo 'llama.cpp compiled successfully for this CPU'".to_string(), + ], pre_install_cmds_macos: vec![], post_install_cmds_macos: vec![], pre_install_cmds_windows: vec![], @@ -283,7 +238,7 @@ impl PackageManager { // GPT-OSS 20B F16 - Recommended for small GPU (16GB VRAM), no CPU // Uncomment to download: "https://huggingface.co/unsloth/gpt-oss-20b-GGUF/resolve/main/gpt-oss-20b-F16.gguf".to_string(), ], - exec_cmd: "nohup {{BIN_PATH}}/llama-server --port 8081 --ssl-key-file {{CONF_PATH}}/system/certificates/llm/server.key --ssl-cert-file {{CONF_PATH}}/system/certificates/llm/server.crt -m {{DATA_PATH}}/DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf > {{LOGS_PATH}}/llm.log 2>&1 & nohup {{BIN_PATH}}/llama-server --port 8082 --ssl-key-file {{CONF_PATH}}/system/certificates/embedding/server.key --ssl-cert-file {{CONF_PATH}}/system/certificates/embedding/server.crt -m {{DATA_PATH}}/bge-small-en-v1.5-f32.gguf --embedding > {{LOGS_PATH}}/embedding.log 2>&1 &".to_string(), + exec_cmd: "nohup {{BIN_PATH}}/build/bin/llama-server --port 8081 --ssl-key-file {{CONF_PATH}}/system/certificates/llm/server.key --ssl-cert-file {{CONF_PATH}}/system/certificates/llm/server.crt -m {{DATA_PATH}}/DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf > {{LOGS_PATH}}/llm.log 2>&1 & nohup {{BIN_PATH}}/build/bin/llama-server --port 8082 --ssl-key-file {{CONF_PATH}}/system/certificates/embedding/server.key --ssl-cert-file {{CONF_PATH}}/system/certificates/embedding/server.crt -m {{DATA_PATH}}/bge-small-en-v1.5-f32.gguf --embedding > {{LOGS_PATH}}/embedding.log 2>&1 &".to_string(), check_cmd: "curl -f -k https://localhost:8081/health >/dev/null 2>&1 && curl -f -k https://localhost:8082/health >/dev/null 2>&1".to_string(), }, ); diff --git a/src/llm/local.rs b/src/llm/local.rs index 8d26ef4cf..b8750aa5a 100644 --- a/src/llm/local.rs +++ b/src/llm/local.rs @@ -4,7 +4,6 @@ use crate::shared::state::AppState; use diesel::prelude::*; use log::{error, info, warn}; use 
reqwest; -use std::path::Path; use std::sync::Arc; use tokio; @@ -69,51 +68,6 @@ pub async fn ensure_llama_servers_running( info!(" LLM Model: {}", llm_model); info!(" Embedding Model: {}", embedding_model); info!(" LLM Server Path: {}", llm_server_path); - - // Check if llama-server binary exists - let llama_server_path = if llm_server_path.is_empty() { - "./botserver-stack/bin/llm/build/bin/llama-server".to_string() - } else { - format!("{}/llama-server", llm_server_path) - }; - - if !Path::new(&llama_server_path).exists() { - warn!("llama-server binary not found at: {}", llama_server_path); - warn!("Local LLM server will not be available."); - warn!("This may be because:"); - warn!(" 1. The LLM component was not installed (check if CPU supports AVX2)"); - warn!(" 2. The binary path is incorrect"); - warn!("Continuing without local LLM - use external LLM API instead."); - return Ok(()); - } - - // Test if the binary can actually run (check for illegal instruction) - info!("Testing llama-server binary compatibility..."); - let test_result = std::process::Command::new(&llama_server_path) - .arg("--version") - .output(); - - match test_result { - Ok(output) => { - if !output.status.success() { - let stderr = String::from_utf8_lossy(&output.stderr); - warn!("llama-server test failed: {}", stderr); - if stderr.contains("Illegal instruction") { - error!("CPU does not support required instructions (AVX2) for llama-server"); - error!("Your CPU: Check /proc/cpuinfo for 'avx2' flag"); - error!("Options:"); - error!(" 1. Compile llama.cpp from source with your CPU's instruction set"); - error!(" 2. Use an external LLM API (OpenAI, Anthropic, etc.)"); - return Ok(()); - } - } - } - Err(e) => { - warn!("Failed to test llama-server: {}", e); - // Continue anyway - might work at runtime - } - } - info!("Restarting any existing llama-server processes..."); if let Err(e) = tokio::process::Command::new("sh")