From 051c8f720c6c1fcebf1bc392007169ccd55d89fd Mon Sep 17 00:00:00 2001
From: "Rodrigo Rodriguez (Pragmatismo)" <me@rodrigorodriguez.com>
Date: Wed, 10 Dec 2025 08:43:27 -0300
Subject: [PATCH] fix(llm): Compile llama.cpp from source for CPU compatibility

Instead of downloading pre-built binaries (which may require AVX2),
compile llama.cpp from source during installation. This ensures:
- Works on older CPUs without AVX2 (Sandy Bridge, Ivy Bridge, etc.)
- Uses GGML_NATIVE=ON to optimize for the current CPU
- Binary path updated to build/bin/llama-server

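Reverts the AVX2 detection that was incorrectly disabling the local LLM,
including the llama-server binary existence and compatibility checks
performed at startup in src/llm/local.rs.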
---
 src/core/package_manager/installer.rs | 73 +++++----------------------
 src/llm/local.rs                      | 46 -----------------
 2 files changed, 14 insertions(+), 105 deletions(-)

diff --git a/src/core/package_manager/installer.rs b/src/core/package_manager/installer.rs
index de68c0d83..f0140b270 100644
--- a/src/core/package_manager/installer.rs
+++ b/src/core/package_manager/installer.rs
@@ -6,24 +6,6 @@ use log::{info, trace, warn};
 use std::collections::HashMap;
 use std::path::PathBuf;
 
-/// Check if the CPU supports AVX2 instructions (required for pre-built llama.cpp binaries)
-fn cpu_supports_avx2() -> bool {
-    #[cfg(target_arch = "x86_64")]
-    {
-        // Read /proc/cpuinfo on Linux to check for avx2 flag
-        if let Ok(cpuinfo) = std::fs::read_to_string("/proc/cpuinfo") {
-            return cpuinfo.contains(" avx2 ") || cpuinfo.contains(" avx2\n");
-        }
-        // Fallback: assume AVX2 is not available if we can't read cpuinfo
-        false
-    }
-    #[cfg(not(target_arch = "x86_64"))]
-    {
-        // Non-x86_64 architectures (ARM, etc.) don't use AVX2
-        false
-    }
-}
-
 #[derive(Debug)]
 pub struct PackageManager {
     pub mode: InstallMode,
@@ -217,43 +199,8 @@ impl PackageManager {
     }
 
     fn register_llm(&mut self) {
-        // Check CPU capabilities - pre-built llama.cpp binaries require AVX2
-        let has_avx2 = cpu_supports_avx2();
-
-        if !has_avx2 {
-            warn!("CPU does not support AVX2 instructions. Local LLM will not be available.");
-            warn!("To use local LLM on this CPU, you need to compile llama.cpp from source.");
-            warn!(
-                "Alternatively, configure an external LLM API (OpenAI, Anthropic, etc.) in Vault."
-            );
-            // Register a disabled LLM component that won't download or run anything
-            self.components.insert(
-                "llm".to_string(),
-                ComponentConfig {
-                    name: "llm".to_string(),
-                    ports: vec![8081, 8082],
-                    dependencies: vec![],
-                    linux_packages: vec![],
-                    macos_packages: vec![],
-                    windows_packages: vec![],
-                    download_url: None, // Don't download - CPU not compatible
-                    binary_name: None,
-                    pre_install_cmds_linux: vec![],
-                    post_install_cmds_linux: vec![],
-                    pre_install_cmds_macos: vec![],
-                    post_install_cmds_macos: vec![],
-                    pre_install_cmds_windows: vec![],
-                    post_install_cmds_windows: vec![],
-                    env_vars: HashMap::new(),
-                    data_download_list: vec![], // Don't download models
-                    exec_cmd: "echo 'LLM disabled - CPU does not support AVX2'".to_string(),
-                    check_cmd: "false".to_string(), // Always fail check - LLM not available
-                },
-            );
-            return;
-        }
-
-        info!("CPU supports AVX2 - local LLM will be available");
+        // llama.cpp is compiled from source for maximum CPU compatibility
+        // This ensures it works on older CPUs (Sandy Bridge, etc.) without AVX2
         self.components.insert(
             "llm".to_string(),
             ComponentConfig {
@@ -265,11 +212,19 @@ impl PackageManager {
                 macos_packages: vec![],
                 windows_packages: vec![],
                 download_url: Some(
-                    "https://github.com/ggml-org/llama.cpp/releases/download/b6148/llama-b6148-bin-ubuntu-x64.zip".to_string(),
+                    "https://github.com/ggml-org/llama.cpp/archive/refs/tags/b4967.zip".to_string(),
                 ),
                 binary_name: Some("llama-server".to_string()),
-                pre_install_cmds_linux: vec![],
-                post_install_cmds_linux: vec![],
+                pre_install_cmds_linux: vec![
+                    // Install build dependencies
+                    "which cmake >/dev/null 2>&1 || (sudo apt-get update && sudo apt-get install -y cmake build-essential)".to_string(),
+                ],
+                post_install_cmds_linux: vec![
+                    // Compile llama.cpp from source for this CPU's instruction set
+                    "cd {{BIN_PATH}} && if [ -d llama.cpp-b4967 ]; then mv llama.cpp-b4967/* . && rmdir llama.cpp-b4967; fi".to_string(),
+                    "cd {{BIN_PATH}} && mkdir -p build && cd build && cmake .. -DGGML_NATIVE=ON -DGGML_CPU_ALL_VARIANTS=OFF && cmake --build . --config Release -j$(nproc)".to_string(),
+                    "echo 'llama.cpp compiled successfully for this CPU'".to_string(),
+                ],
                 pre_install_cmds_macos: vec![],
                 post_install_cmds_macos: vec![],
                 pre_install_cmds_windows: vec![],
@@ -283,7 +238,7 @@ impl PackageManager {
                     // GPT-OSS 20B F16 - Recommended for small GPU (16GB VRAM), no CPU
                     // Uncomment to download: "https://huggingface.co/unsloth/gpt-oss-20b-GGUF/resolve/main/gpt-oss-20b-F16.gguf".to_string(),
                 ],
-                exec_cmd: "nohup {{BIN_PATH}}/llama-server --port 8081 --ssl-key-file {{CONF_PATH}}/system/certificates/llm/server.key --ssl-cert-file {{CONF_PATH}}/system/certificates/llm/server.crt -m {{DATA_PATH}}/DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf > {{LOGS_PATH}}/llm.log 2>&1 & nohup {{BIN_PATH}}/llama-server --port 8082 --ssl-key-file {{CONF_PATH}}/system/certificates/embedding/server.key --ssl-cert-file {{CONF_PATH}}/system/certificates/embedding/server.crt -m {{DATA_PATH}}/bge-small-en-v1.5-f32.gguf --embedding > {{LOGS_PATH}}/embedding.log 2>&1 &".to_string(),
+                exec_cmd: "nohup {{BIN_PATH}}/build/bin/llama-server --port 8081 --ssl-key-file {{CONF_PATH}}/system/certificates/llm/server.key --ssl-cert-file {{CONF_PATH}}/system/certificates/llm/server.crt -m {{DATA_PATH}}/DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf > {{LOGS_PATH}}/llm.log 2>&1 & nohup {{BIN_PATH}}/build/bin/llama-server --port 8082 --ssl-key-file {{CONF_PATH}}/system/certificates/embedding/server.key --ssl-cert-file {{CONF_PATH}}/system/certificates/embedding/server.crt -m {{DATA_PATH}}/bge-small-en-v1.5-f32.gguf --embedding > {{LOGS_PATH}}/embedding.log 2>&1 &".to_string(),
                 check_cmd: "curl -f -k https://localhost:8081/health >/dev/null 2>&1 && curl -f -k https://localhost:8082/health >/dev/null 2>&1".to_string(),
             },
         );
diff --git a/src/llm/local.rs b/src/llm/local.rs
index 8d26ef4cf..b8750aa5a 100644
--- a/src/llm/local.rs
+++ b/src/llm/local.rs
@@ -4,7 +4,6 @@ use crate::shared::state::AppState;
 use diesel::prelude::*;
 use log::{error, info, warn};
 use reqwest;
-use std::path::Path;
 use std::sync::Arc;
 use tokio;
 
@@ -69,51 +68,6 @@ pub async fn ensure_llama_servers_running(
     info!("  LLM Model: {}", llm_model);
     info!("  Embedding Model: {}", embedding_model);
     info!("  LLM Server Path: {}", llm_server_path);
-
-    // Check if llama-server binary exists
-    let llama_server_path = if llm_server_path.is_empty() {
-        "./botserver-stack/bin/llm/build/bin/llama-server".to_string()
-    } else {
-        format!("{}/llama-server", llm_server_path)
-    };
-
-    if !Path::new(&llama_server_path).exists() {
-        warn!("llama-server binary not found at: {}", llama_server_path);
-        warn!("Local LLM server will not be available.");
-        warn!("This may be because:");
-        warn!("  1. The LLM component was not installed (check if CPU supports AVX2)");
-        warn!("  2. The binary path is incorrect");
-        warn!("Continuing without local LLM - use external LLM API instead.");
-        return Ok(());
-    }
-
-    // Test if the binary can actually run (check for illegal instruction)
-    info!("Testing llama-server binary compatibility...");
-    let test_result = std::process::Command::new(&llama_server_path)
-        .arg("--version")
-        .output();
-
-    match test_result {
-        Ok(output) => {
-            if !output.status.success() {
-                let stderr = String::from_utf8_lossy(&output.stderr);
-                warn!("llama-server test failed: {}", stderr);
-                if stderr.contains("Illegal instruction") {
-                    error!("CPU does not support required instructions (AVX2) for llama-server");
-                    error!("Your CPU: Check /proc/cpuinfo for 'avx2' flag");
-                    error!("Options:");
-                    error!("  1. Compile llama.cpp from source with your CPU's instruction set");
-                    error!("  2. Use an external LLM API (OpenAI, Anthropic, etc.)");
-                    return Ok(());
-                }
-            }
-        }
-        Err(e) => {
-            warn!("Failed to test llama-server: {}", e);
-            // Continue anyway - might work at runtime
-        }
-    }
-
     info!("Restarting any existing llama-server processes...");
 
     if let Err(e) = tokio::process::Command::new("sh")