From fd9e14e43a4d7de2b5b897cc4160ae9c56137713 Mon Sep 17 00:00:00 2001
From: "Rodrigo Rodriguez (Pragmatismo)"
Date: Wed, 12 Nov 2025 18:19:23 -0300
Subject: [PATCH] feat(llm): add flash-attn flag to local server args

Added the `--flash-attn on` flag to the LLM server startup arguments to
enable the flash attention optimization. This improves performance
while leaving the existing sampling parameters (top_p, temp,
repeat-penalty) unchanged. A TODO was added to move these parameters
to config for better maintainability.
---
 src/llm/local.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/llm/local.rs b/src/llm/local.rs
index cf49236a..76750bac 100644
--- a/src/llm/local.rs
+++ b/src/llm/local.rs
@@ -233,9 +233,10 @@ pub async fn start_llm_server(
         .get_config(&default_bot_id, "llm-server-ctx-size", None)
         .unwrap_or("4096".to_string());
 
+    // TODO: Move flash-attn, temp, top_p, repeat-penalty to config as well.
     let mut args = format!(
-        "-m {} --host 0.0.0.0 --port {} --top_p 0.95 --temp 0.6 --repeat-penalty 1.2 --n-gpu-layers {}",
+        "-m {} --host 0.0.0.0 --port {} --top_p 0.95 --flash-attn on --temp 0.6 --repeat-penalty 1.2 --n-gpu-layers {}",
         model_path, port, gpu_layers
     );
     if !reasoning_format.is_empty() {
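
Note: below is a minimal sketch of how the TODO in this patch might be
resolved, reading the hardcoded flags from config via the same
get_config accessor used for llm-server-ctx-size in the surrounding
context. The receiver name (`config`) and the new config key names are
hypothetical illustrations, not part of this patch:

    // Hypothetical: pull the sampling/attention flags from per-bot
    // config, falling back to the current hardcoded defaults. The
    // key names below are invented for illustration; only the
    // get_config(...).unwrap_or(...) pattern comes from the code above.
    let top_p = config
        .get_config(&default_bot_id, "llm-server-top-p", None)
        .unwrap_or("0.95".to_string());
    let temp = config
        .get_config(&default_bot_id, "llm-server-temp", None)
        .unwrap_or("0.6".to_string());
    let repeat_penalty = config
        .get_config(&default_bot_id, "llm-server-repeat-penalty", None)
        .unwrap_or("1.2".to_string());
    let flash_attn = config
        .get_config(&default_bot_id, "llm-server-flash-attn", None)
        .unwrap_or("on".to_string());

    // Same format string as the patched line, with the literals
    // replaced by the configured values.
    let mut args = format!(
        "-m {} --host 0.0.0.0 --port {} --top_p {} --flash-attn {} --temp {} --repeat-penalty {} --n-gpu-layers {}",
        model_path, port, top_p, flash_attn, temp, repeat_penalty, gpu_layers
    );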