feat(llm): add flash-attn flag to local server args
Added the `--flash-attn on` flag to the LLM server startup arguments to enable the flash attention optimization. This improves performance while leaving the existing sampling parameters (top_p, temp, repeat-penalty) unchanged. A TODO was added to move these parameters into config for better maintainability.
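For illustration only, with hypothetical values model_path = "/models/model.gguf", port = 8085, and gpu_layers = 99, the new argument string expands to:

    -m /models/model.gguf --host 0.0.0.0 --port 8085 --top_p 0.95 --flash-attn on --temp 0.6 --repeat-penalty 1.2 --n-gpu-layers 99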
parent a024f9590c
commit fd9e14e43a
1 changed file with 2 additions and 1 deletion
@@ -233,9 +233,10 @@ pub async fn start_llm_server(
         .get_config(&default_bot_id, "llm-server-ctx-size", None)
         .unwrap_or("4096".to_string());
 
+    // TODO: Move flash-attn, temp, top_p, repeat-penalty to config as well.
     let mut args = format!(
-        "-m {} --host 0.0.0.0 --port {} --top_p 0.95 --temp 0.6 --repeat-penalty 1.2 --n-gpu-layers {}",
+        "-m {} --host 0.0.0.0 --port {} --top_p 0.95 --flash-attn on --temp 0.6 --repeat-penalty 1.2 --n-gpu-layers {}",
         model_path, port, gpu_layers
     );
 
     if !reasoning_format.is_empty() {
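A possible shape for the follow-up TODO, sketched against the surrounding code. The config handle name (`config` below), the `llm-server-top-p` / `llm-server-temp` / `llm-server-repeat-penalty` / `llm-server-flash-attn` keys, and the fallback values are assumptions that mirror the existing `llm-server-ctx-size` lookup; only `get_config(&default_bot_id, ..., None)` is taken from the diff above.

    // Hypothetical sketch: read the sampling/attention parameters from config,
    // falling back to today's hard-coded defaults. The `config` handle and the
    // key names are assumptions modeled on the llm-server-ctx-size read above.
    let top_p = config
        .get_config(&default_bot_id, "llm-server-top-p", None)
        .unwrap_or("0.95".to_string());
    let temp = config
        .get_config(&default_bot_id, "llm-server-temp", None)
        .unwrap_or("0.6".to_string());
    let repeat_penalty = config
        .get_config(&default_bot_id, "llm-server-repeat-penalty", None)
        .unwrap_or("1.2".to_string());
    let flash_attn = config
        .get_config(&default_bot_id, "llm-server-flash-attn", None)
        .unwrap_or("on".to_string());

    // Same format string as the diff, with the hard-coded values swapped for
    // the config-backed ones.
    let mut args = format!(
        "-m {} --host 0.0.0.0 --port {} --top_p {} --flash-attn {} --temp {} --repeat-penalty {} --n-gpu-layers {}",
        model_path, port, top_p, flash_attn, temp, repeat_penalty, gpu_layers
    );

With something like this in place, the flash-attn, temp, top_p, and repeat-penalty values could be adjusted per bot without a rebuild, which is the maintainability gain the TODO points at.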