From cd0e049e81f8c3609393f1f4bbb6cc6898eaa2de Mon Sep 17 00:00:00 2001 From: "Rodrigo Rodriguez (Pragmatismo)" Date: Sun, 12 Apr 2026 08:21:39 -0300 Subject: [PATCH] Reduce embedding batch_size from 16 to 2 to prevent llama-server crash The bge-small-en-v1.5-f32.gguf model has n_ctx_train=512. With batch_size=16 and ~300+ tokens per chunk, total tokens exceed 512, causing a GGML_ASSERT crash. Now with batch_size=2, embeddings are processed safely. --- src/core/kb/embedding_generator.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/kb/embedding_generator.rs b/src/core/kb/embedding_generator.rs index ae4cdad4..2c149a90 100644 --- a/src/core/kb/embedding_generator.rs +++ b/src/core/kb/embedding_generator.rs @@ -43,7 +43,7 @@ impl Default for EmbeddingConfig { embedding_model: "BAAI/bge-multilingual-gemma2".to_string(), embedding_key: None, dimensions: 2048, - batch_size: 16, + batch_size: 2, timeout_seconds: 60, max_concurrent_requests: 1, connect_timeout_seconds: 10,