fix: Kimi K2.5 factory + LLM chunk traces
All checks were successful
BotServer CI/CD / build (push) Successful in 4m35s

- Kimi factory: add max_tokens=16384, temperature=1.0, top_p=1.0,
  and chat_template_kwargs.thinking=true for kimi models
- Add chunk count traces in stream_response so we see LLM progress
  immediately in logs: 'LLM chunk #N received (len=X)'
- Keep generic stream parser clean — model-specific logic lives in
  the request builder (Kimi factory pattern)

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
This commit is contained in:
Rodrigo Rodriguez (Pragmatismo) 2026-04-14 10:20:02 -03:00
parent 03f060680e
commit 679bf05504
2 changed files with 16 additions and 1 deletion

View file

@@ -836,7 +836,9 @@ impl BotOrchestrator {
let _handler = llm_models::get_handler(&model);
trace!("Using model handler for {}", model);
info!("LLM streaming started for session {}", session.id);
trace!("Receiving LLM stream chunks...");
let mut chunk_count: usize = 0;
#[cfg(feature = "nvidia")]
{
@@ -860,6 +862,10 @@ impl BotOrchestrator {
}
while let Some(chunk) = stream_rx.recv().await {
chunk_count += 1;
if chunk_count <= 3 || chunk_count % 50 == 0 {
info!("LLM chunk #{chunk_count} received for session {} (len={})", session.id, chunk.len());
}
// ===== GENERIC TOOL EXECUTION =====
// Add chunk to tool_call_buffer and try to parse

View file

@@ -382,9 +382,18 @@ impl LLMProvider for OpenAIClient {
let mut request_body = serde_json::json!({
"model": model,
"messages": messages,
"stream": true
"stream": true,
"max_tokens": 16384,
"temperature": 1.0,
"top_p": 1.0
});
// Kimi K2.5 factory: enable thinking mode via chat_template_kwargs
if model.contains("kimi") {
request_body["chat_template_kwargs"] = serde_json::json!({"thinking": true});
info!("Kimi factory: enabled thinking mode (chat_template_kwargs)");
}
// Add tools to the request if provided
if let Some(tools_value) = tools {
if !tools_value.is_empty() {