From 87df733db070858d499371ce0a39ad3767894e1c Mon Sep 17 00:00:00 2001 From: "Rodrigo Rodriguez (Pragmatismo)" Date: Mon, 13 Apr 2026 18:33:16 -0300 Subject: [PATCH] fix: GLM client - add chat_template_kwargs, handle reasoning_content, increase max_tokens to 16384 --- src/llm/glm.rs | 166 +++++++++++++++++++++++++++---------------------- 1 file changed, 92 insertions(+), 74 deletions(-) diff --git a/src/llm/glm.rs b/src/llm/glm.rs index e6aa7bb9..321c8b1d 100644 --- a/src/llm/glm.rs +++ b/src/llm/glm.rs @@ -1,16 +1,12 @@ use async_trait::async_trait; use futures::StreamExt; -use log::{error, info}; +use log::{error, info, trace}; use serde::{Deserialize, Serialize}; use serde_json::Value; use tokio::sync::mpsc; use super::LLMProvider; -// GLM / z.ai API Client -// Similar to OpenAI but with different endpoint structure -// For z.ai, base URL already contains version (e.g., /v4), endpoint is just /chat/completions - #[derive(Debug, Clone, Serialize, Deserialize)] pub struct GLMMessage { pub role: String, @@ -20,6 +16,12 @@ pub struct GLMMessage { pub tool_calls: Option>, } +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct GLMChatTemplateKwargs { + pub enable_thinking: bool, + pub clear_thinking: bool, +} + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct GLMRequest { pub model: String, @@ -36,6 +38,8 @@ pub struct GLMRequest { pub tools: Option>, #[serde(skip_serializing_if = "Option::is_none")] pub tool_choice: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub chat_template_kwargs: Option, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -58,7 +62,6 @@ pub struct GLMResponse { pub usage: Option, } -// Streaming structures #[derive(Debug, Clone, Default, Serialize, Deserialize)] pub struct GLMStreamDelta { #[serde(default)] @@ -75,7 +78,6 @@ pub struct GLMStreamDelta { pub struct GLMStreamChoice { #[serde(default)] pub index: u32, - #[serde(default)] pub delta: GLMStreamDelta, #[serde(default)] pub finish_reason: Option, @@ -116,7 +118,6 @@ impl GLMClient { } } - /// Sanitizes a string by removing invalid UTF-8 surrogate characters fn sanitize_utf8(input: &str) -> String { input.chars() .filter(|c| { @@ -142,26 +143,29 @@ impl LLMProvider for GLMClient { tool_calls: None, }]; - // NVIDIA API uses z-ai/glm4.7 as the model identifier let model_name = if model == "glm-4" || model == "glm-4.7" { "z-ai/glm4.7" } else { model }; - let request = GLMRequest { - model: model_name.to_string(), - messages, - stream: Some(false), - max_tokens: None, - temperature: Some(1.0), - top_p: Some(1.0), - tools: None, - tool_choice: None, - }; + let request = GLMRequest { + model: model_name.to_string(), + messages, + stream: Some(false), + max_tokens: Some(16384), + temperature: Some(1.0), + top_p: Some(1.0), + tools: None, + tool_choice: None, + chat_template_kwargs: Some(GLMChatTemplateKwargs { + enable_thinking: true, + clear_thinking: false, + }), + }; let url = self.build_url(); - info!("GLM non-streaming request to: {}", url); + info!("[GLM] Non-streaming request to: {}", url); let response = self .client @@ -174,7 +178,7 @@ impl LLMProvider for GLMClient { if !response.status().is_success() { let error_text = response.text().await.unwrap_or_default(); - error!("GLM API error: {}", error_text); + error!("[GLM] API error: {}", error_text); return Err(format!("GLM API error: {}", error_text).into()); } @@ -197,15 +201,12 @@ impl LLMProvider for GLMClient { key: &str, tools: Option<&Vec>, ) -> Result<(), Box> { - // config IS the messages array directly, not nested let messages = if let Some(msgs) = config.as_array() { - // Convert messages from config format to GLM format msgs.iter() .filter_map(|m| { let role = m.get("role")?.as_str()?; let content = m.get("content")?.as_str()?; let sanitized = Self::sanitize_utf8(content); - // NVIDIA API accepts empty content, don't filter them out Some(GLMMessage { role: role.to_string(), content: Some(sanitized), @@ -214,7 +215,6 @@ impl LLMProvider for GLMClient { }) .collect::>() } else { - // Fallback to building from prompt vec![GLMMessage { role: "user".to_string(), content: Some(Self::sanitize_utf8(prompt)), @@ -222,39 +222,39 @@ impl LLMProvider for GLMClient { }] }; - // If no messages, return error if messages.is_empty() { return Err("No valid messages in request".into()); } - // NVIDIA API uses z-ai/glm4.7 as the model identifier - // GLM-4.7 supports standard OpenAI-compatible function calling let model_name = if model == "glm-4" || model == "glm-4.7" { "z-ai/glm4.7" } else { model }; - // Set tool_choice to "auto" when tools are present - this tells GLM to automatically decide when to call a tool let tool_choice = if tools.is_some() { Some(serde_json::json!("auto")) } else { None }; - let request = GLMRequest { - model: model_name.to_string(), - messages, - stream: Some(true), - max_tokens: None, - temperature: Some(1.0), - top_p: Some(1.0), - tools: tools.cloned(), - tool_choice, - }; + let request = GLMRequest { + model: model_name.to_string(), + messages, + stream: Some(true), + max_tokens: Some(16384), + temperature: Some(1.0), + top_p: Some(1.0), + tools: tools.cloned(), + tool_choice, + chat_template_kwargs: Some(GLMChatTemplateKwargs { + enable_thinking: true, + clear_thinking: false, + }), + }; let url = self.build_url(); - info!("GLM streaming request to: {}", url); + info!("[GLM] Streaming request to: {}", url); let response = self .client @@ -267,29 +267,28 @@ impl LLMProvider for GLMClient { if !response.status().is_success() { let error_text = response.text().await.unwrap_or_default(); - error!("GLM streaming error: {}", error_text); + error!("[GLM] Streaming error: {}", error_text); return Err(format!("GLM streaming error: {}", error_text).into()); } let mut stream = response.bytes_stream(); - + let mut in_reasoning = false; + let mut has_sent_thinking = false; let mut buffer = Vec::new(); + while let Some(chunk_result) = stream.next().await { let chunk = chunk_result.map_err(|e| format!("Stream error: {}", e))?; - buffer.extend_from_slice(&chunk); let data = String::from_utf8_lossy(&buffer); - // Process SSE lines for line in data.lines() { let line = line.trim(); - if line.is_empty() { continue; } if line == "data: [DONE]" { - std::mem::drop(tx.send(String::new())); // Signal end + let _ = tx.send(String::new()).await; return Ok(()); } @@ -299,43 +298,64 @@ impl LLMProvider for GLMClient { if let Some(choices) = chunk_data.get("choices").and_then(|c| c.as_array()) { for choice in choices { if let Some(delta) = choice.get("delta") { - // Handle tool_calls (GLM-4.7 standard function calling) + // Handle tool_calls if let Some(tool_calls) = delta.get("tool_calls").and_then(|t| t.as_array()) { for tool_call in tool_calls { - // Send tool_calls as JSON for the calling code to process let tool_call_json = serde_json::json!({ "type": "tool_call", "content": tool_call }).to_string(); - match tx.send(tool_call_json).await { - Ok(_) => {}, - Err(e) => { - error!("Failed to send tool_call to channel: {}", e); - } - } + let _ = tx.send(tool_call_json).await; } } - // GLM-4.7 on NVIDIA sends thinking text via reasoning_content - // The actual user-facing response is in content field - // We ONLY send content — never reasoning_content (internal thinking) - if let Some(content) = delta.get("content").and_then(|c| c.as_str()) { - if !content.is_empty() { - match tx.send(content.to_string()).await { - Ok(_) => {}, - Err(e) => { - error!("Failed to send to channel: {}", e); - } - } + // Handle reasoning_content (thinking phase) + let reasoning = delta.get("reasoning_content") + .and_then(|r| r.as_str()) + .or_else(|| delta.get("reasoning").and_then(|r| r.as_str())); + + let content = delta.get("content").and_then(|c| c.as_str()); + + // Enter reasoning mode + if reasoning.is_some() && content.is_none() { + if !in_reasoning { + trace!("[GLM] Entering reasoning/thinking mode"); + in_reasoning = true; + } + if !has_sent_thinking { + let thinking = serde_json::json!({ + "type": "thinking", + "content": "\u{1f914} Pensando..." + }).to_string(); + let _ = tx.send(thinking).await; + has_sent_thinking = true; + } + continue; + } + + // Exited reasoning — content is now real response + if in_reasoning && content.is_some() { + trace!("[GLM] Exited reasoning mode"); + in_reasoning = false; + let clear = serde_json::json!({ + "type": "thinking_clear", + "content": "" + }).to_string(); + let _ = tx.send(clear).await; + } + + // Send actual content to user + if let Some(text) = content { + if !text.is_empty() { + let _ = tx.send(text.to_string()).await; } } - } else { - // No delta in choice } + if let Some(reason) = choice.get("finish_reason").and_then(|r| r.as_str()) { if !reason.is_empty() { - info!("GLM stream finished: {}", reason); - std::mem::drop(tx.send(String::new())); + info!("[GLM] Stream finished: {}", reason); + let _ = tx.send(String::new()).await; return Ok(()); } } @@ -345,13 +365,12 @@ impl LLMProvider for GLMClient { } } - // Keep unprocessed data in buffer if let Some(last_newline) = data.rfind('\n') { buffer = buffer[last_newline + 1..].to_vec(); } } - std::mem::drop(tx.send(String::new())); // Signal completion + let _ = tx.send(String::new()).await; Ok(()) } @@ -359,8 +378,7 @@ impl LLMProvider for GLMClient { &self, _session_id: &str, ) -> Result<(), Box> { - // GLM doesn't have job cancellation - info!("GLM cancel requested for session {} (no-op)", _session_id); + info!("[GLM] Cancel requested for session {} (no-op)", _session_id); Ok(()) } -} +} \ No newline at end of file