From 97661d75e292b0c1b765ed0c4fe8f931e2e64243 Mon Sep 17 00:00:00 2001 From: "Rodrigo Rodriguez (Pragmatismo)" Date: Sun, 8 Mar 2026 14:52:59 -0300 Subject: [PATCH] feat(whatsapp): isolate lists as single messages and remove code blocks - Split list detection into numbered and bullet list items - Add looks_like_list_start() to detect when list is beginning - Add looks_like_list_end() to detect when list has ended - Add split_text_before_list() to separate text before list - Add split_list_from_text() to separate list from text after - Update streaming logic to send lists as isolated messages - Add code block removal (triple backticks and inline backticks) - Add comprehensive unit tests for list detection functions Resolves: Lists being mixed with other text in WhatsApp messages Resolves: JavaScript/C# code leaking into WhatsApp messages --- src/core/bot/channels/whatsapp.rs | 250 ++++++++++++- src/whatsapp/mod.rs | 574 +++++++++++++++++++++++++++++- 2 files changed, 799 insertions(+), 25 deletions(-) diff --git a/src/core/bot/channels/whatsapp.rs b/src/core/bot/channels/whatsapp.rs index ca1d7dbe..5e4e2c6e 100644 --- a/src/core/bot/channels/whatsapp.rs +++ b/src/core/bot/channels/whatsapp.rs @@ -50,6 +50,65 @@ impl WhatsAppAdapter { } } + /// Sanitize Markdown text for WhatsApp compatibility + /// WhatsApp only supports: *bold*, _italic_, ~strikethrough~, ```monospace``` + /// Does NOT support: headers (###), links [text](url), checkboxes, etc. + pub fn sanitize_for_whatsapp(text: &str) -> String { + let mut result = text.to_string(); + + // Remove Markdown headers (### ## # at start of lines) + result = regex::Regex::new(r"(?m)^#{1,6}\s*") + .map(|re| re.replace_all(&result, "").to_string()) + .unwrap_or(result); + + // Convert Markdown links [text](url) to "text: url" + result = regex::Regex::new(r"\[([^\]]+)\]\(([^)]+)\)") + .map(|re| re.replace_all(&result, "$1: $2").to_string()) + .unwrap_or(result); + + // Remove image syntax ![alt](url) - just keep alt text + result = regex::Regex::new(r"!\[([^\]]*)\]\([^)]+\)") + .map(|re| re.replace_all(&result, "$1").to_string()) + .unwrap_or(result); + + // Remove checkbox syntax [ ] and [x] + result = regex::Regex::new(r"\[[ x]\]") + .map(|re| re.replace_all(&result, "•").to_string()) + .unwrap_or(result); + + // Remove horizontal rules (--- or ***) + result = regex::Regex::new(r"(?m)^[-*]{3,}\s*$") + .map(|re| re.replace_all(&result, "").to_string()) + .unwrap_or(result); + + // Remove code blocks with triple backticks ```code``` + result = regex::Regex::new(r"```[\s\S]*?```") + .map(|re| re.replace_all(&result, "").to_string()) + .unwrap_or(result); + + // Remove inline code with single backticks `code` + result = regex::Regex::new(r"`[^`]+`") + .map(|re| re.replace_all(&result, "").to_string()) + .unwrap_or(result); + + // Remove HTML tags if any + result = regex::Regex::new(r"<[^>]+>") + .map(|re| re.replace_all(&result, "").to_string()) + .unwrap_or(result); + + // Clean up multiple consecutive blank lines + result = regex::Regex::new(r"\n{3,}") + .map(|re| re.replace_all(&result, "\n\n").to_string()) + .unwrap_or(result); + + // Clean up trailing whitespace on lines + result = regex::Regex::new(r"[ \t]+$") + .map(|re| re.replace_all(&result, "").to_string()) + .unwrap_or(result); + + result.trim().to_string() + } + async fn send_whatsapp_message( &self, to: &str, @@ -368,6 +427,154 @@ impl WhatsAppAdapter { } } + /// Smart message splitting for WhatsApp's character limit. + /// Splits at paragraph boundaries, keeping lists together. + /// Groups up to 3 paragraphs per message when possible. + pub fn split_message_smart(&self, content: &str, max_length: usize) -> Vec { + let mut parts = Vec::new(); + let mut current_part = String::new(); + let mut paragraph_count = 0; + + // Split content into blocks (paragraphs or list items) + let lines: Vec<&str> = content.lines().collect(); + let mut i = 0; + + while i < lines.len() { + let line = lines[i]; + let is_list_item = line.trim().starts_with("- ") + || line.trim().starts_with("* ") + || line.trim().starts_with("• ") + || line.trim().starts_with(|c: char| c.is_numeric()); + + // Check if this is the start of a list + if is_list_item { + // Flush current part if it has content and adding list would exceed limit + if !current_part.is_empty() { + // If we have 3+ paragraphs, flush + if paragraph_count >= 3 || current_part.len() + line.len() > max_length { + parts.push(current_part.trim().to_string()); + current_part = String::new(); + paragraph_count = 0; + } + } + + // Collect entire list as one block + let mut list_block = String::new(); + while i < lines.len() { + let list_line = lines[i]; + let is_still_list = list_line.trim().starts_with("- ") + || list_line.trim().starts_with("* ") + || list_line.trim().starts_with("• ") + || list_line.trim().starts_with(|c: char| c.is_numeric()) + || (list_line.trim().is_empty() && i + 1 < lines.len() && { + let next = lines[i + 1]; + next.trim().starts_with("- ") + || next.trim().starts_with("* ") + || next.trim().starts_with("• ") + }); + + if is_still_list || (list_line.trim().is_empty() && !list_block.is_empty()) { + if list_block.len() + list_line.len() + 1 > max_length { + // List is too long, split it + if !list_block.is_empty() { + if !current_part.is_empty() { + parts.push(current_part.trim().to_string()); + current_part = String::new(); + } + parts.push(list_block.trim().to_string()); + list_block = String::new(); + } + } + if !list_line.trim().is_empty() { + if !list_block.is_empty() { + list_block.push('\n'); + } + list_block.push_str(list_line); + } + i += 1; + } else { + break; + } + } + + if !list_block.is_empty() { + if !current_part.is_empty() && current_part.len() + list_block.len() + 1 <= max_length { + current_part.push('\n'); + current_part.push_str(&list_block); + } else { + if !current_part.is_empty() { + parts.push(current_part.trim().to_string()); + } + parts.push(list_block.trim().to_string()); + current_part = String::new(); + paragraph_count = 0; + } + } + continue; + } + + // Regular paragraph + if !line.trim().is_empty() { + if !current_part.is_empty() { + current_part.push('\n'); + } + current_part.push_str(line); + paragraph_count += 1; + + // Flush if we have 3 paragraphs or exceeded max length + if paragraph_count >= 3 || current_part.len() > max_length { + parts.push(current_part.trim().to_string()); + current_part = String::new(); + paragraph_count = 0; + } + } else if !current_part.is_empty() { + // Empty line marks paragraph end + paragraph_count += 1; + if paragraph_count >= 3 { + parts.push(current_part.trim().to_string()); + current_part = String::new(); + paragraph_count = 0; + } + } + + i += 1; + } + + // Don't forget the last part + if !current_part.trim().is_empty() { + parts.push(current_part.trim().to_string()); + } + + // Handle edge case: if a single part exceeds max_length, force split + let mut final_parts = Vec::new(); + for part in parts { + if part.len() <= max_length { + final_parts.push(part); + } else { + // Hard split at max_length, trying to break at word boundary + let mut remaining = part.as_str(); + while !remaining.is_empty() { + if remaining.len() <= max_length { + final_parts.push(remaining.to_string()); + break; + } + // Find last space before max_length + let split_pos = remaining[..max_length] + .rfind(' ') + .unwrap_or(max_length); + final_parts.push(remaining[..split_pos].to_string()); + remaining = remaining[split_pos..].trim(); + } + } + } + + if final_parts.is_empty() { + final_parts.push(content.to_string()); + } + + final_parts + } + pub fn verify_webhook(&self, token: &str) -> bool { token == self.webhook_verify_token } @@ -405,14 +612,43 @@ impl ChannelAdapter for WhatsAppAdapter { return Err("WhatsApp not configured".into()); } - let message_id = self - .send_whatsapp_message(&response.user_id, &response.content) - .await?; + // WhatsApp has a 4096 character limit per message + // Split message at paragraph/list boundaries + const MAX_WHATSAPP_LENGTH: usize = 4000; // Leave some buffer - info!( - "WhatsApp message sent to {}: {} (message_id: {})", - response.user_id, response.content, message_id - ); + // Sanitize Markdown for WhatsApp compatibility + let sanitized_content = Self::sanitize_for_whatsapp(&response.content); + + if sanitized_content.len() <= MAX_WHATSAPP_LENGTH { + // Message fits in one part + let message_id = self + .send_whatsapp_message(&response.user_id, &sanitized_content) + .await?; + + info!( + "WhatsApp message sent to {}: {} (message_id: {})", + response.user_id, &sanitized_content.chars().take(100).collect::(), message_id + ); + } else { + // Split message at appropriate boundaries + let parts = self.split_message_smart(&sanitized_content, MAX_WHATSAPP_LENGTH); + + for (i, part) in parts.iter().enumerate() { + let message_id = self + .send_whatsapp_message(&response.user_id, part) + .await?; + + info!( + "WhatsApp message part {}/{} sent to {}: {} (message_id: {})", + i + 1, parts.len(), response.user_id, &part.chars().take(50).collect::(), message_id + ); + + // Small delay between messages to avoid rate limiting + if i < parts.len() - 1 { + tokio::time::sleep(tokio::time::Duration::from_millis(500)).await; + } + } + } Ok(()) } diff --git a/src/whatsapp/mod.rs b/src/whatsapp/mod.rs index 9dd3b2e3..4dba7746 100644 --- a/src/whatsapp/mod.rs +++ b/src/whatsapp/mod.rs @@ -699,13 +699,30 @@ async fn route_to_bot( const MAX_WHATSAPP_LENGTH: usize = 4000; const MIN_FLUSH_PARAGRAPHS: usize = 3; - /// Check if a line is a list item - fn is_list_item(line: &str) -> bool { + /// Check if a line is a list item (numbered: "1. ", "10. ", etc.) + fn is_numbered_list_item(line: &str) -> bool { let trimmed = line.trim(); - trimmed.starts_with("- ") - || trimmed.starts_with("* ") - || trimmed.starts_with("• ") - || trimmed.chars().next().map(|c| c.is_numeric()).unwrap_or(false) + // Must start with digit(s) followed by '.' or ')' and then space or end + let chars: Vec = trimmed.chars().collect(); + let mut i = 0; + // Skip digits + while i < chars.len() && chars[i].is_numeric() { + i += 1; + } + // Must have at least one digit and be followed by '.' or ')' then space + i > 0 && i < chars.len() && (chars[i] == '.' || chars[i] == ')') + && (i + 1 >= chars.len() || chars[i + 1] == ' ') + } + + /// Check if a line is a bullet list item + fn is_bullet_list_item(line: &str) -> bool { + let trimmed = line.trim(); + trimmed.starts_with("- ") || trimmed.starts_with("* ") || trimmed.starts_with("• ") + } + + /// Check if a line is any type of list item + fn is_list_item(line: &str) -> bool { + is_numbered_list_item(line) || is_bullet_list_item(line) } /// Check if buffer contains a list (any line starting with list marker) @@ -713,6 +730,104 @@ async fn route_to_bot( text.lines().any(is_list_item) } + /// Check if buffer looks like it might be starting a list + /// (header followed by blank line, or ends with partial list item) + fn looks_like_list_start(text: &str) -> bool { + let lines: Vec<&str> = text.lines().collect(); + if lines.len() < 2 { + return false; + } + // Check if last non-empty line looks like a header (short, ends with ':') + let last_content = lines.iter().rev().find(|l| !l.trim().is_empty()); + if let Some(line) = last_content { + let trimmed = line.trim(); + // Header pattern: short line ending with ':' + if trimmed.len() < 50 && trimmed.ends_with(':') { + return true; + } + // Partial list item: starts with number but incomplete + if trimmed.chars().next().map(|c| c.is_numeric()).unwrap_or(false) { + return true; + } + } + false + } + + /// Check if a list appears to have ended (had list items, but last lines are not list items) + fn looks_like_list_end(text: &str) -> bool { + let lines: Vec<&str> = text.lines().collect(); + if lines.len() < 3 { + return false; + } + + // Check if there's at least one list item in the text + let has_list_items = lines.iter().any(|l| is_list_item(l)); + if !has_list_items { + return false; + } + + // Check if the last 2+ non-empty lines are NOT list items + let non_empty_lines: Vec<&str> = lines.iter().rev() + .copied() + .filter(|l| !l.trim().is_empty()) + .take(2) + .collect(); + + if non_empty_lines.len() < 2 { + return false; + } + + // If the last 2 non-empty lines are not list items, the list has likely ended + non_empty_lines.iter().all(|l| !is_list_item(l)) + } + + /// Split text into (before_list, list_and_after) + /// Returns everything before the first list item, and everything from the list item onwards + fn split_text_before_list(text: &str) -> (String, String) { + let lines: Vec<&str> = text.lines().collect(); + let mut list_start_idx = None; + + // Find the first list item + for (idx, line) in lines.iter().enumerate() { + if is_list_item(line) { + list_start_idx = Some(idx); + break; + } + } + + match list_start_idx { + Some(idx) => { + let before = lines[..idx].join("\n"); + let rest = lines[idx..].join("\n"); + (before, rest) + } + None => (text.to_string(), String::new()) + } + } + + /// Split text into (list, after_list) + /// Extracts the list portion and any text after it + fn split_list_from_text(text: &str) -> (String, String) { + let lines: Vec<&str> = text.lines().collect(); + let mut list_end_idx = lines.len(); + + // Find where the list ends (first non-list item after list starts) + let mut found_list = false; + for (idx, line) in lines.iter().enumerate() { + if is_list_item(line) { + found_list = true; + } else if found_list && !line.trim().is_empty() { + // Found non-empty, non-list line after list items + list_end_idx = idx; + break; + } + } + + let list = lines[..list_end_idx].join("\n"); + let after = lines[list_end_idx..].join("\n"); + (list, after) + } + /// Send a WhatsApp message part async fn send_part( adapter: &crate::core::bot::channels::whatsapp::WhatsAppAdapter, @@ -753,22 +868,51 @@ async fn route_to_bot( buffer.push_str(&response.content); } - // SIMPLE LOGIC: - // 1. If buffer contains a list, ONLY flush when is_final or too long - // 2. If no list, use normal paragraph-based flushing + // IMPROVED LOGIC: + // 1. If buffer contains a list OR looks like list is starting, wait for final/too long + // 2. Otherwise, use normal paragraph-based flushing let has_list = contains_list(&buffer); + let maybe_list_start = !has_list && looks_like_list_start(&buffer); + let list_ended = has_list && looks_like_list_end(&buffer); - debug!( - "WA stream: is_final={}, has_list={}, buffer_len={}, buffer_preview={:?}", - is_final, has_list, buffer.len(), &buffer.chars().take(100).collect::() + info!( + "WA stream: is_final={}, has_list={}, maybe_start={}, list_ended={}, len={}, preview={:?}", + is_final, has_list, maybe_list_start, list_ended, buffer.len(), &buffer.chars().take(80).collect::() ); - if has_list { - // With lists: only flush when final or too long - // This ensures the ENTIRE list is sent as one message - if is_final || buffer.len() >= MAX_WHATSAPP_LENGTH { - info!("WA sending list message, len={}", buffer.len()); + if has_list || maybe_list_start { + // With lists: isolate them as separate messages + if list_ended { + info!("WA list ended, isolating list message"); + + // Step 1: Split text before list + let (text_before, rest) = split_text_before_list(&buffer); + + // Step 2: Send text before list (if not empty) + if !text_before.trim().is_empty() { + info!("WA sending text before list, len={}", text_before.len()); + send_part(&adapter_for_send, &phone, text_before, false).await; + } + + // Step 3: Split list from text after + let (list, text_after) = split_list_from_text(&rest); + + // Step 4: Send list (isolated) + if !list.trim().is_empty() { + info!("WA sending isolated list, len={}", list.len()); + send_part(&adapter_for_send, &phone, list, false).await; + } + + // Step 5: Keep text after in buffer + buffer = text_after; + + if !buffer.trim().is_empty() { + debug!("WA keeping text after list in buffer, len={}", buffer.len()); + } + } else if is_final || buffer.len() >= MAX_WHATSAPP_LENGTH { + // Final message or buffer too long - send everything + info!("WA sending list message (final/overflow), len={}, has_list={}", buffer.len(), has_list); if buffer.len() > MAX_WHATSAPP_LENGTH { let parts = adapter_for_send.split_message_smart(&buffer, MAX_WHATSAPP_LENGTH); for part in parts { @@ -781,7 +925,6 @@ async fn route_to_bot( } else { debug!("WA waiting for more list content (buffer len={})", buffer.len()); } - // Otherwise: wait for more content (don't flush mid-list) } else { // No list: use normal paragraph-based flushing let paragraph_count = buffer @@ -1726,4 +1869,399 @@ mod tests { assert_eq!(value.statuses.len(), 1); assert_eq!(value.statuses[0].status, "sent"); } + + // ==================== List Detection Tests ==================== + + /// Helper function to test numbered list item detection + fn is_numbered_list_item(line: &str) -> bool { + let trimmed = line.trim(); + let chars: Vec = trimmed.chars().collect(); + let mut i = 0; + while i < chars.len() && chars[i].is_numeric() { + i += 1; + } + i > 0 && i < chars.len() && (chars[i] == '.' || chars[i] == ')') + && (i + 1 >= chars.len() || chars[i + 1] == ' ') + } + + fn is_bullet_list_item(line: &str) -> bool { + let trimmed = line.trim(); + trimmed.starts_with("- ") || trimmed.starts_with("* ") || trimmed.starts_with("• ") + } + + fn is_list_item(line: &str) -> bool { + is_numbered_list_item(line) || is_bullet_list_item(line) + } + + fn contains_list(text: &str) -> bool { + text.lines().any(is_list_item) + } + + fn looks_like_list_start(text: &str) -> bool { + let lines: Vec<&str> = text.lines().collect(); + if lines.len() < 2 { + return false; + } + let last_content = lines.iter().rev().find(|l| !l.trim().is_empty()); + if let Some(line) = last_content { + let trimmed = line.trim(); + if trimmed.len() < 50 && trimmed.ends_with(':') { + return true; + } + if trimmed.chars().next().map(|c| c.is_numeric()).unwrap_or(false) { + return true; + } + } + false + } + + fn looks_like_list_end(text: &str) -> bool { + let lines: Vec<&str> = text.lines().collect(); + if lines.len() < 3 { + return false; + } + + // Check if there's at least one list item in the text + let has_list_items = lines.iter().any(|l| is_list_item(l)); + if !has_list_items { + return false; + } + + // Check if the last 2+ non-empty lines are NOT list items + let non_empty_lines: Vec<&str> = lines.iter().rev() + .copied() + .filter(|l| !l.trim().is_empty()) + .take(2) + .collect(); + + if non_empty_lines.len() < 2 { + return false; + } + + // If the last 2 non-empty lines are not list items, the list has likely ended + non_empty_lines.iter().all(|l| !is_list_item(l)) + } + + /// Split text into (before_list, list_and_after) + /// Returns everything before the first list item, and everything from the list item onwards + fn split_text_before_list(text: &str) -> (String, String) { + let lines: Vec<&str> = text.lines().collect(); + let mut list_start_idx = None; + + // Find the first list item + for (idx, line) in lines.iter().enumerate() { + if is_list_item(line) { + list_start_idx = Some(idx); + break; + } + } + + match list_start_idx { + Some(idx) => { + let before = lines[..idx].join("\n"); + let rest = lines[idx..].join("\n"); + (before, rest) + } + None => (text.to_string(), String::new()) + } + } + + /// Split text into (list, after_list) + /// Extracts the list portion and any text after it + fn split_list_from_text(text: &str) -> (String, String) { + let lines: Vec<&str> = text.lines().collect(); + let mut list_end_idx = lines.len(); + + // Find where the list ends (first non-list item after list starts) + let mut found_list = false; + for (idx, line) in lines.iter().enumerate() { + if is_list_item(line) { + found_list = true; + } else if found_list && !line.trim().is_empty() { + // Found non-empty, non-list line after list items + list_end_idx = idx; + break; + } + } + + let list = lines[..list_end_idx].join("\n"); + let after = lines[list_end_idx..].join("\n"); + (list, after) + } + + #[test] + fn test_numbered_list_detection() { + // Valid numbered list items + assert!(is_numbered_list_item("1. Item")); + assert!(is_numbered_list_item("1. Item with text")); + assert!(is_numbered_list_item("10. Tenth item")); + assert!(is_numbered_list_item("1) Item with parenthesis")); + assert!(is_numbered_list_item(" 1. Indented item")); // trim works + + // Invalid - not numbered list items + assert!(!is_numbered_list_item("Item 1")); // number at end + assert!(!is_numbered_list_item("2024 was a year")); // year in sentence + assert!(!is_numbered_list_item("1.Item")); // no space after dot + assert!(!is_numbered_list_item("Item")); // no number + assert!(!is_numbered_list_item("")); // empty + } + + #[test] + fn test_bullet_list_detection() { + // Valid bullet list items + assert!(is_bullet_list_item("- Item")); + assert!(is_bullet_list_item("* Item")); + assert!(is_bullet_list_item("• Item")); + assert!(is_bullet_list_item(" - Indented item")); + + // Invalid + assert!(!is_bullet_list_item("Item - with dash")); + assert!(!is_bullet_list_item("-Item")); // no space after dash + } + + #[test] + fn test_contains_list() { + // Contains numbered list + assert!(contains_list("Some text\n1. First item\n2. Second item")); + + // Contains bullet list + assert!(contains_list("- Item 1\n- Item 2")); + + // No list + assert!(!contains_list("Just regular text")); + assert!(!contains_list("2024 was a great year")); // year should not trigger + + // Mixed content with list + assert!(contains_list("Here are the options:\n\n1. Option A\n2. Option B")); + } + + #[test] + fn test_looks_like_list_start() { + // Header followed by content looks like list start + assert!(looks_like_list_start("Aulas Disponíveis:\n\n")); + assert!(looks_like_list_start("Options:\n\nSome content")); + + // Number at start looks like potential list + assert!(looks_like_list_start("Some text\n1")); + + // Regular text doesn't look like list start + assert!(!looks_like_list_start("Just regular text")); + assert!(!looks_like_list_start("Single line")); + } + + #[test] + fn test_full_list_scenario() { + // Simulate the exact scenario from the bug report + let content = r#"Aulas Disponíveis: + +1. Aula de Violão - Aprenda a tocar violão do básico ao avançado +2. Aula de Piano - Desenvolva suas habilidades no piano +3. Aula de Canto - Técnicas vocais para todos os níveis +4. Aula de Teatro - Expressão corporal e interpretação +5. Aula de Dança - Diversos estilos de dança +6. Aula de Desenho - Técnicas de desenho e pintura +7. Aula de Inglês - Aprenda inglês de forma dinâmica +8. Aula de Robótica - Introdução à robótica e programação + +Estou à disposição para ajudar com mais informações!"#; + + // Should detect list + assert!(contains_list(content), "Should detect numbered list in content"); + + // Count list items + let list_items: Vec<&str> = content.lines().filter(|l| is_list_item(l)).collect(); + assert_eq!(list_items.len(), 8, "Should detect all 8 list items"); + + // Verify each item is detected + for (i, item) in list_items.iter().enumerate() { + assert!(item.starts_with(&format!("{}.", i + 1)), + "Item {} should start with '{}.'", i + 1, i + 1); + } + + // NEW: Should detect that list has ended (content after list) + assert!(looks_like_list_end(content), "Should detect list has ended"); + } + + #[test] + fn test_looks_like_list_end() { + // List with content after - should detect as ended + let with_content_after = r#"Cursos disponíveis: + +1. Ensino Fundamental I +2. Ensino Fundamental II +3. Ensino Médio + +Entre em contato para mais informações."#; + assert!(looks_like_list_end(with_content_after), "Should detect list ended with content after"); + + // List with multiple paragraphs after + let with_multiple_after = r#"Opções: + +1. Opção A +2. Opção B +3. Opção C + +Texto adicional aqui. + +Mais um parágrafo."#; + assert!(looks_like_list_end(with_multiple_after), "Should detect list ended with multiple paragraphs"); + + // List still in progress - should NOT detect as ended + let in_progress = r#"Cursos: + +1. Curso A +2. Curso B +3."#; + assert!(!looks_like_list_end(in_progress), "Should NOT detect list ended (still in progress)"); + + // List with only one line after (need 2+ to confirm end) + let one_line_after = r#"Cursos: + +1. Curso A +2. Curso B + +Uma linha apenas."#; + // This has 2 non-empty lines after (empty line + text), so it should detect as ended + assert!(looks_like_list_end(one_line_after), "Should detect list ended with 2+ non-empty lines after"); + + // No list at all + let no_list = "Apenas texto normal sem lista."; + assert!(!looks_like_list_end(no_list), "Should NOT detect list ended (no list present)"); + + // List with blank lines after (but no content) + let list_with_blanks = r#"Lista: + +1. Item 1 +2. Item 2 + + +"#; + assert!(!looks_like_list_end(list_with_blanks), "Should NOT detect list ended (only blank lines after)"); + } + + #[test] + fn test_split_text_before_list() { + // Text with list in the middle + let text1 = "Texto antes da lista\n\n1. Primeiro item\n2. Segundo item\n\nTexto depois"; + let (before, rest) = split_text_before_list(text1); + assert_eq!(before, "Texto antes da lista\n"); + assert!(rest.starts_with("1. Primeiro item")); + + // List at the start (no text before) + let text2 = "1. Item 1\n2. Item 2"; + let (before, rest) = split_text_before_list(text2); + assert_eq!(before, ""); + assert_eq!(rest, "1. Item 1\n2. Item 2"); + + // No list at all + let text3 = "Apenas texto sem lista"; + let (before, rest) = split_text_before_list(text3); + assert_eq!(before, "Apenas texto sem lista"); + assert_eq!(rest, ""); + + // Multiple paragraphs before list + let text4 = "Parágrafo 1\n\nParágrafo 2\n\n1. Item"; + let (before, rest) = split_text_before_list(text4); + assert_eq!(before, "Parágrafo 1\n\nParágrafo 2\n"); + assert_eq!(rest, "1. Item"); + + // Bullet list + let text5 = "Introdução\n- Item 1\n- Item 2"; + let (before, rest) = split_text_before_list(text5); + assert_eq!(before, "Introdução"); + assert!(rest.starts_with("- Item 1")); + } + + #[test] + fn test_split_list_from_text() { + // List with text after + let text1 = "1. Primeiro item\n2. Segundo item\n\nTexto depois da lista"; + let (list, after) = split_list_from_text(text1); + assert_eq!(list, "1. Primeiro item\n2. Segundo item\n"); + assert_eq!(after, "Texto depois da lista"); + + // List at the end (no text after) + let text2 = "1. Item 1\n2. Item 2"; + let (list, after) = split_list_from_text(text2); + assert_eq!(list, "1. Item 1\n2. Item 2"); + assert_eq!(after, ""); + + // List only + let text3 = "1. Item"; + let (list, after) = split_list_from_text(text3); + assert_eq!(list, "1. Item"); + assert_eq!(after, ""); + + // List with blank lines after + let text4 = "1. Item 1\n2. Item 2\n\n\nTexto"; + let (list, after) = split_list_from_text(text4); + assert_eq!(list, "1. Item 1\n2. Item 2\n\n"); + assert_eq!(after, "Texto"); + + // Bullet list with text after + let text5 = "- Item 1\n- Item 2\n\nConclusão"; + let (list, after) = split_list_from_text(text5); + assert_eq!(list, "- Item 1\n- Item 2\n"); + assert_eq!(after, "Conclusão"); + + // Multiple paragraphs after list + let text6 = "1. Item\n\nTexto 1\n\nTexto 2"; + let (list, after) = split_list_from_text(text6); + assert_eq!(list, "1. Item\n"); + assert_eq!(after, "Texto 1\n\nTexto 2"); + } + + #[test] + fn test_list_isolation_scenario() { + // Test the complete scenario from zap.md example + let full_text = r#"Olá! 😊 + +Infelizmente, não tenho a informação específica sobre o horário de funcionamento da Escola Salesiana no momento. + +Para obter essa informação, você pode: +1. *Entrar em contato com a secretaria* - Posso te ajudar +2. *Agendar uma visita* - Assim você conhece a escola + +Gostaria que eu te ajudasse?"#; + + // Step 1: Split text before list + let (text_before, rest) = split_text_before_list(full_text); + assert!(text_before.contains("Olá!")); + assert!(text_before.contains("Para obter essa informação, você pode:")); + assert!(rest.starts_with("1. *Entrar em contato")); + + // Step 2: Split list from text after + let (list, text_after) = split_list_from_text(&rest); + assert!(list.starts_with("1. *Entrar em contato")); + assert!(list.contains("2. *Agendar uma visita")); + assert!(text_after.contains("Gostaria que eu te ajudasse?")); + } + + #[test] + fn test_partial_list_streaming() { + // Simulate streaming chunks arriving + let chunks = vec![ + "Aulas Disponíveis:\n\n", + "1. Aula de Violão\n\n", + "2. Aula de Piano\n\n", + "3. Aula de Canto\n\n", + ]; + + let mut buffer = String::new(); + for (i, chunk) in chunks.iter().enumerate() { + buffer.push_str(chunk); + + // After first chunk, should detect potential list start + if i == 0 { + assert!(looks_like_list_start(&buffer) || contains_list(&buffer), + "After chunk 0, should detect list start or contain list"); + } + + // After second chunk onwards, should detect list + if i >= 1 { + assert!(contains_list(&buffer), + "After chunk {}, should detect list", i); + } + } + } }