From 5c54f7bc72e6700cb7bc8104064f2135019a0d34 Mon Sep 17 00:00:00 2001 From: Rodrigo Rodriguez Date: Thu, 30 Apr 2026 16:10:36 -0300 Subject: [PATCH] fix: robust HTML stripping for XLSX to remove CSS garbage and decode entities --- .../src/core/kb/document_processor/mod.rs | 21 ++++++++++++++++--- botserver/src/drive/vectordb.rs | 11 +++++++--- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/botserver/src/core/kb/document_processor/mod.rs b/botserver/src/core/kb/document_processor/mod.rs index 84e030c3..c2198d7e 100644 --- a/botserver/src/core/kb/document_processor/mod.rs +++ b/botserver/src/core/kb/document_processor/mod.rs @@ -344,9 +344,24 @@ Ok(result) /// Remove HTML tags from string fn strip_html_tags(s: &str) -> String { -// Remove HTML tags: , , -let re = regex::Regex::new(r"<[^>]*>").unwrap(); -re.replace_all(s, "").to_string() + // 1. Remove