fix: robust HTML stripping for XLSX to remove CSS garbage and decode entities
All checks were successful
BotServer CI / build (push) Successful in 46s

This commit is contained in:
Rodrigo Rodriguez 2026-04-30 16:10:36 -03:00
parent 2cf30ac388
commit 5c54f7bc72
2 changed files with 26 additions and 6 deletions

View file

@ -344,9 +344,24 @@ Ok(result)
/// Convert an HTML fragment to plain text.
///
/// Steps, in order:
/// 1. `<style>` and `<script>` elements are removed together with their
///    content (tag names are matched ASCII case-insensitively). An element
///    without a closing tag is left in place, so only its tags are stripped
///    by the next step.
/// 2. Every remaining `<...>` tag is removed. A `<` with no later `>` is kept
///    as literal text.
/// 3. The entities `&nbsp;`, `&lt;`, `&gt;`, `&quot;` and `&amp;` are decoded.
/// 4. Runs of whitespace are collapsed to a single space and the result is
///    trimmed.
///
/// Implemented with plain string scanning: the `regex` crate has no
/// backreferences, so a `</\1>`-style pattern cannot be used to pair an
/// opening tag with its closing tag.
fn strip_html_tags(s: &str) -> String {
    /// If a complete `<style>`/`<script>` element starts at byte `lt` of
    /// `lower` (an ASCII-lowercased copy of the input), return the byte
    /// offset just past its closing tag.
    fn raw_text_block_end(lower: &str, lt: usize) -> Option<usize> {
        let after_lt = &lower[lt + 1..];
        for name in ["style", "script"].iter() {
            let after_name = match after_lt.strip_prefix(*name) {
                Some(rest) => rest,
                None => continue,
            };
            // Require a tag-name boundary so e.g. `<styles>` is not treated as `<style>`.
            if !after_name.starts_with(|c: char| c == '>' || c == '/' || c.is_whitespace()) {
                return None;
            }
            let body = &after_name[after_name.find('>')? + 1..];
            let closer = format!("</{}", name);
            let after_closer = &body[body.find(closer.as_str())? + closer.len()..];
            // Tolerates whitespace before the final `>` of the closing tag.
            let tail = &after_closer[after_closer.find('>')? + 1..];
            return Some(lower.len() - tail.len());
        }
        None
    }

    /// Drop every complete `<style>`/`<script>` element, content included.
    fn strip_raw_text_blocks(html: &str) -> String {
        // ASCII lowercasing never changes byte lengths, so offsets found in
        // `lower` are valid indices into `html`.
        let lower = html.to_ascii_lowercase();
        let mut out = String::with_capacity(html.len());
        let mut pos = 0;
        while let Some(rel) = lower[pos..].find('<') {
            let lt = pos + rel;
            match raw_text_block_end(&lower, lt) {
                Some(end) => {
                    out.push_str(&html[pos..lt]);
                    pos = end;
                }
                None => {
                    // Not a raw-text element: keep the `<` and continue after it.
                    out.push_str(&html[pos..=lt]);
                    pos = lt + 1;
                }
            }
        }
        out.push_str(&html[pos..]);
        out
    }

    /// Remove every `<...>` span (from a `<` to the next `>`).
    fn strip_markup_tags(text: &str) -> String {
        let mut out = String::with_capacity(text.len());
        let mut rest = text;
        while let Some(open) = rest.find('<') {
            match rest[open..].find('>') {
                Some(len) => {
                    out.push_str(&rest[..open]);
                    rest = &rest[open + len + 1..];
                }
                // No `>` anywhere after this `<`: nothing further can be a tag.
                None => break,
            }
        }
        out.push_str(rest);
        out
    }

    let visible = strip_markup_tags(&strip_raw_text_blocks(s));

    // `&amp;` is decoded last so that escaped entities such as `&amp;lt;`
    // yield the literal text `&lt;` instead of being decoded twice.
    let decoded = visible
        .replace("&nbsp;", " ")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&amp;", "&");

    // Splitting on whitespace and re-joining both collapses runs and trims the ends.
    decoded.split_whitespace().collect::<Vec<_>>().join(" ")
}
#[cfg(feature = "kb-extraction")]

View file

@ -697,9 +697,14 @@ async fn extract_xlsx_text(file_path: &Path) -> Result<String> {
calamine::Data::String(s)
| calamine::Data::DateTimeIso(s)
| calamine::Data::DurationIso(s) => {
// Remove HTML tags from cell text (Calamine formatting artifacts)
let re = regex::Regex::new(r"<[^>]*>").unwrap_or_else(|_| regex::Regex::new(r"x{0}").unwrap());
re.replace_all(s, "").to_string()
// Remove HTML tags and formatting artifacts
let re_style = regex::Regex::new(r"(?is)<(style|script)[^>]*>.*?</\1>").unwrap_or_else(|_| regex::Regex::new(r"x{0}").unwrap());
let s1 = re_style.replace_all(s, "");
let re_tags = regex::Regex::new(r"<[^>]*>").unwrap_or_else(|_| regex::Regex::new(r"x{0}").unwrap());
let s2 = re_tags.replace_all(&s1, "");
let s3 = s2.replace("&nbsp;", " ").replace("&amp;", "&").replace("&lt;", "<").replace("&gt;", ">").replace("&quot;", "\"");
let re_spaces = regex::Regex::new(r"\s+").unwrap_or_else(|_| regex::Regex::new(r"x{0}").unwrap());
re_spaces.replace_all(&s3, " ").to_string().trim().to_string()
},
calamine::Data::Float(f) => f.to_string(),
calamine::Data::Int(i) => i.to_string(),