fix: robust HTML stripping for XLSX to remove CSS garbage and decode entities
All checks were successful
BotServer CI / build (push) Successful in 46s
All checks were successful
BotServer CI / build (push) Successful in 46s
This commit is contained in:
parent
2cf30ac388
commit
5c54f7bc72
2 changed files with 26 additions and 6 deletions
|
|
@ -344,9 +344,24 @@ Ok(result)
|
|||
|
||||
/// Remove HTML tags from string
|
||||
fn strip_html_tags(s: &str) -> String {
|
||||
// Remove HTML tags: <tag>, <tag attr="value">, </tag>
|
||||
let re = regex::Regex::new(r"<[^>]*>").unwrap();
|
||||
re.replace_all(s, "").to_string()
|
||||
// 1. Remove <style> and <script> blocks entirely
|
||||
let re_style = regex::Regex::new(r"(?is)<(style|script)[^>]*>.*?</\1>").unwrap_or_else(|_| regex::Regex::new(r"x{0}").unwrap());
|
||||
let s1 = re_style.replace_all(s, "");
|
||||
|
||||
// 2. Remove all other HTML tags
|
||||
let re_tags = regex::Regex::new(r"<[^>]*>").unwrap_or_else(|_| regex::Regex::new(r"x{0}").unwrap());
|
||||
let s2 = re_tags.replace_all(&s1, "");
|
||||
|
||||
// 3. Replace common entities
|
||||
let s3 = s2.replace(" ", " ")
|
||||
.replace("&", "&")
|
||||
.replace("<", "<")
|
||||
.replace(">", ">")
|
||||
.replace(""", "\"");
|
||||
|
||||
// 4. Collapse multiple spaces/newlines
|
||||
let re_spaces = regex::Regex::new(r"\s+").unwrap_or_else(|_| regex::Regex::new(r"x{0}").unwrap());
|
||||
re_spaces.replace_all(&s3, " ").to_string().trim().to_string()
|
||||
}
|
||||
|
||||
#[cfg(feature = "kb-extraction")]
|
||||
|
|
|
|||
|
|
@ -697,9 +697,14 @@ async fn extract_xlsx_text(file_path: &Path) -> Result<String> {
|
|||
calamine::Data::String(s)
|
||||
| calamine::Data::DateTimeIso(s)
|
||||
| calamine::Data::DurationIso(s) => {
|
||||
// Remove HTML tags from cell text (Calamine formatting artifacts)
|
||||
let re = regex::Regex::new(r"<[^>]*>").unwrap_or_else(|_| regex::Regex::new(r"x{0}").unwrap());
|
||||
re.replace_all(s, "").to_string()
|
||||
// Remove HTML tags and formatting artifacts
|
||||
let re_style = regex::Regex::new(r"(?is)<(style|script)[^>]*>.*?</\1>").unwrap_or_else(|_| regex::Regex::new(r"x{0}").unwrap());
|
||||
let s1 = re_style.replace_all(s, "");
|
||||
let re_tags = regex::Regex::new(r"<[^>]*>").unwrap_or_else(|_| regex::Regex::new(r"x{0}").unwrap());
|
||||
let s2 = re_tags.replace_all(&s1, "");
|
||||
let s3 = s2.replace("&nbsp;", " ").replace("&amp;", "&").replace("&lt;", "<").replace("&gt;", ">").replace("&quot;", "\"");
|
||||
let re_spaces = regex::Regex::new(r"\s+").unwrap_or_else(|_| regex::Regex::new(r"x{0}").unwrap());
|
||||
re_spaces.replace_all(&s3, " ").to_string().trim().to_string()
|
||||
},
|
||||
calamine::Data::Float(f) => f.to_string(),
|
||||
calamine::Data::Int(i) => i.to_string(),
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue