fix: robust HTML stripping for XLSX to remove CSS garbage and decode entities
All checks were successful
BotServer CI / build (push) Successful in 46s
All checks were successful
BotServer CI / build (push) Successful in 46s
This commit is contained in:
parent
2cf30ac388
commit
5c54f7bc72
2 changed files with 26 additions and 6 deletions
|
|
@ -344,9 +344,24 @@ Ok(result)
|
||||||
|
|
||||||
/// Convert an HTML fragment into plain text.
///
/// The input is processed in four steps:
///
/// 1. `<style>…</style>` and `<script>…</script>` elements are dropped together
///    with their content (case-insensitive, attributes allowed). An element
///    without a closing tag is left in place so no trailing text is lost.
/// 2. Every remaining `<…>` tag is removed; a `<` with no matching `>` is kept
///    as literal text.
/// 3. The entities `&nbsp;`, `&lt;`, `&gt;`, `&quot;` and `&amp;` are decoded.
///    `&amp;` is handled last so that escaped text such as `&amp;lt;` decodes to
///    the literal `&lt;` instead of being decoded twice.
/// 4. Runs of whitespace collapse to a single space and the result is trimmed.
///
/// This is a small hand-written scanner rather than a set of regular
/// expressions: the earlier `(?is)<(style|script)[^>]*>.*?</\1>` pattern relied
/// on a backreference, which the `regex` crate does not support, so it never
/// compiled and the fallback pattern silently removed nothing. Scanning also
/// avoids recompiling patterns on every call.
///
/// Returns the cleaned text; an input with no visible text yields an empty
/// string.
fn strip_html_tags(s: &str) -> String {
    // Elements whose content is never user-visible text.
    const RAW_TEXT_ELEMENTS: [&str; 2] = ["style", "script"];

    // If a complete `<style>`/`<script>` element starts at byte `open` of
    // `lower` (an ASCII-lowercased copy of the input), return the byte offset
    // just past its closing tag.
    fn raw_text_element_end(lower: &str, open: usize) -> Option<usize> {
        let after_lt = &lower[open + 1..];
        let name = RAW_TEXT_ELEMENTS.iter().copied().find(|name| {
            after_lt.strip_prefix(*name).map_or(false, |rest| {
                // Require a delimiter so `<styles>` is not mistaken for `<style>`.
                rest.starts_with(|c: char| c == '>' || c == '/' || c.is_ascii_whitespace())
            })
        })?;

        // Skip the attributes of the opening tag.
        let name_end = open + 1 + name.len();
        let body_start = name_end + lower[name_end..].find('>')? + 1;

        // Look for `</name` followed by optional whitespace and `>`.
        let closing = format!("</{}", name);
        let mut from = body_start;
        loop {
            let close = from + lower[from..].find(closing.as_str())?;
            let after_name = close + closing.len();
            let tail = lower[after_name..].trim_start_matches(|c: char| c.is_ascii_whitespace());
            if tail.starts_with('>') {
                return Some(lower.len() - tail.len() + 1);
            }
            from = after_name;
        }
    }

    // Remove every complete `<style>`/`<script>` element, content included.
    fn remove_raw_text_elements(input: &str) -> String {
        // ASCII lowercasing changes no byte lengths, so offsets found in
        // `lower` are valid for slicing `input`.
        let lower = input.to_ascii_lowercase();
        let mut out = String::with_capacity(input.len());
        let mut copied_to = 0;
        let mut search_from = 0;
        while let Some(rel) = lower[search_from..].find('<') {
            let open = search_from + rel;
            search_from = open + 1;
            if let Some(end) = raw_text_element_end(&lower, open) {
                out.push_str(&input[copied_to..open]);
                copied_to = end;
                search_from = end;
            }
        }
        out.push_str(&input[copied_to..]);
        out
    }

    // Remove every `<…>` span; an unterminated `<` stays as literal text.
    fn remove_tags(input: &str) -> String {
        let mut out = String::with_capacity(input.len());
        let mut rest = input;
        while let Some(open) = rest.find('<') {
            match rest[open..].find('>') {
                Some(len) => {
                    out.push_str(&rest[..open]);
                    rest = &rest[open + len + 1..];
                }
                None => break,
            }
        }
        out.push_str(rest);
        out
    }

    // 1. + 2. Strip non-text elements first, then all remaining markup.
    let without_blocks = remove_raw_text_elements(s);
    let without_tags = remove_tags(&without_blocks);

    // 3. Decode common entities; `&amp;` must be last to avoid double decoding.
    let decoded = without_tags
        .replace("&nbsp;", " ")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&amp;", "&");

    // 4. Collapse whitespace runs (including newlines) and trim both ends.
    decoded.split_whitespace().collect::<Vec<_>>().join(" ")
}
|
||||||
|
|
||||||
#[cfg(feature = "kb-extraction")]
|
#[cfg(feature = "kb-extraction")]
|
||||||
|
|
|
||||||
|
|
@ -697,9 +697,14 @@ async fn extract_xlsx_text(file_path: &Path) -> Result<String> {
|
||||||
calamine::Data::String(s)
|
calamine::Data::String(s)
|
||||||
| calamine::Data::DateTimeIso(s)
|
| calamine::Data::DateTimeIso(s)
|
||||||
| calamine::Data::DurationIso(s) => {
|
| calamine::Data::DurationIso(s) => {
|
||||||
// Remove HTML tags from cell text (Calamine formatting artifacts)
|
// Remove HTML tags and formatting artifacts
|
||||||
let re = regex::Regex::new(r"<[^>]*>").unwrap_or_else(|_| regex::Regex::new(r"x{0}").unwrap());
|
let re_style = regex::Regex::new(r"(?is)<(style|script)[^>]*>.*?</\1>").unwrap_or_else(|_| regex::Regex::new(r"x{0}").unwrap());
|
||||||
re.replace_all(s, "").to_string()
|
let s1 = re_style.replace_all(s, "");
|
||||||
|
let re_tags = regex::Regex::new(r"<[^>]*>").unwrap_or_else(|_| regex::Regex::new(r"x{0}").unwrap());
|
||||||
|
let s2 = re_tags.replace_all(&s1, "");
|
||||||
|
let s3 = s2.replace("&nbsp;", " ").replace("&amp;", "&").replace("&lt;", "<").replace("&gt;", ">").replace("&quot;", "\"");
|
||||||
|
let re_spaces = regex::Regex::new(r"\s+").unwrap_or_else(|_| regex::Regex::new(r"x{0}").unwrap());
|
||||||
|
re_spaces.replace_all(&s3, " ").to_string().trim().to_string()
|
||||||
},
|
},
|
||||||
calamine::Data::Float(f) => f.to_string(),
|
calamine::Data::Float(f) => f.to_string(),
|
||||||
calamine::Data::Int(i) => i.to_string(),
|
calamine::Data::Int(i) => i.to_string(),
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue