diff --git a/Cargo.lock b/Cargo.lock index d999385aa..323994b11 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -508,20 +508,6 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" -[[package]] -name = "auto_generate_cdp" -version = "0.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6e1961a0d5d77969057eba90d448e610d3c439024d135d9dbd98e33ec973520" -dependencies = [ - "convert_case", - "proc-macro2", - "quote", - "serde", - "serde_json", - "ureq 2.12.1", -] - [[package]] name = "autocfg" version = "1.5.0" @@ -1127,7 +1113,6 @@ dependencies = [ "env_logger", "futures", "futures-util", - "headless_chrome", "hmac", "imap", "include_dir", @@ -1146,7 +1131,6 @@ dependencies = [ "regex", "reqwest", "rhai", - "scraper", "serde", "serde_json", "sha2", @@ -1158,7 +1142,7 @@ dependencies = [ "tokio-stream", "tracing", "tracing-subscriber", - "ureq 3.1.2", + "ureq", "urlencoding", "uuid", "zip 2.4.2", @@ -1632,29 +1616,6 @@ dependencies = [ "typenum", ] -[[package]] -name = "cssparser" -version = "0.31.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b3df4f93e5fbbe73ec01ec8d3f68bba73107993a5b1e7519273c32db9b0d5be" -dependencies = [ - "cssparser-macros", - "dtoa-short", - "itoa", - "phf 0.11.3", - "smallvec", -] - -[[package]] -name = "cssparser-macros" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" -dependencies = [ - "quote", - "syn", -] - [[package]] name = "csv" version = "1.4.0" @@ -2024,21 +1985,6 @@ dependencies = [ "syn", ] -[[package]] -name = "dtoa" -version = "1.0.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6add3b8cff394282be81f3fc1a0605db594ed69890078ca6e2cab1c408bcf04" - -[[package]] -name = "dtoa-short" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87" -dependencies = [ - "dtoa", -] - [[package]] name = "dunce" version = "1.0.5" @@ -2066,12 +2012,6 @@ dependencies = [ "signature", ] -[[package]] -name = "ego-tree" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12a0bb14ac04a9fcf170d0bbbef949b44cc492f4452bd20c095636956f653642" - [[package]] name = "either" version = "1.15.0" @@ -2139,12 +2079,6 @@ dependencies = [ "regex", ] -[[package]] -name = "env_home" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7f84e12ccf0a7ddc17a6c41c93326024c42920d7ee630d04950e6926645c0fe" - [[package]] name = "env_logger" version = "0.11.8" @@ -2279,16 +2213,6 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" -[[package]] -name = "futf" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" -dependencies = [ - "mac", - "new_debug_unreachable", -] - [[package]] name = "futures" version = "0.3.31" @@ -2378,15 +2302,6 @@ dependencies = [ "slab", ] -[[package]] -name = "fxhash" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" -dependencies = [ - "byteorder", -] - [[package]] name = "generic-array" version = "0.14.9" @@ -2397,15 +2312,6 @@ dependencies = [ "version_check", ] -[[package]] -name = "getopts" -version = "0.2.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfe4fbac503b8d1f88e6676011885f34b7174f46e59956bba534ba83abded4df" -dependencies = [ - "unicode-width", -] - [[package]] name = "getrandom" version = "0.2.16" @@ -2531,29 +2437,6 @@ version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d" -[[package]] -name = "headless_chrome" -version = "1.0.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f77a421a200d6314c8830919715d8452320c16e06b37686b13a9942f799dbf9b" -dependencies = [ - "anyhow", - "auto_generate_cdp", - "base64 0.22.1", - "derive_builder", - "log", - "rand 0.9.2", - "regex", - "serde", - "serde_json", - "tempfile", - "thiserror 2.0.17", - "tungstenite 0.27.0", - "url", - "which", - "winreg", -] - [[package]] name = "heck" version = "0.4.1" @@ -2592,20 +2475,6 @@ dependencies = [ "windows-link 0.1.3", ] -[[package]] -name = "html5ever" -version = "0.27.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c13771afe0e6e846f1e67d038d4cb29998a6779f93c809212e4e9c32efd244d4" -dependencies = [ - "log", - "mac", - "markup5ever", - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "http" version = "0.2.12" @@ -2752,7 +2621,7 @@ dependencies = [ "tokio", "tokio-rustls 0.26.4", "tower-service", - "webpki-roots 1.0.3", + "webpki-roots", ] [[package]] @@ -3464,12 +3333,6 @@ dependencies = [ "pkg-config", ] -[[package]] -name = "mac" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" - [[package]] name = "mailparse" version = "0.15.0" @@ -3481,20 +3344,6 @@ dependencies = [ "quoted_printable", ] -[[package]] -name = "markup5ever" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16ce3abbeba692c8b8441d036ef91aea6df8da2c6b6e21c7e14d3c18e526be45" -dependencies = [ - "log", - "phf 0.11.3", - "phf_codegen 0.11.3", - "string_cache", - "string_cache_codegen", - "tendril", -] - [[package]] name = "matchers" version = "0.2.0" @@ -3607,12 +3456,6 @@ dependencies = [ "tempfile", ] -[[package]] -name = "new_debug_unreachable" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" - [[package]] name = "nom" version = "7.1.3" @@ -3982,96 +3825,6 @@ dependencies = [ "indexmap 2.12.0", ] -[[package]] -name = "phf" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" -dependencies = [ - "phf_shared 0.10.0", -] - -[[package]] -name = "phf" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" -dependencies = [ - "phf_macros", - "phf_shared 0.11.3", -] - -[[package]] -name = "phf_codegen" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd" -dependencies = [ - "phf_generator 0.10.0", - "phf_shared 0.10.0", -] - -[[package]] -name = "phf_codegen" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" -dependencies = [ - "phf_generator 0.11.3", - "phf_shared 0.11.3", -] - -[[package]] -name = "phf_generator" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" -dependencies = [ - "phf_shared 0.10.0", - "rand 0.8.5", -] - -[[package]] -name = "phf_generator" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" -dependencies = [ - "phf_shared 0.11.3", - "rand 0.8.5", -] - -[[package]] -name = "phf_macros" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216" -dependencies = [ - "phf_generator 0.11.3", - "phf_shared 0.11.3", - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "phf_shared" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" -dependencies = [ - "siphasher 0.3.11", -] - -[[package]] -name = "phf_shared" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" -dependencies = [ - "siphasher 1.0.1", -] - [[package]] name = "pin-project" version = "1.1.10" @@ -4194,12 +3947,6 @@ dependencies = [ "vcpkg", ] -[[package]] -name = "precomputed-hash" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" - [[package]] name = "prettyplease" version = "0.2.37" @@ -4603,7 +4350,7 @@ dependencies = [ "wasm-bindgen-futures", "wasm-streams", "web-sys", - "webpki-roots 1.0.3", + "webpki-roots", ] [[package]] @@ -4824,22 +4571,6 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" -[[package]] -name = "scraper" -version = "0.20.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b90460b31bfe1fc07be8262e42c665ad97118d4585869de9345a84d501a9eaf0" -dependencies = [ - "ahash", - "cssparser", - "ego-tree", - "getopts", - "html5ever", - "once_cell", - "selectors", - "tendril", -] - [[package]] name = "scratch" version = "1.0.9" @@ -4906,25 +4637,6 @@ dependencies = [ "libc", ] -[[package]] -name = "selectors" -version = "0.25.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4eb30575f3638fc8f6815f448d50cb1a2e255b0897985c8c59f4d37b72a07b06" -dependencies = [ - "bitflags", - "cssparser", - "derive_more 0.99.20", - "fxhash", - "log", - "new_debug_unreachable", - "phf 0.10.1", - "phf_codegen 0.10.0", - "precomputed-hash", - "servo_arc", - "smallvec", -] - [[package]] name = "semver" version = "1.0.27" @@ -4995,15 +4707,6 @@ dependencies = [ "serde", ] -[[package]] -name = "servo_arc" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d036d71a959e00c77a63538b90a6c2390969f9772b096ea837205c6bd0491a44" -dependencies = [ - "stable_deref_trait", -] - [[package]] name = "sha1" version = "0.10.6" @@ -5078,18 +4781,6 @@ version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbbb5d9659141646ae647b42fe094daf6c6192d1620870b449d9557f748b2daa" -[[package]] -name = "siphasher" -version = "0.3.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" - -[[package]] -name = "siphasher" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" - [[package]] name = "slab" version = "0.4.11" @@ -5133,17 +4824,6 @@ dependencies = [ "windows-sys 0.60.2", ] -[[package]] -name = "socks" -version = "0.3.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0c3dbbd9ae980613c6dd8e28a9407b50509d3803b57624d5dfe8315218cd58b" -dependencies = [ - "byteorder", - "libc", - "winapi", -] - [[package]] name = "spki" version = "0.6.0" @@ -5179,31 +4859,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" -[[package]] -name = "string_cache" -version = "0.8.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f" -dependencies = [ - "new_debug_unreachable", - "parking_lot", - "phf_shared 0.11.3", - "precomputed-hash", - "serde", -] - -[[package]] -name = "string_cache_codegen" -version = "0.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0" -dependencies = [ - "phf_generator 0.11.3", - "phf_shared 0.11.3", - "proc-macro2", - "quote", -] - [[package]] name = "stringprep" version = "0.1.5" @@ -5306,17 +4961,6 @@ dependencies = [ "windows-sys 0.61.2", ] -[[package]] -name = "tendril" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" -dependencies = [ - "futf", - "mac", - "utf-8", -] - [[package]] name = "termcolor" version = "1.4.1" @@ -5546,7 +5190,7 @@ dependencies = [ "futures-util", "log", "tokio", - "tungstenite 0.20.1", + "tungstenite", ] [[package]] @@ -5754,23 +5398,6 @@ dependencies = [ "utf-8", ] -[[package]] -name = "tungstenite" -version = "0.27.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eadc29d668c91fcc564941132e17b28a7ceb2f3ebf0b9dae3e03fd7a6748eb0d" -dependencies = [ - "bytes", - "data-encoding", - "http 1.3.1", - "httparse", - "log", - "rand 0.9.2", - "sha1", - "thiserror 2.0.17", - "utf-8", -] - [[package]] name = "type1-encoding-parser" version = "0.1.0" @@ -5847,23 +5474,6 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" -[[package]] -name = "ureq" -version = "2.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02d1a66277ed75f640d608235660df48c8e3c19f3b4edb6a263315626cc3c01d" -dependencies = [ - "base64 0.22.1", - "flate2", - "log", - "once_cell", - "rustls 0.23.34", - "rustls-pki-types", - "socks", - "url", - "webpki-roots 0.26.11", -] - [[package]] name = "ureq" version = "3.1.2" @@ -5879,7 +5489,7 @@ dependencies = [ "rustls-pki-types", "ureq-proto", "utf-8", - "webpki-roots 1.0.3", + "webpki-roots", ] [[package]] @@ -6091,15 +5701,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "webpki-roots" -version = "0.26.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "521bc38abb08001b01866da9f51eb7c5d647a19260e00054a8c7fd5f9e57f7a9" -dependencies = [ - "webpki-roots 1.0.3", -] - [[package]] name = "webpki-roots" version = "1.0.3" @@ -6145,17 +5746,6 @@ version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a751b3277700db47d3e574514de2eced5e54dc8a5436a3bf7a0b248b2cee16f3" -[[package]] -name = "which" -version = "8.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3fabb953106c3c8eea8306e4393700d7657561cb43122571b172bbfb7c7ba1d" -dependencies = [ - "env_home", - "rustix", - "winsafe", -] - [[package]] name = "winapi" version = "0.3.9" @@ -6577,22 +6167,6 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" -[[package]] -name = "winreg" -version = "0.55.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb5a765337c50e9ec252c2069be9bf91c7df47afb103b642ba3a53bf8101be97" -dependencies = [ - "cfg-if", - "windows-sys 0.59.0", -] - -[[package]] -name = "winsafe" -version = "0.0.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d135d17ab770252ad95e9a872d365cf3090e3be864a34ab46f48555993efc904" - [[package]] name = "wit-bindgen" version = "0.46.0" diff --git a/Cargo.toml b/Cargo.toml index e69303969..c52c76d4e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -40,7 +40,6 @@ repository = "https://github.com/GeneralBots/BotServer" default = [ "vectordb"] vectordb = ["qdrant-client"] email = ["imap"] -web_automation = ["headless_chrome"] desktop = [] [dependencies] @@ -65,7 +64,6 @@ downloader = "0.2" env_logger = "0.11" futures = "0.3" futures-util = "0.3" -headless_chrome = { version = "1.0.18", optional = true } hmac = "0.12.1" imap = { version = "3.0.0-alpha.15", optional = true } include_dir = "0.7" @@ -84,7 +82,6 @@ redis = { version = "0.27", features = ["tokio-comp"] } regex = "1.11" reqwest = { version = "0.12", features = ["json", "stream"] } rhai = { git = "https://github.com/therealprof/rhai.git", branch = "features/use-web-time" } -scraper = "0.20" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" sha2 = "0.10.9" diff --git a/src/basic/keywords/add_website.rs b/src/basic/keywords/add_website.rs index 47867c50b..f74885c27 100644 --- a/src/basic/keywords/add_website.rs +++ b/src/basic/keywords/add_website.rs @@ -1,7 +1,5 @@ use crate::shared::models::UserSession; use crate::shared::state::AppState; -#[cfg(feature = "web_automation")] -use crate::web_automation::WebCrawler; use log::{error, info}; use rhai::{Dynamic, Engine}; use std::sync::Arc; @@ -21,9 +19,6 @@ pub fn add_website_keyword(state: Arc, user: UserSession, engine: &mut ); // Validate URL - #[cfg(feature = "web_automation")] - let is_valid = WebCrawler::is_valid_url(&url_str); - #[cfg(not(feature = "web_automation"))] let is_valid = url_str.starts_with("http://") || url_str.starts_with("https://"); if !is_valid { @@ -92,74 +87,5 @@ async fn crawl_and_index_website( url: &str, ) -> Result { info!("Crawling website: {} for user: {}", url, user.user_id); - - // Check if web_automation feature is enabled - #[cfg(not(feature = "web_automation"))] - { - return Err( - "Web automation feature not enabled. Recompile with --features web_automation" - .to_string(), - ); - } - - // Fetch website content (only compiled if feature enabled) - #[cfg(feature = "web_automation")] - { - let crawler = WebCrawler::new(); - let text_content = crawler - .crawl(url) - .await - .map_err(|e| format!("Failed to crawl website: {}", e))?; - - if text_content.trim().is_empty() { - return Err("No text content found on website".to_string()); - } - - info!( - "Extracted {} characters of text from website", - text_content.len() - ); - - // Create KB name from URL - let kb_name = format!( - "website_{}", - url.replace("https://", "") - .replace("http://", "") - .replace('/', "_") - .replace('.', "_") - .chars() - .take(50) - .collect::() - ); - - // Create collection name for this user's website KB - let collection_name = format!("kb_{}_{}_{}", user.bot_id, user.user_id, kb_name); - - // Ensure collection exists in Qdrant - crate::kb::qdrant_client::ensure_collection_exists(_state, &collection_name) - .await - .map_err(|e| format!("Failed to create Qdrant collection: {}", e))?; - - // Index the content - crate::kb::embeddings::index_document(_state, &collection_name, url, &text_content) - .await - .map_err(|e| format!("Failed to index document: {}", e))?; - - // Associate KB with user (not session) - add_website_kb_to_user(_state, user, &kb_name, url) - .await - .map_err(|e| format!("Failed to associate KB with user: {}", e))?; - - info!( - "Website indexed successfully to collection: {}", - collection_name - ); - - Ok(format!( - "Website '{}' crawled and indexed successfully ({} characters)", - url, - text_content.len() - )) - } + Err("Web automation functionality has been removed from this build".to_string()) } - diff --git a/src/basic/keywords/get_website.rs b/src/basic/keywords/get_website.rs deleted file mode 100644 index 66be3ea41..000000000 --- a/src/basic/keywords/get_website.rs +++ /dev/null @@ -1,135 +0,0 @@ -use crate::{shared::state::AppState, shared::models::UserSession, web_automation::BrowserPool}; -use headless_chrome::browser::tab::Tab; -use log::info; -use rhai::{Dynamic, Engine}; -use std::error::Error; -use std::sync::Arc; -use std::time::Duration; -use tokio::time::sleep; - -pub fn get_website_keyword(state: &AppState, user: UserSession, engine: &mut Engine) { - let browser_pool = state.browser_pool.clone(); - - engine - .register_custom_syntax( - &["WEBSITE", "OF", "$expr$"], - false, - move |context, inputs| { - let search_term = context.eval_expression_tree(&inputs[0])?.to_string(); - - info!("GET WEBSITE executed - Search: '{}'", search_term); - - let browser_pool_clone = browser_pool.clone(); - let fut = execute_headless_browser_search(browser_pool_clone, &search_term); - - let result = - tokio::task::block_in_place(|| tokio::runtime::Handle::current().block_on(fut)) - .map_err(|e| format!("Headless browser search failed: {}", e))?; - - Ok(Dynamic::from(result)) - }, - ) - .unwrap(); -} - -pub async fn execute_headless_browser_search( - browser_pool: Arc, - search_term: &str, -) -> Result> { - info!("Starting headless browser search: '{}' ", search_term); - - let term = search_term.to_string(); - - let result = browser_pool - .with_browser(move |tab| { - let term = term.clone(); - Box::pin(async move { perform_search(tab, &term).await }) - }) - .await?; - - Ok(result) -} - -async fn perform_search( - tab: Arc, - search_term: &str, -) -> Result> { - tab.navigate_to("https://duckduckgo.com") - .map_err(|e| format!("Failed to navigate: {}", e))?; - - tab.wait_for_element("#searchbox_input") - .map_err(|e| format!("Failed to find search box: {}", e))?; - - let search_input = tab - .find_element("#searchbox_input") - .map_err(|e| format!("Failed to find search input: {}", e))?; - - search_input - .click() - .map_err(|e| format!("Failed to click search input: {}", e))?; - - search_input - .type_into(search_term) - .map_err(|e| format!("Failed to type into search input: {}", e))?; - - search_input - .press_key("Enter") - .map_err(|e| format!("Failed to press Enter: {}", e))?; - - sleep(Duration::from_millis(3000)).await; - - let _ = tab.wait_for_element("[data-testid='result']"); - - let results = extract_search_results(&tab).await?; - - if !results.is_empty() { - Ok(results[0].clone()) - } else { - Ok("No results found".to_string()) - } -} - -async fn extract_search_results( - tab: &Arc, -) -> Result, Box> { - let mut results = Vec::new(); - - let selectors = [ - "a[data-testid='result-title-a']", - "a[data-testid='result-extras-url-link']", - "a.eVNpHGjtxRBq_gLOfGDr", - "a.Rn_JXVtoPVAFyGkcaXyK", - ".ikg2IXiCD14iVX7AdZo1 a", - ".OQ_6vPwNhCeusNiEDcGp a", - ".result__a", - "a.result-link", - ".result a[href]", - ]; - - for selector in &selectors { - if let Ok(elements) = tab.find_elements(selector) { - for element in elements { - if let Ok(Some(href)) = element.get_attribute_value("href") { - if href.starts_with("http") - && !href.contains("duckduckgo.com") - && !href.contains("duck.co") - && !results.contains(&href) - { - let display_text = element.get_inner_text().unwrap_or_default(); - - if !display_text.is_empty() && !display_text.contains("Ad") { - results.push(href); - } - } - } - } - if !results.is_empty() { - break; - } - } - } - - results.dedup(); - - Ok(results) -} diff --git a/src/basic/keywords/mod.rs b/src/basic/keywords/mod.rs index 0dbafddc7..3ad5bd360 100644 --- a/src/basic/keywords/mod.rs +++ b/src/basic/keywords/mod.rs @@ -24,6 +24,3 @@ pub mod set_context; #[cfg(feature = "email")] pub mod create_draft_keyword; - -#[cfg(feature = "web_automation")] -pub mod get_website; diff --git a/src/basic/mod.rs b/src/basic/mod.rs index 8b7902447..672ac63de 100644 --- a/src/basic/mod.rs +++ b/src/basic/mod.rs @@ -34,8 +34,6 @@ use self::keywords::add_suggestion::add_suggestion_keyword; #[cfg(feature = "email")] use self::keywords::create_draft_keyword; -#[cfg(feature = "web_automation")] -use self::keywords::get_website::get_website_keyword; pub struct ScriptService { pub engine: Engine, @@ -80,8 +78,6 @@ impl ScriptService { add_website_keyword(state.clone(), user.clone(), &mut engine); add_suggestion_keyword(state.clone(), user.clone(), &mut engine); - #[cfg(feature = "web_automation")] - get_website_keyword(&state, user.clone(), &mut engine); ScriptService { engine, diff --git a/src/lib.rs b/src/lib.rs index 087bc3296..2196c8e76 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -16,8 +16,6 @@ pub mod package_manager; pub mod session; pub mod shared; pub mod tests; -#[cfg(feature = "web_automation")] -pub mod web_automation; pub mod web_server; pub mod auth; pub mod nvidia; diff --git a/src/main.rs b/src/main.rs index 7ee9c6e04..0d1f2a54e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -28,8 +28,6 @@ mod package_manager; mod session; mod shared; pub mod tests; -#[cfg(feature = "web_automation")] -mod web_automation; mod web_server; mod nvidia; diff --git a/src/web_automation/crawler.rs b/src/web_automation/crawler.rs deleted file mode 100644 index 17b94e9dc..000000000 --- a/src/web_automation/crawler.rs +++ /dev/null @@ -1,227 +0,0 @@ -use log::{debug, error, info}; -use reqwest::Client; -use scraper::{Html, Selector}; -use std::error::Error; -use std::time::Duration; - -/// Web crawler for extracting content from web pages -pub struct WebCrawler { - client: Client, -} - -impl WebCrawler { - /// Create a new web crawler - pub fn new() -> Self { - let client = Client::builder() - .timeout(Duration::from_secs(30)) - .connect_timeout(Duration::from_secs(10)) - .user_agent("Mozilla/5.0 (compatible; GeneralBots/1.0)") - .build() - .unwrap_or_else(|_| Client::new()); - - Self { client } - } - - /// Validate if string is a valid HTTP(S) URL - pub fn is_valid_url(url: &str) -> bool { - url.starts_with("http://") || url.starts_with("https://") - } - - /// Fetch website content via HTTP - pub async fn fetch_content(&self, url: &str) -> Result> { - debug!("Fetching website content from: {}", url); - - let response = self.client.get(url).send().await?; - - if !response.status().is_success() { - return Err(format!("HTTP request failed with status: {}", response.status()).into()); - } - - let content_type = response - .headers() - .get("content-type") - .and_then(|v| v.to_str().ok()) - .unwrap_or(""); - - if !content_type.contains("text/html") && !content_type.contains("application/xhtml") { - return Err(format!("URL does not return HTML content: {}", content_type).into()); - } - - let html_content = response.text().await?; - debug!("Fetched {} bytes of HTML content", html_content.len()); - - Ok(html_content) - } - - /// Extract readable text from HTML - pub fn extract_text_from_html( - &self, - html: &str, - ) -> Result> { - let document = Html::parse_document(html); - - let mut text_parts = Vec::new(); - - // Extract title - let title_selector = Selector::parse("title").unwrap(); - if let Some(title_element) = document.select(&title_selector).next() { - let title = title_element.text().collect::(); - if !title.trim().is_empty() { - text_parts.push(format!("Title: {}\n", title.trim())); - } - } - - // Extract meta description - let meta_selector = Selector::parse("meta[name='description']").unwrap(); - if let Some(meta) = document.select(&meta_selector).next() { - if let Some(description) = meta.value().attr("content") { - if !description.trim().is_empty() { - text_parts.push(format!("Description: {}\n", description.trim())); - } - } - } - - // Extract body content - let body_selector = Selector::parse("body").unwrap(); - if let Some(body) = document.select(&body_selector).next() { - self.extract_text_recursive(&body, &mut text_parts); - } else { - // Fallback: extract from entire document - for node in document.root_element().descendants() { - if let Some(text) = node.value().as_text() { - let cleaned = text.trim(); - if !cleaned.is_empty() { - text_parts.push(cleaned.to_string()); - } - } - } - } - - let combined_text = text_parts.join("\n"); - - // Clean up excessive whitespace - let cleaned = combined_text - .lines() - .map(|line| line.trim()) - .filter(|line| !line.is_empty()) - .collect::>() - .join("\n"); - - if cleaned.is_empty() { - return Err("Failed to extract text from HTML".into()); - } - - Ok(cleaned) - } - - /// Recursively extract text from HTML element tree - fn extract_text_recursive(&self, element: &scraper::ElementRef, text_parts: &mut Vec) { - // Skip excluded elements (script, style, etc.) - let excluded = ["script", "style", "noscript", "iframe", "svg"]; - if excluded.contains(&element.value().name()) { - return; - } - - for child in element.children() { - if let Some(text) = child.value().as_text() { - let cleaned = text.trim(); - if !cleaned.is_empty() { - text_parts.push(cleaned.to_string()); - } - } else if child.value().as_element().is_some() { - if let Some(child_ref) = scraper::ElementRef::wrap(child) { - self.extract_text_recursive(&child_ref, text_parts); - } - } - } - } - - /// Crawl a URL and return extracted text - pub async fn crawl(&self, url: &str) -> Result> { - info!("Crawling website: {}", url); - - if !Self::is_valid_url(url) { - return Err("Invalid URL format".into()); - } - - let html_content = self.fetch_content(url).await?; - let text_content = self.extract_text_from_html(&html_content)?; - - if text_content.trim().is_empty() { - return Err("No text content found on website".into()); - } - - info!( - "Successfully crawled website: {} ({} characters)", - url, - text_content.len() - ); - - Ok(text_content) - } -} - -impl Default for WebCrawler { - fn default() -> Self { - Self::new() - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_is_valid_url() { - assert!(WebCrawler::is_valid_url("https://example.com")); - assert!(WebCrawler::is_valid_url("http://example.com")); - assert!(WebCrawler::is_valid_url("https://example.com/path?query=1")); - - assert!(!WebCrawler::is_valid_url("ftp://example.com")); - assert!(!WebCrawler::is_valid_url("example.com")); - assert!(!WebCrawler::is_valid_url("//example.com")); - assert!(!WebCrawler::is_valid_url("file:///etc/passwd")); - } - - #[test] - fn test_extract_text_from_html() { - let crawler = WebCrawler::new(); - - let html = r#" - - - - Test Page - - - - - -

Welcome

-

This is a paragraph.

-
- Nested content -
- - - "#; - - let result = crawler.extract_text_from_html(html).unwrap(); - - assert!(result.contains("Title: Test Page")); - assert!(result.contains("Description: This is a test page")); - assert!(result.contains("Welcome")); - assert!(result.contains("This is a paragraph")); - assert!(result.contains("Nested content")); - assert!(!result.contains("console.log")); - assert!(!result.contains("color: red")); - } - - #[test] - fn test_extract_text_empty_html() { - let crawler = WebCrawler::new(); - let html = ""; - let result = crawler.extract_text_from_html(html); - assert!(result.is_err()); - } -} diff --git a/src/web_automation/mod.rs b/src/web_automation/mod.rs deleted file mode 100644 index 6cbf7ff87..000000000 --- a/src/web_automation/mod.rs +++ /dev/null @@ -1,229 +0,0 @@ -#[cfg(feature = "web_automation")] - -pub mod crawler; - -use headless_chrome::browser::tab::Tab; -use headless_chrome::{Browser, LaunchOptions}; -use std::env; -use std::error::Error; -use std::future::Future; -use std::path::PathBuf; -use std::pin::Pin; -use std::process::Command; -use std::sync::Arc; -use tokio::fs; -use tokio::sync::Semaphore; - -use crate::shared::utils::{download_file, extract_zip_recursive}; - -pub use crawler::WebCrawler; - -pub struct BrowserSetup { - pub brave_path: String, - pub chromedriver_path: String, -} - -pub struct BrowserPool { - browser: Browser, - semaphore: Semaphore, -} - -impl BrowserPool { - pub async fn new( - max_concurrent: usize, - brave_path: String, - ) -> Result> { - let options = LaunchOptions::default_builder() - .path(Some(PathBuf::from(brave_path))) - .args(vec![ - std::ffi::OsStr::new("--disable-gpu"), - std::ffi::OsStr::new("--no-sandbox"), - std::ffi::OsStr::new("--disable-dev-shm-usage"), - ]) - .build() - .map_err(|e| format!("Failed to build launch options: {}", e))?; - - let browser = - Browser::new(options).map_err(|e| format!("Failed to launch browser: {}", e))?; - - Ok(Self { - browser, - semaphore: Semaphore::new(max_concurrent), - }) - } - - pub async fn with_browser(&self, f: F) -> Result> - where - F: FnOnce( - Arc, - ) - -> Pin>> + Send>> - + Send - + 'static, - T: Send + 'static, - { - let _permit = self.semaphore.acquire().await?; - - let tab = self - .browser - .new_tab() - .map_err(|e| format!("Failed to create new tab: {}", e))?; - - let result = f(tab.clone()).await; - - // Close the tab when done - let _ = tab.close(true); - - result - } -} - -impl BrowserSetup { - pub async fn new() -> Result> { - let brave_path = Self::find_brave().await?; - let chromedriver_path = Self::setup_chromedriver().await?; - - Ok(Self { - brave_path, - chromedriver_path, - }) - } - - async fn find_brave() -> Result> { - let mut possible_paths = vec![ - String::from(r"C:\Program Files\BraveSoftware\Brave-Browser\Application\brave.exe"), - String::from("/Applications/Brave Browser.app/Contents/MacOS/Brave Browser"), - String::from("/usr/bin/brave-browser"), - String::from("/usr/bin/brave"), - ]; - - if let Ok(local_appdata) = env::var("LOCALAPPDATA") { - let mut path = PathBuf::from(local_appdata); - path.push("BraveSoftware\\Brave-Browser\\Application\\brave.exe"); - possible_paths.push(path.to_string_lossy().to_string()); - } - - for path in possible_paths { - if fs::metadata(&path).await.is_ok() { - return Ok(path); - } - } - - Err("Brave browser not found. Please install Brave first.".into()) - } - - async fn setup_chromedriver() -> Result> { - let mut chromedriver_dir = env::current_exe()?.parent().unwrap().to_path_buf(); - chromedriver_dir.push("chromedriver"); - - if !chromedriver_dir.exists() { - fs::create_dir(&chromedriver_dir).await?; - } - - let chromedriver_path = if cfg!(target_os = "windows") { - chromedriver_dir.join("chromedriver.exe") - } else { - chromedriver_dir.join("chromedriver") - }; - - if fs::metadata(&chromedriver_path).await.is_err() { - let (download_url, platform) = match (cfg!(target_os = "windows"), cfg!(target_arch = "x86_64")) { - (true, true) => ( - "https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.183/win64/chromedriver-win64.zip", - "win64", - ), - (true, false) => ( - "https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.183/win32/chromedriver-win32.zip", - "win32", - ), - (false, true) if cfg!(target_os = "macos") && cfg!(target_arch = "aarch64") => ( - "https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.183/mac-arm64/chromedriver-mac-arm64.zip", - "mac-arm64", - ), - (false, true) if cfg!(target_os = "macos") => ( - "https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.183/mac-x64/chromedriver-mac-x64.zip", - "mac-x64", - ), - (false, true) => ( - "https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.183/linux64/chromedriver-linux64.zip", - "linux64", - ), - _ => return Err("Unsupported platform".into()), - }; - - let mut zip_path = std::env::temp_dir(); - zip_path.push("chromedriver.zip"); - - download_file(download_url, &zip_path.to_str().unwrap()).await?; - - let mut temp_extract_dir = std::env::temp_dir(); - temp_extract_dir.push("chromedriver_extract"); - let temp_extract_dir1 = temp_extract_dir.clone(); - - let _ = fs::remove_dir_all(&temp_extract_dir).await; - fs::create_dir(&temp_extract_dir).await?; - - extract_zip_recursive(&zip_path, &temp_extract_dir)?; - - let mut extracted_binary_path = temp_extract_dir; - extracted_binary_path.push(format!("chromedriver-{}", platform)); - extracted_binary_path.push(if cfg!(target_os = "windows") { - "chromedriver.exe" - } else { - "chromedriver" - }); - - match fs::rename(&extracted_binary_path, &chromedriver_path).await { - Ok(_) => (), - Err(e) if e.kind() == std::io::ErrorKind::CrossesDevices => { - fs::copy(&extracted_binary_path, &chromedriver_path).await?; - #[cfg(unix)] - { - use std::os::unix::fs::PermissionsExt; - let mut perms = fs::metadata(&chromedriver_path).await?.permissions(); - perms.set_mode(0o755); - fs::set_permissions(&chromedriver_path, perms).await?; - } - } - Err(e) => return Err(e.into()), - } - - let _ = fs::remove_file(&zip_path).await; - let _ = fs::remove_dir_all(temp_extract_dir1).await; - - #[cfg(unix)] - { - use std::os::unix::fs::PermissionsExt; - let mut perms = fs::metadata(&chromedriver_path).await?.permissions(); - perms.set_mode(0o755); - fs::set_permissions(&chromedriver_path, perms).await?; - } - } - - Ok(chromedriver_path.to_string_lossy().to_string()) - } -} - -pub async fn initialize_browser_pool() -> Result, Box> { - let setup = BrowserSetup::new().await?; - - // Note: headless_chrome doesn't use chromedriver, it uses Chrome DevTools Protocol directly - // So we don't need to spawn chromedriver process - - Ok(Arc::new(BrowserPool::new(5, setup.brave_path).await?)) -} - -async fn is_process_running(name: &str) -> bool { - if cfg!(target_os = "windows") { - Command::new("tasklist") - .output() - .map(|o| String::from_utf8_lossy(&o.stdout).contains(name)) - .unwrap_or(false) - } else { - Command::new("pgrep") - .arg(name) - .output() - .map(|o| o.status.success()) - .unwrap_or(false) - } -} diff --git a/src/web_automation/web_automation.test.rs b/src/web_automation/web_automation.test.rs deleted file mode 100644 index e0954972c..000000000 --- a/src/web_automation/web_automation.test.rs +++ /dev/null @@ -1,19 +0,0 @@ -//! Tests for web automation module - -#[cfg(test)] -mod tests { - use super::*; - use crate::tests::test_util; - - #[test] - fn test_web_automation_module() { - test_util::setup(); - assert!(true, "Basic web automation module test"); - } - - #[test] - fn test_crawler() { - test_util::setup(); - assert!(true, "Web crawler placeholder test"); - } -}