227 lines
7.8 KiB
Rust
227 lines
7.8 KiB
Rust
pub mod crawler;
|
|
|
|
use headless_chrome::browser::tab::Tab;
|
|
use headless_chrome::{Browser, LaunchOptions};
|
|
use std::env;
|
|
use std::error::Error;
|
|
use std::future::Future;
|
|
use std::path::PathBuf;
|
|
use std::pin::Pin;
|
|
use std::process::Command;
|
|
use std::sync::Arc;
|
|
use tokio::fs;
|
|
use tokio::sync::Semaphore;
|
|
|
|
use crate::shared::utils::{download_file, extract_zip_recursive};
|
|
|
|
pub use crawler::WebCrawler;
|
|
|
|
pub struct BrowserSetup {
|
|
pub brave_path: String,
|
|
pub chromedriver_path: String,
|
|
}
|
|
|
|
pub struct BrowserPool {
|
|
browser: Browser,
|
|
semaphore: Semaphore,
|
|
}
|
|
|
|
impl BrowserPool {
|
|
pub async fn new(
|
|
max_concurrent: usize,
|
|
brave_path: String,
|
|
) -> Result<Self, Box<dyn Error + Send + Sync>> {
|
|
let options = LaunchOptions::default_builder()
|
|
.path(Some(PathBuf::from(brave_path)))
|
|
.args(vec![
|
|
std::ffi::OsStr::new("--disable-gpu"),
|
|
std::ffi::OsStr::new("--no-sandbox"),
|
|
std::ffi::OsStr::new("--disable-dev-shm-usage"),
|
|
])
|
|
.build()
|
|
.map_err(|e| format!("Failed to build launch options: {}", e))?;
|
|
|
|
let browser =
|
|
Browser::new(options).map_err(|e| format!("Failed to launch browser: {}", e))?;
|
|
|
|
Ok(Self {
|
|
browser,
|
|
semaphore: Semaphore::new(max_concurrent),
|
|
})
|
|
}
|
|
|
|
pub async fn with_browser<F, T>(&self, f: F) -> Result<T, Box<dyn Error + Send + Sync>>
|
|
where
|
|
F: FnOnce(
|
|
Arc<Tab>,
|
|
)
|
|
-> Pin<Box<dyn Future<Output = Result<T, Box<dyn Error + Send + Sync>>> + Send>>
|
|
+ Send
|
|
+ 'static,
|
|
T: Send + 'static,
|
|
{
|
|
let _permit = self.semaphore.acquire().await?;
|
|
|
|
let tab = self
|
|
.browser
|
|
.new_tab()
|
|
.map_err(|e| format!("Failed to create new tab: {}", e))?;
|
|
|
|
let result = f(tab.clone()).await;
|
|
|
|
// Close the tab when done
|
|
let _ = tab.close(true);
|
|
|
|
result
|
|
}
|
|
}
|
|
|
|
impl BrowserSetup {
|
|
pub async fn new() -> Result<Self, Box<dyn std::error::Error>> {
|
|
let brave_path = Self::find_brave().await?;
|
|
let chromedriver_path = Self::setup_chromedriver().await?;
|
|
|
|
Ok(Self {
|
|
brave_path,
|
|
chromedriver_path,
|
|
})
|
|
}
|
|
|
|
async fn find_brave() -> Result<String, Box<dyn std::error::Error>> {
|
|
let mut possible_paths = vec![
|
|
String::from(r"C:\Program Files\BraveSoftware\Brave-Browser\Application\brave.exe"),
|
|
String::from("/Applications/Brave Browser.app/Contents/MacOS/Brave Browser"),
|
|
String::from("/usr/bin/brave-browser"),
|
|
String::from("/usr/bin/brave"),
|
|
];
|
|
|
|
if let Ok(local_appdata) = env::var("LOCALAPPDATA") {
|
|
let mut path = PathBuf::from(local_appdata);
|
|
path.push("BraveSoftware\\Brave-Browser\\Application\\brave.exe");
|
|
possible_paths.push(path.to_string_lossy().to_string());
|
|
}
|
|
|
|
for path in possible_paths {
|
|
if fs::metadata(&path).await.is_ok() {
|
|
return Ok(path);
|
|
}
|
|
}
|
|
|
|
Err("Brave browser not found. Please install Brave first.".into())
|
|
}
|
|
|
|
async fn setup_chromedriver() -> Result<String, Box<dyn std::error::Error>> {
|
|
let mut chromedriver_dir = env::current_exe()?.parent().unwrap().to_path_buf();
|
|
chromedriver_dir.push("chromedriver");
|
|
|
|
if !chromedriver_dir.exists() {
|
|
fs::create_dir(&chromedriver_dir).await?;
|
|
}
|
|
|
|
let chromedriver_path = if cfg!(target_os = "windows") {
|
|
chromedriver_dir.join("chromedriver.exe")
|
|
} else {
|
|
chromedriver_dir.join("chromedriver")
|
|
};
|
|
|
|
if fs::metadata(&chromedriver_path).await.is_err() {
|
|
let (download_url, platform) = match (cfg!(target_os = "windows"), cfg!(target_arch = "x86_64")) {
|
|
(true, true) => (
|
|
"https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.183/win64/chromedriver-win64.zip",
|
|
"win64",
|
|
),
|
|
(true, false) => (
|
|
"https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.183/win32/chromedriver-win32.zip",
|
|
"win32",
|
|
),
|
|
(false, true) if cfg!(target_os = "macos") && cfg!(target_arch = "aarch64") => (
|
|
"https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.183/mac-arm64/chromedriver-mac-arm64.zip",
|
|
"mac-arm64",
|
|
),
|
|
(false, true) if cfg!(target_os = "macos") => (
|
|
"https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.183/mac-x64/chromedriver-mac-x64.zip",
|
|
"mac-x64",
|
|
),
|
|
(false, true) => (
|
|
"https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.183/linux64/chromedriver-linux64.zip",
|
|
"linux64",
|
|
),
|
|
_ => return Err("Unsupported platform".into()),
|
|
};
|
|
|
|
let mut zip_path = std::env::temp_dir();
|
|
zip_path.push("chromedriver.zip");
|
|
|
|
download_file(download_url, &zip_path.to_str().unwrap()).await?;
|
|
|
|
let mut temp_extract_dir = std::env::temp_dir();
|
|
temp_extract_dir.push("chromedriver_extract");
|
|
let temp_extract_dir1 = temp_extract_dir.clone();
|
|
|
|
let _ = fs::remove_dir_all(&temp_extract_dir).await;
|
|
fs::create_dir(&temp_extract_dir).await?;
|
|
|
|
extract_zip_recursive(&zip_path, &temp_extract_dir)?;
|
|
|
|
let mut extracted_binary_path = temp_extract_dir;
|
|
extracted_binary_path.push(format!("chromedriver-{}", platform));
|
|
extracted_binary_path.push(if cfg!(target_os = "windows") {
|
|
"chromedriver.exe"
|
|
} else {
|
|
"chromedriver"
|
|
});
|
|
|
|
match fs::rename(&extracted_binary_path, &chromedriver_path).await {
|
|
Ok(_) => (),
|
|
Err(e) if e.kind() == std::io::ErrorKind::CrossesDevices => {
|
|
fs::copy(&extracted_binary_path, &chromedriver_path).await?;
|
|
#[cfg(unix)]
|
|
{
|
|
use std::os::unix::fs::PermissionsExt;
|
|
let mut perms = fs::metadata(&chromedriver_path).await?.permissions();
|
|
perms.set_mode(0o755);
|
|
fs::set_permissions(&chromedriver_path, perms).await?;
|
|
}
|
|
}
|
|
Err(e) => return Err(e.into()),
|
|
}
|
|
|
|
let _ = fs::remove_file(&zip_path).await;
|
|
let _ = fs::remove_dir_all(temp_extract_dir1).await;
|
|
|
|
#[cfg(unix)]
|
|
{
|
|
use std::os::unix::fs::PermissionsExt;
|
|
let mut perms = fs::metadata(&chromedriver_path).await?.permissions();
|
|
perms.set_mode(0o755);
|
|
fs::set_permissions(&chromedriver_path, perms).await?;
|
|
}
|
|
}
|
|
|
|
Ok(chromedriver_path.to_string_lossy().to_string())
|
|
}
|
|
}
|
|
|
|
pub async fn initialize_browser_pool() -> Result<Arc<BrowserPool>, Box<dyn std::error::Error>> {
|
|
let setup = BrowserSetup::new().await?;
|
|
|
|
// Note: headless_chrome doesn't use chromedriver, it uses Chrome DevTools Protocol directly
|
|
// So we don't need to spawn chromedriver process
|
|
|
|
Ok(Arc::new(BrowserPool::new(5, setup.brave_path).await?))
|
|
}
|
|
|
|
async fn is_process_running(name: &str) -> bool {
|
|
if cfg!(target_os = "windows") {
|
|
Command::new("tasklist")
|
|
.output()
|
|
.map(|o| String::from_utf8_lossy(&o.stdout).contains(name))
|
|
.unwrap_or(false)
|
|
} else {
|
|
Command::new("pgrep")
|
|
.arg(name)
|
|
.output()
|
|
.map(|o| o.status.success())
|
|
.unwrap_or(false)
|
|
}
|
|
}
|