use anyhow::Result;
use log::{info, trace, warn};
use serde::{Deserialize, Serialize};
use std::collections::HashSet;
use std::time::Duration;
use tokio::time::sleep;

/// Website crawl configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WebsiteCrawlConfig {
    pub url: String,
    pub max_depth: usize,
    pub max_pages: usize,
    pub crawl_delay_ms: u64,
    pub expires_policy: String,
    pub last_crawled: Option<chrono::DateTime<chrono::Utc>>,
    pub next_crawl: Option<chrono::DateTime<chrono::Utc>>,
}

impl WebsiteCrawlConfig {
    /// Parse expiration policy and calculate next crawl time
    pub fn calculate_next_crawl(&mut self) {
        let now = chrono::Utc::now();
        self.last_crawled = Some(now);

        let duration = match self.expires_policy.as_str() {
            "1h" => chrono::Duration::hours(1),
            "6h" => chrono::Duration::hours(6),
            "12h" => chrono::Duration::hours(12),
            "1d" | "24h" => chrono::Duration::days(1),
            "3d" => chrono::Duration::days(3),
            "1w" | "7d" => chrono::Duration::weeks(1),
            "2w" => chrono::Duration::weeks(2),
            "1m" | "30d" => chrono::Duration::days(30),
            "3m" => chrono::Duration::days(90),
            "6m" => chrono::Duration::days(180),
            "1y" | "365d" => chrono::Duration::days(365),
            custom => {
                // Simple parsing for custom formats like "2h", "5d", etc.
                if custom.ends_with('h') {
                    if let Ok(hours) = custom[..custom.len() - 1].parse::<i64>() {
                        chrono::Duration::hours(hours)
                    } else {
                        chrono::Duration::days(1)
                    }
                } else if custom.ends_with('d') {
                    if let Ok(days) = custom[..custom.len() - 1].parse::<i64>() {
                        chrono::Duration::days(days)
                    } else {
                        chrono::Duration::days(1)
                    }
                } else if custom.ends_with('w') {
                    if let Ok(weeks) = custom[..custom.len() - 1].parse::<i64>() {
                        chrono::Duration::weeks(weeks)
                    } else {
                        chrono::Duration::days(1)
                    }
                } else if custom.ends_with('m') {
                    if let Ok(months) = custom[..custom.len() - 1].parse::<i64>() {
                        chrono::Duration::days(months * 30)
                    } else {
                        chrono::Duration::days(1)
                    }
                } else if custom.ends_with('y') {
                    if let Ok(years) = custom[..custom.len() - 1].parse::<i64>() {
                        chrono::Duration::days(years * 365)
                    } else {
                        chrono::Duration::days(1)
                    }
                } else {
                    chrono::Duration::days(1) // Default to daily if unparseable
                }
            }
        };

        self.next_crawl = Some(now + duration);
    }

    /// Check if website needs recrawling
    pub fn needs_crawl(&self) -> bool {
        match self.next_crawl {
            Some(next) => chrono::Utc::now() >= next,
            None => true, // Never crawled
        }
    }
}
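
// Illustrative test sketch (not part of the original module): exercises the
// expiry-policy scheduling above. Field values are placeholder assumptions.
#[cfg(test)]
mod crawl_config_tests {
    use super::*;

    fn sample_config(expires_policy: &str) -> WebsiteCrawlConfig {
        WebsiteCrawlConfig {
            url: "https://example.com".to_string(),
            max_depth: 2,
            max_pages: 10,
            crawl_delay_ms: 0,
            expires_policy: expires_policy.to_string(),
            last_crawled: None,
            next_crawl: None,
        }
    }

    #[test]
    fn never_crawled_needs_crawl() {
        // A config that has never been crawled should always be due.
        assert!(sample_config("1d").needs_crawl());
    }

    #[test]
    fn six_hour_policy_sets_next_crawl() {
        let mut config = sample_config("6h");
        config.calculate_next_crawl();

        let last = config.last_crawled.expect("last_crawled should be set");
        let next = config.next_crawl.expect("next_crawl should be set");
        assert_eq!(next - last, chrono::Duration::hours(6));
        // Immediately after a crawl, the site should not be due again.
        assert!(!config.needs_crawl());
    }

    #[test]
    fn unparseable_policy_falls_back_to_daily() {
        let mut config = sample_config("soon");
        config.calculate_next_crawl();

        let last = config.last_crawled.unwrap();
        let next = config.next_crawl.unwrap();
        assert_eq!(next - last, chrono::Duration::days(1));
    }
}
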
/// Website content for indexing
#[derive(Debug, Clone)]
pub struct WebPage {
    pub url: String,
    pub title: Option<String>,
    pub content: String,
    pub meta_description: Option<String>,
    pub crawled_at: chrono::DateTime<chrono::Utc>,
}

/// Web crawler for website content
pub struct WebCrawler {
    client: reqwest::Client,
    config: WebsiteCrawlConfig,
    visited_urls: HashSet<String>,
    pages: Vec<WebPage>,
}

impl WebCrawler {
    pub fn new(config: WebsiteCrawlConfig) -> Self {
        let client = reqwest::Client::builder()
            .timeout(Duration::from_secs(30))
            .user_agent("GeneralBots/1.0 (Knowledge Base Crawler)")
            .build()
            .unwrap_or_default();

        Self {
            client,
            config,
            visited_urls: HashSet::new(),
            pages: Vec::new(),
        }
    }

    /// Crawl website starting from configured URL
    pub async fn crawl(&mut self) -> Result<Vec<WebPage>> {
        info!("Starting crawl of website: {}", self.config.url);

        // Start crawling from the root URL
        self.crawl_recursive(&self.config.url.clone(), 0).await?;

        info!(
            "Crawled {} pages from {}",
            self.pages.len(),
            self.config.url
        );

        Ok(self.pages.clone())
    }

    /// Recursive crawling with depth control
    async fn crawl_recursive(&mut self, url: &str, depth: usize) -> Result<()> {
        // Check depth limit
        if depth > self.config.max_depth {
            trace!(
                "Reached max depth {} for URL: {}",
                self.config.max_depth,
                url
            );
            return Ok(());
        }

        // Check page limit
        if self.pages.len() >= self.config.max_pages {
            trace!("Reached max pages limit: {}", self.config.max_pages);
            return Ok(());
        }

        // Check if already visited
        if self.visited_urls.contains(url) {
            return Ok(());
        }

        // Mark as visited
        self.visited_urls.insert(url.to_string());

        // Be polite: wait between requests (no delay before the very first fetch)
        if self.visited_urls.len() > 1 {
            sleep(Duration::from_millis(self.config.crawl_delay_ms)).await;
        }

        // Fetch page
        let response = match self.client.get(url).send().await {
            Ok(resp) => resp,
            Err(e) => {
                warn!("Failed to fetch {}: {}", url, e);
                return Ok(()); // Continue crawling other pages
            }
        };

        // Check if HTML
        let content_type = response
            .headers()
            .get("content-type")
            .and_then(|v| v.to_str().ok())
            .unwrap_or("");

        if !content_type.contains("text/html") {
            trace!("Skipping non-HTML content: {}", url);
            return Ok(());
        }

        // Get page content
        let html_text = match response.text().await {
            Ok(text) => text,
            Err(e) => {
                warn!("Failed to read response from {}: {}", url, e);
                return Ok(());
            }
        };

        // Extract page content
        let page = self.extract_page_content(&html_text, url);
        self.pages.push(page);

        // Extract and crawl links if not at max depth
        if depth < self.config.max_depth {
            let links = self.extract_links(&html_text, url);
            for link in links {
                // Only crawl same domain
                if self.is_same_domain(url, &link) {
                    Box::pin(self.crawl_recursive(&link, depth + 1)).await?;
                }
            }
        }

        Ok(())
    }

    /// Extract text content from HTML
    fn extract_page_content(&self, html: &str, url: &str) -> WebPage {
        // Simple HTML tag removal
        let mut text = html.to_string();

        // Remove script and style tags with their content
        while let Some(start) = text.find("<script") {
            if let Some(end) = text.find("</script>") {
                text.replace_range(start..=end + 8, " ");
            } else {
                break;
            }
        }

        while let Some(start) = text.find("<style") {
            if let Some(end) = text.find("</style>") {
                text.replace_range(start..=end + 7, " ");
            } else {
                break;
            }
        }

        // Extract title if present
        let title = if let Some(title_start) = text.find("<title>") {
            if let Some(title_end) = text.find("</title>") {
                Some(text[title_start + 7..title_end].to_string())
            } else {
                None
            }
        } else {
            None
        };

        // Remove all remaining HTML tags
        while let Some(start) = text.find('<') {
            if let Some(end) = text.find('>') {
                if end > start {
                    text.replace_range(start..=end, " ");
                } else {
                    break;
                }
            } else {
                break;
            }
        }

        // Clean up whitespace
        let content = text.split_whitespace().collect::<Vec<_>>().join(" ");

        WebPage {
            url: url.to_string(),
            title,
            content,
            meta_description: None,
            crawled_at: chrono::Utc::now(),
        }
    }

    /// Extract links from HTML
    fn extract_links(&self, html: &str, base_url: &str) -> Vec<String> {
        let mut links = Vec::new();
        let mut search_pos = 0;

        // Simple href extraction
        while let Some(href_pos) = html[search_pos..].find("href=\"") {
            let href_start = search_pos + href_pos + 6;
            if let Some(href_end) = html[href_start..].find('"') {
                let href = &html[href_start..href_start + href_end];

                // Skip anchors, javascript, mailto, etc.
                if !href.starts_with('#')
                    && !href.starts_with("javascript:")
                    && !href.starts_with("mailto:")
                    && !href.starts_with("tel:")
                {
                    // Convert relative URLs to absolute
                    let absolute_url =
                        if href.starts_with("http://") || href.starts_with("https://") {
                            href.to_string()
                        } else if href.starts_with('/') {
                            // Get base domain from base_url
                            if let Some(domain_end) = base_url[8..].find('/') {
                                format!("{}{}", &base_url[..8 + domain_end], href)
                            } else {
                                format!("{}{}", base_url, href)
                            }
                        } else {
                            // Relative to current page
                            if let Some(last_slash) = base_url.rfind('/') {
                                format!("{}/{}", &base_url[..last_slash], href)
                            } else {
                                format!("{}/{}", base_url, href)
                            }
                        };

                    links.push(absolute_url);
                }
                search_pos = href_start + href_end;
            } else {
                break;
            }
        }

        links
    }

    /// Check if two URLs are from the same domain
    fn is_same_domain(&self, url1: &str, url2: &str) -> bool {
        let domain1 = self.extract_domain(url1);
        let domain2 = self.extract_domain(url2);
        domain1 == domain2
    }

    /// Extract domain from URL
    fn extract_domain(&self, url: &str) -> String {
        let without_protocol = if url.starts_with("https://") {
            &url[8..]
        } else if url.starts_with("http://") {
            &url[7..]
        } else {
            url
        };

        if let Some(slash_pos) = without_protocol.find('/') {
            without_protocol[..slash_pos].to_string()
        } else {
            without_protocol.to_string()
        }
    }
}
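
// Illustrative usage sketch (not part of the original module): shows how a
// caller might drive the crawler. The example URL and limits are assumptions.
#[allow(dead_code)]
async fn example_crawl() -> Result<()> {
    let mut config = WebsiteCrawlConfig {
        url: "https://example.com".to_string(),
        max_depth: 2,
        max_pages: 50,
        crawl_delay_ms: 500,
        expires_policy: "1d".to_string(),
        last_crawled: None,
        next_crawl: None,
    };

    if config.needs_crawl() {
        let mut crawler = WebCrawler::new(config.clone());
        let pages = crawler.crawl().await?;
        info!("Indexed {} pages", pages.len());

        // Record this crawl and schedule the next one per the expiry policy.
        config.calculate_next_crawl();
    }

    Ok(())
}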
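
// Illustrative test sketch (not part of the original module): exercises the
// private URL helpers on a crawler built from a placeholder config.
#[cfg(test)]
mod crawler_url_tests {
    use super::*;

    fn test_crawler() -> WebCrawler {
        WebCrawler::new(WebsiteCrawlConfig {
            url: "https://example.com".to_string(),
            max_depth: 1,
            max_pages: 5,
            crawl_delay_ms: 0,
            expires_policy: "1d".to_string(),
            last_crawled: None,
            next_crawl: None,
        })
    }

    #[test]
    fn extracts_domain_without_protocol_or_path() {
        let crawler = test_crawler();
        assert_eq!(crawler.extract_domain("https://example.com/a/b"), "example.com");
        assert_eq!(crawler.extract_domain("http://example.com"), "example.com");
    }

    #[test]
    fn same_domain_ignores_scheme_and_path() {
        let crawler = test_crawler();
        assert!(crawler.is_same_domain("https://example.com/a", "http://example.com/b"));
        assert!(!crawler.is_same_domain("https://example.com", "https://other.org"));
    }

    #[test]
    fn extracts_and_absolutizes_links() {
        let crawler = test_crawler();
        // Root-relative links are resolved against the base domain; mailto links are skipped.
        let html = r#"<a href="/docs">Docs</a> <a href="mailto:hi@example.com">Mail</a>"#;
        let links = crawler.extract_links(html, "https://example.com/index.html");
        assert_eq!(links, vec!["https://example.com/docs".to_string()]);
    }
}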