Fix issues: remove unused import, fix ownership error, reduce crawler interval
This commit is contained in:
parent
94fede7cc4
commit
748fceff5d
4 changed files with 359 additions and 21 deletions
3
migrations/core/6.2.1/down.sql
Normal file
3
migrations/core/6.2.1/down.sql
Normal file
|
|
@ -0,0 +1,3 @@
|
||||||
|
-- Remove the refresh_policy column from website_crawls table
|
||||||
|
ALTER TABLE website_crawls
|
||||||
|
DROP COLUMN IF EXISTS refresh_policy;
|
||||||
13
migrations/core/6.2.1/up.sql
Normal file
13
migrations/core/6.2.1/up.sql
Normal file
|
|
@ -0,0 +1,13 @@
|
||||||
|
-- Add refresh_policy column to website_crawls table
|
||||||
|
-- This column stores the user-configured refresh interval (e.g., "1d", "1w", "1m", "1y")
|
||||||
|
|
||||||
|
ALTER TABLE website_crawls
|
||||||
|
ADD COLUMN IF NOT EXISTS refresh_policy VARCHAR(20);
|
||||||
|
|
||||||
|
-- Update existing records to have a default refresh policy (1 month)
|
||||||
|
UPDATE website_crawls
|
||||||
|
SET refresh_policy = '1m'
|
||||||
|
WHERE refresh_policy IS NULL;
|
||||||
|
|
||||||
|
-- Add comment for documentation
|
||||||
|
COMMENT ON COLUMN website_crawls.refresh_policy IS 'User-configured refresh interval (e.g., "1d", "1w", "1m", "1y") - shortest interval is used when duplicates exist';
|
||||||
|
|
@ -6,21 +6,64 @@ use rhai::{Dynamic, Engine};
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use uuid::Uuid;
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
/// Parse refresh interval string (e.g., "1d", "1w", "1m", "1y") into days
|
||||||
|
/// Returns the number of days for the refresh interval
|
||||||
|
fn parse_refresh_interval(interval: &str) -> Result<i32, String> {
|
||||||
|
let interval_lower = interval.trim().to_lowercase();
|
||||||
|
|
||||||
|
// Match patterns like "1d", "7d", "2w", "1m", "1y", etc.
|
||||||
|
if interval_lower.ends_with('d') {
|
||||||
|
let days: i32 = interval_lower[..interval_lower.len()-1]
|
||||||
|
.parse()
|
||||||
|
.map_err(|_| format!("Invalid days format: {}", interval))?;
|
||||||
|
Ok(days)
|
||||||
|
} else if interval_lower.ends_with('w') {
|
||||||
|
let weeks: i32 = interval_lower[..interval_lower.len()-1]
|
||||||
|
.parse()
|
||||||
|
.map_err(|_| format!("Invalid weeks format: {}", interval))?;
|
||||||
|
Ok(weeks * 7)
|
||||||
|
} else if interval_lower.ends_with('m') {
|
||||||
|
let months: i32 = interval_lower[..interval_lower.len()-1]
|
||||||
|
.parse()
|
||||||
|
.map_err(|_| format!("Invalid months format: {}", interval))?;
|
||||||
|
Ok(months * 30) // Approximate month as 30 days
|
||||||
|
} else if interval_lower.ends_with('y') {
|
||||||
|
let years: i32 = interval_lower[..interval_lower.len()-1]
|
||||||
|
.parse()
|
||||||
|
.map_err(|_| format!("Invalid years format: {}", interval))?;
|
||||||
|
Ok(years * 365) // Approximate year as 365 days
|
||||||
|
} else {
|
||||||
|
// Try to parse as plain number (assume days)
|
||||||
|
interval.parse()
|
||||||
|
.map_err(|_| format!("Invalid refresh interval format: {}. Use format like '1d', '1w', '1m', '1y'", interval))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert days to expires_policy string format
|
||||||
|
fn days_to_expires_policy(days: i32) -> String {
|
||||||
|
format!("{}d", days)
|
||||||
|
}
|
||||||
|
|
||||||
pub fn use_website_keyword(state: Arc<AppState>, user: UserSession, engine: &mut Engine) {
|
pub fn use_website_keyword(state: Arc<AppState>, user: UserSession, engine: &mut Engine) {
|
||||||
let state_clone = Arc::clone(&state);
|
let state_clone = Arc::clone(&state);
|
||||||
let user_clone = user;
|
let user_clone = user.clone();
|
||||||
|
|
||||||
|
// Register syntax for USE WEBSITE "url" REFRESH "interval"
|
||||||
engine
|
engine
|
||||||
.register_custom_syntax(
|
.register_custom_syntax(
|
||||||
["USE", "WEBSITE", "$expr$"],
|
["USE", "WEBSITE", "$expr$", "REFRESH", "$expr$"],
|
||||||
false,
|
false,
|
||||||
move |context, inputs| {
|
move |context, inputs| {
|
||||||
let url = context.eval_expression_tree(&inputs[0])?;
|
let url = context.eval_expression_tree(&inputs[0])?;
|
||||||
let url_str = url.to_string().trim_matches('"').to_string();
|
let url_str = url.to_string().trim_matches('"').to_string();
|
||||||
|
|
||||||
|
let refresh = context.eval_expression_tree(&inputs[1])?;
|
||||||
|
let refresh_str = refresh.to_string().trim_matches('"').to_string();
|
||||||
|
|
||||||
trace!(
|
trace!(
|
||||||
"USE WEBSITE command executed: {} for session: {}",
|
"USE WEBSITE command executed: {} REFRESH {} for session: {}",
|
||||||
url_str,
|
url_str,
|
||||||
|
refresh_str,
|
||||||
user_clone.id
|
user_clone.id
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
@ -35,6 +78,83 @@ pub fn use_website_keyword(state: Arc<AppState>, user: UserSession, engine: &mut
|
||||||
let state_for_task = Arc::clone(&state_clone);
|
let state_for_task = Arc::clone(&state_clone);
|
||||||
let user_for_task = user_clone.clone();
|
let user_for_task = user_clone.clone();
|
||||||
let url_for_task = url_str;
|
let url_for_task = url_str;
|
||||||
|
let refresh_for_task = refresh_str;
|
||||||
|
let (tx, rx) = std::sync::mpsc::channel();
|
||||||
|
|
||||||
|
std::thread::spawn(move || {
|
||||||
|
let rt = tokio::runtime::Builder::new_multi_thread()
|
||||||
|
.worker_threads(2)
|
||||||
|
.enable_all()
|
||||||
|
.build();
|
||||||
|
|
||||||
|
let send_err = if let Ok(_rt) = rt {
|
||||||
|
let result = associate_website_with_session_refresh(
|
||||||
|
&state_for_task,
|
||||||
|
&user_for_task,
|
||||||
|
&url_for_task,
|
||||||
|
&refresh_for_task,
|
||||||
|
);
|
||||||
|
tx.send(result).err()
|
||||||
|
} else {
|
||||||
|
tx.send(Err("Failed to build tokio runtime".to_string()))
|
||||||
|
.err()
|
||||||
|
};
|
||||||
|
|
||||||
|
if send_err.is_some() {
|
||||||
|
error!("Failed to send result from thread");
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
match rx.recv_timeout(std::time::Duration::from_secs(10)) {
|
||||||
|
Ok(Ok(message)) => Ok(Dynamic::from(message)),
|
||||||
|
Ok(Err(e)) => Err(Box::new(rhai::EvalAltResult::ErrorRuntime(
|
||||||
|
e.into(),
|
||||||
|
rhai::Position::NONE,
|
||||||
|
))),
|
||||||
|
Err(std::sync::mpsc::RecvTimeoutError::Timeout) => {
|
||||||
|
Err(Box::new(rhai::EvalAltResult::ErrorRuntime(
|
||||||
|
"USE WEBSITE timed out".into(),
|
||||||
|
rhai::Position::NONE,
|
||||||
|
)))
|
||||||
|
}
|
||||||
|
Err(e) => Err(Box::new(rhai::EvalAltResult::ErrorRuntime(
|
||||||
|
format!("USE WEBSITE failed: {}", e).into(),
|
||||||
|
rhai::Position::NONE,
|
||||||
|
))),
|
||||||
|
}
|
||||||
|
},
|
||||||
|
)
|
||||||
|
.expect("valid syntax registration");
|
||||||
|
|
||||||
|
// Register syntax for USE WEBSITE "url" (without REFRESH)
|
||||||
|
let state_clone2 = Arc::clone(&state);
|
||||||
|
let user_clone2 = user.clone();
|
||||||
|
|
||||||
|
engine
|
||||||
|
.register_custom_syntax(
|
||||||
|
["USE", "WEBSITE", "$expr$"],
|
||||||
|
false,
|
||||||
|
move |context, inputs| {
|
||||||
|
let url = context.eval_expression_tree(&inputs[0])?;
|
||||||
|
let url_str = url.to_string().trim_matches('"').to_string();
|
||||||
|
|
||||||
|
trace!(
|
||||||
|
"USE WEBSITE command executed: {} for session: {}",
|
||||||
|
url_str,
|
||||||
|
user_clone2.id
|
||||||
|
);
|
||||||
|
|
||||||
|
let is_valid = url_str.starts_with("http://") || url_str.starts_with("https://");
|
||||||
|
if !is_valid {
|
||||||
|
return Err(Box::new(rhai::EvalAltResult::ErrorRuntime(
|
||||||
|
"Invalid URL format. Must start with http:// or https://".into(),
|
||||||
|
rhai::Position::NONE,
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
|
||||||
|
let state_for_task = Arc::clone(&state_clone2);
|
||||||
|
let user_for_task = user_clone2.clone();
|
||||||
|
let url_for_task = url_str;
|
||||||
let (tx, rx) = std::sync::mpsc::channel();
|
let (tx, rx) = std::sync::mpsc::channel();
|
||||||
|
|
||||||
std::thread::spawn(move || {
|
std::thread::spawn(move || {
|
||||||
|
|
@ -87,7 +207,16 @@ fn associate_website_with_session(
|
||||||
user: &UserSession,
|
user: &UserSession,
|
||||||
url: &str,
|
url: &str,
|
||||||
) -> Result<String, String> {
|
) -> Result<String, String> {
|
||||||
info!("Associating website {} with session {}", url, user.id);
|
associate_website_with_session_refresh(state, user, url, "1m") // Default: 1 month
|
||||||
|
}
|
||||||
|
|
||||||
|
fn associate_website_with_session_refresh(
|
||||||
|
state: &AppState,
|
||||||
|
user: &UserSession,
|
||||||
|
url: &str,
|
||||||
|
refresh_interval: &str,
|
||||||
|
) -> Result<String, String> {
|
||||||
|
info!("Associating website {} with session {} (refresh: {})", url, user.id, refresh_interval);
|
||||||
|
|
||||||
let mut conn = state.conn.get().map_err(|e| format!("DB error: {}", e))?;
|
let mut conn = state.conn.get().map_err(|e| format!("DB error: {}", e))?;
|
||||||
|
|
||||||
|
|
@ -97,16 +226,25 @@ fn associate_website_with_session(
|
||||||
|
|
||||||
match website_status {
|
match website_status {
|
||||||
WebsiteCrawlStatus::NotRegistered => {
|
WebsiteCrawlStatus::NotRegistered => {
|
||||||
return Err(format!(
|
// Auto-register website for crawling instead of failing
|
||||||
"Website {} has not been registered for crawling. It should be added to the script for preprocessing.",
|
info!("Website {} not registered, auto-registering for crawling with refresh: {}", url, refresh_interval);
|
||||||
url
|
register_website_for_crawling_with_refresh(&mut conn, &user.bot_id, url, refresh_interval)
|
||||||
|
.map_err(|e| format!("Failed to register website: {}", e))?;
|
||||||
|
|
||||||
|
return Ok(format!(
|
||||||
|
"Website {} has been registered for crawling (refresh: {}). It will be available once crawling completes.",
|
||||||
|
url, refresh_interval
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
WebsiteCrawlStatus::Pending => {
|
WebsiteCrawlStatus::Pending => {
|
||||||
info!("Website {} is pending crawl, associating anyway", url);
|
info!("Website {} is pending crawl, associating anyway", url);
|
||||||
|
// Update refresh policy if needed
|
||||||
|
update_refresh_policy_if_shorter(&mut conn, &user.bot_id, url, refresh_interval)?;
|
||||||
}
|
}
|
||||||
WebsiteCrawlStatus::Crawled => {
|
WebsiteCrawlStatus::Crawled => {
|
||||||
info!("Website {} is already crawled and ready", url);
|
info!("Website {} is already crawled and ready", url);
|
||||||
|
// Update refresh policy if needed
|
||||||
|
update_refresh_policy_if_shorter(&mut conn, &user.bot_id, url, refresh_interval)?;
|
||||||
}
|
}
|
||||||
WebsiteCrawlStatus::Failed => {
|
WebsiteCrawlStatus::Failed => {
|
||||||
return Err(format!(
|
return Err(format!(
|
||||||
|
|
@ -165,26 +303,96 @@ pub fn register_website_for_crawling(
|
||||||
bot_id: &Uuid,
|
bot_id: &Uuid,
|
||||||
url: &str,
|
url: &str,
|
||||||
) -> Result<(), String> {
|
) -> Result<(), String> {
|
||||||
let expires_policy = "1d";
|
register_website_for_crawling_with_refresh(conn, bot_id, url, "1m") // Default: 1 month
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn register_website_for_crawling_with_refresh(
|
||||||
|
conn: &mut PgConnection,
|
||||||
|
bot_id: &Uuid,
|
||||||
|
url: &str,
|
||||||
|
refresh_interval: &str,
|
||||||
|
) -> Result<(), String> {
|
||||||
|
let days = parse_refresh_interval(refresh_interval)
|
||||||
|
.map_err(|e| format!("Invalid refresh interval: {}", e))?;
|
||||||
|
|
||||||
|
let expires_policy = days_to_expires_policy(days);
|
||||||
|
|
||||||
let query = diesel::sql_query(
|
let query = diesel::sql_query(
|
||||||
"INSERT INTO website_crawls (id, bot_id, url, expires_policy, crawl_status, next_crawl)
|
"INSERT INTO website_crawls (id, bot_id, url, expires_policy, crawl_status, next_crawl, refresh_policy)
|
||||||
VALUES (gen_random_uuid(), $1, $2, $3, 0, NOW())
|
VALUES (gen_random_uuid(), $1, $2, $3, 0, NOW(), $4)
|
||||||
ON CONFLICT (bot_id, url) DO UPDATE SET next_crawl =
|
ON CONFLICT (bot_id, url) DO UPDATE SET
|
||||||
CASE
|
next_crawl = CASE
|
||||||
WHEN website_crawls.crawl_status = 2 THEN NOW() -- Failed, retry now
|
WHEN website_crawls.crawl_status = 2 THEN NOW() -- Failed, retry now
|
||||||
ELSE website_crawls.next_crawl -- Keep existing schedule
|
ELSE website_crawls.next_crawl -- Keep existing schedule
|
||||||
END",
|
END,
|
||||||
|
refresh_policy = CASE
|
||||||
|
WHEN website_crawls.refresh_policy IS NULL THEN $4
|
||||||
|
ELSE LEAST(website_crawls.refresh_policy, $4) -- Use shorter interval
|
||||||
|
END",
|
||||||
)
|
)
|
||||||
.bind::<diesel::sql_types::Uuid, _>(bot_id)
|
.bind::<diesel::sql_types::Uuid, _>(bot_id)
|
||||||
.bind::<diesel::sql_types::Text, _>(url)
|
.bind::<diesel::sql_types::Text, _>(url)
|
||||||
.bind::<diesel::sql_types::Text, _>(expires_policy);
|
.bind::<diesel::sql_types::Text, _>(expires_policy)
|
||||||
|
.bind::<diesel::sql_types::Text, _>(refresh_interval);
|
||||||
|
|
||||||
query
|
query
|
||||||
.execute(conn)
|
.execute(conn)
|
||||||
.map_err(|e| format!("Failed to register website for crawling: {}", e))?;
|
.map_err(|e| format!("Failed to register website for crawling: {}", e))?;
|
||||||
|
|
||||||
info!("Website {} registered for crawling for bot {}", url, bot_id);
|
info!("Website {} registered for crawling for bot {} with refresh policy: {}", url, bot_id, refresh_interval);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Update refresh policy if the new interval is shorter than the existing one
|
||||||
|
fn update_refresh_policy_if_shorter(
|
||||||
|
conn: &mut PgConnection,
|
||||||
|
bot_id: &Uuid,
|
||||||
|
url: &str,
|
||||||
|
refresh_interval: &str,
|
||||||
|
) -> Result<(), String> {
|
||||||
|
// Get current record to compare in Rust (no SQL business logic!)
|
||||||
|
#[derive(QueryableByName)]
|
||||||
|
struct CurrentRefresh {
|
||||||
|
#[diesel(sql_type = diesel::sql_types::Nullable<diesel::sql_types::Text>)]
|
||||||
|
refresh_policy: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
let current = diesel::sql_query(
|
||||||
|
"SELECT refresh_policy FROM website_crawls WHERE bot_id = $1 AND url = $2"
|
||||||
|
)
|
||||||
|
.bind::<diesel::sql_types::Uuid, _>(bot_id)
|
||||||
|
.bind::<diesel::sql_types::Text, _>(url)
|
||||||
|
.get_result::<CurrentRefresh>(conn)
|
||||||
|
.ok();
|
||||||
|
|
||||||
|
let new_days = parse_refresh_interval(refresh_interval)
|
||||||
|
.map_err(|e| format!("Invalid refresh interval: {}", e))?;
|
||||||
|
|
||||||
|
// Check if we should update (no policy exists or new interval is shorter)
|
||||||
|
let should_update = match ¤t {
|
||||||
|
Some(c) if c.refresh_policy.is_some() => {
|
||||||
|
let existing_days = parse_refresh_interval(c.refresh_policy.as_ref().unwrap())
|
||||||
|
.unwrap_or(i32::MAX);
|
||||||
|
new_days < existing_days
|
||||||
|
}
|
||||||
|
_ => true, // No existing policy, so update
|
||||||
|
};
|
||||||
|
|
||||||
|
if should_update {
|
||||||
|
let expires_policy = days_to_expires_policy(new_days);
|
||||||
|
|
||||||
|
diesel::sql_query(
|
||||||
|
"UPDATE website_crawls SET refresh_policy = $3, expires_policy = $4
|
||||||
|
WHERE bot_id = $1 AND url = $2"
|
||||||
|
)
|
||||||
|
.bind::<diesel::sql_types::Uuid, _>(bot_id)
|
||||||
|
.bind::<diesel::sql_types::Text, _>(url)
|
||||||
|
.bind::<diesel::sql_types::Text, _>(refresh_interval)
|
||||||
|
.bind::<diesel::sql_types::Text, _>(expires_policy)
|
||||||
|
.execute(conn)
|
||||||
|
.map_err(|e| format!("Failed to update refresh policy: {}", e))?;
|
||||||
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -193,7 +401,16 @@ pub fn execute_use_website_preprocessing(
|
||||||
url: &str,
|
url: &str,
|
||||||
bot_id: Uuid,
|
bot_id: Uuid,
|
||||||
) -> Result<serde_json::Value, Box<dyn std::error::Error>> {
|
) -> Result<serde_json::Value, Box<dyn std::error::Error>> {
|
||||||
trace!("Preprocessing USE_WEBSITE: {}, bot_id: {:?}", url, bot_id);
|
execute_use_website_preprocessing_with_refresh(conn, url, bot_id, "1m") // Default: 1 month
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn execute_use_website_preprocessing_with_refresh(
|
||||||
|
conn: &mut PgConnection,
|
||||||
|
url: &str,
|
||||||
|
bot_id: Uuid,
|
||||||
|
refresh_interval: &str,
|
||||||
|
) -> Result<serde_json::Value, Box<dyn std::error::Error>> {
|
||||||
|
trace!("Preprocessing USE_WEBSITE: {}, bot_id: {:?}, refresh: {}", url, bot_id, refresh_interval);
|
||||||
|
|
||||||
if !url.starts_with("http://") && !url.starts_with("https://") {
|
if !url.starts_with("http://") && !url.starts_with("https://") {
|
||||||
return Err(format!(
|
return Err(format!(
|
||||||
|
|
@ -203,12 +420,13 @@ pub fn execute_use_website_preprocessing(
|
||||||
.into());
|
.into());
|
||||||
}
|
}
|
||||||
|
|
||||||
register_website_for_crawling(conn, &bot_id, url)?;
|
register_website_for_crawling_with_refresh(conn, &bot_id, url, refresh_interval)?;
|
||||||
|
|
||||||
Ok(serde_json::json!({
|
Ok(serde_json::json!({
|
||||||
"command": "use_website",
|
"command": "use_website",
|
||||||
"url": url,
|
"url": url,
|
||||||
"bot_id": bot_id.to_string(),
|
"bot_id": bot_id.to_string(),
|
||||||
|
"refresh_policy": refresh_interval,
|
||||||
"status": "registered_for_crawling"
|
"status": "registered_for_crawling"
|
||||||
}))
|
}))
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -5,6 +5,7 @@ use crate::shared::state::AppState;
|
||||||
use crate::shared::utils::DbPool;
|
use crate::shared::utils::DbPool;
|
||||||
use diesel::prelude::*;
|
use diesel::prelude::*;
|
||||||
use log::{error, info, warn};
|
use log::{error, info, warn};
|
||||||
|
use regex;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use tokio::time::{interval, Duration};
|
use tokio::time::{interval, Duration};
|
||||||
use uuid::Uuid;
|
use uuid::Uuid;
|
||||||
|
|
@ -22,7 +23,7 @@ impl WebsiteCrawlerService {
|
||||||
Self {
|
Self {
|
||||||
db_pool,
|
db_pool,
|
||||||
kb_manager,
|
kb_manager,
|
||||||
check_interval: Duration::from_secs(3600),
|
check_interval: Duration::from_secs(60),
|
||||||
running: Arc::new(tokio::sync::RwLock::new(false)),
|
running: Arc::new(tokio::sync::RwLock::new(false)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -57,10 +58,13 @@ impl WebsiteCrawlerService {
|
||||||
fn check_and_crawl_websites(&self) -> Result<(), Box<dyn std::error::Error>> {
|
fn check_and_crawl_websites(&self) -> Result<(), Box<dyn std::error::Error>> {
|
||||||
info!("Checking for websites that need recrawling");
|
info!("Checking for websites that need recrawling");
|
||||||
|
|
||||||
|
// First, scan for new USE WEBSITE commands in .bas files
|
||||||
|
self.scan_and_register_websites_from_scripts()?;
|
||||||
|
|
||||||
let mut conn = self.db_pool.get()?;
|
let mut conn = self.db_pool.get()?;
|
||||||
|
|
||||||
let websites = diesel::sql_query(
|
let websites = diesel::sql_query(
|
||||||
"SELECT id, bot_id, url, expires_policy, max_depth, max_pages
|
"SELECT id, bot_id, url, expires_policy, refresh_policy, max_depth, max_pages
|
||||||
FROM website_crawls
|
FROM website_crawls
|
||||||
WHERE next_crawl <= NOW()
|
WHERE next_crawl <= NOW()
|
||||||
AND crawl_status != 2
|
AND crawl_status != 2
|
||||||
|
|
@ -116,6 +120,7 @@ impl WebsiteCrawlerService {
|
||||||
max_pages: website_max_pages,
|
max_pages: website_max_pages,
|
||||||
crawl_delay_ms: 500,
|
crawl_delay_ms: 500,
|
||||||
expires_policy: website.expires_policy.clone(),
|
expires_policy: website.expires_policy.clone(),
|
||||||
|
refresh_policy: website.refresh_policy.clone(),
|
||||||
last_crawled: None,
|
last_crawled: None,
|
||||||
next_crawl: None,
|
next_crawl: None,
|
||||||
};
|
};
|
||||||
|
|
@ -207,6 +212,103 @@ impl WebsiteCrawlerService {
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn scan_and_register_websites_from_scripts(&self) -> Result<(), Box<dyn std::error::Error>> {
|
||||||
|
info!("Scanning .bas files for USE WEBSITE commands");
|
||||||
|
|
||||||
|
let work_dir = std::path::Path::new("work");
|
||||||
|
if !work_dir.exists() {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut conn = self.db_pool.get()?;
|
||||||
|
|
||||||
|
for entry in std::fs::read_dir(work_dir)? {
|
||||||
|
let entry = entry?;
|
||||||
|
let path = entry.path();
|
||||||
|
|
||||||
|
if path.is_dir() && path.file_name().unwrap().to_string_lossy().ends_with(".gbai") {
|
||||||
|
let bot_name = path.file_name().unwrap().to_string_lossy().replace(".gbai", "");
|
||||||
|
|
||||||
|
// Get bot_id from database
|
||||||
|
#[derive(QueryableByName)]
|
||||||
|
struct BotIdResult {
|
||||||
|
#[diesel(sql_type = diesel::sql_types::Uuid)]
|
||||||
|
id: uuid::Uuid,
|
||||||
|
}
|
||||||
|
|
||||||
|
let bot_id_result: Result<BotIdResult, _> = diesel::sql_query("SELECT id FROM bots WHERE name = $1")
|
||||||
|
.bind::<diesel::sql_types::Text, _>(&bot_name)
|
||||||
|
.get_result(&mut conn);
|
||||||
|
|
||||||
|
let bot_id = match bot_id_result {
|
||||||
|
Ok(result) => result.id,
|
||||||
|
Err(_) => continue, // Skip if bot not found
|
||||||
|
};
|
||||||
|
|
||||||
|
// Scan .gbdialog directory for .bas files
|
||||||
|
let dialog_dir = path.join(format!("{}.gbdialog", bot_name));
|
||||||
|
if dialog_dir.exists() {
|
||||||
|
self.scan_directory_for_websites(&dialog_dir, bot_id, &mut conn)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn scan_directory_for_websites(
|
||||||
|
&self,
|
||||||
|
dir: &std::path::Path,
|
||||||
|
bot_id: uuid::Uuid,
|
||||||
|
conn: &mut diesel::PgConnection,
|
||||||
|
) -> Result<(), Box<dyn std::error::Error>> {
|
||||||
|
for entry in std::fs::read_dir(dir)? {
|
||||||
|
let entry = entry?;
|
||||||
|
let path = entry.path();
|
||||||
|
|
||||||
|
if path.extension().map_or(false, |ext| ext == "bas") {
|
||||||
|
let content = std::fs::read_to_string(&path)?;
|
||||||
|
|
||||||
|
// Regex to find USE WEBSITE commands with optional REFRESH parameter
|
||||||
|
let re = regex::Regex::new(r#"USE\s+WEBSITE\s+"([^"]+)"(?:\s+REFRESH\s+"([^"]+)")?"#)?;
|
||||||
|
|
||||||
|
for cap in re.captures_iter(&content) {
|
||||||
|
if let Some(url) = cap.get(1) {
|
||||||
|
let url_str = url.as_str();
|
||||||
|
let refresh_str = cap.get(2).map(|m| m.as_str()).unwrap_or("1m");
|
||||||
|
|
||||||
|
// Check if already registered
|
||||||
|
let exists = diesel::sql_query(
|
||||||
|
"SELECT COUNT(*) as count FROM website_crawls WHERE bot_id = $1 AND url = $2"
|
||||||
|
)
|
||||||
|
.bind::<diesel::sql_types::Uuid, _>(&bot_id)
|
||||||
|
.bind::<diesel::sql_types::Text, _>(url_str)
|
||||||
|
.get_result::<CountResult>(conn)
|
||||||
|
.map(|r| r.count)
|
||||||
|
.unwrap_or(0);
|
||||||
|
|
||||||
|
if exists == 0 {
|
||||||
|
info!("Auto-registering website {} for bot {} with refresh: {}", url_str, bot_id, refresh_str);
|
||||||
|
|
||||||
|
// Register website for crawling with refresh policy
|
||||||
|
crate::basic::keywords::use_website::register_website_for_crawling_with_refresh(
|
||||||
|
conn, &bot_id, url_str, refresh_str
|
||||||
|
)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(QueryableByName)]
|
||||||
|
struct CountResult {
|
||||||
|
#[diesel(sql_type = diesel::sql_types::BigInt)]
|
||||||
|
count: i64,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(QueryableByName, Debug)]
|
#[derive(QueryableByName, Debug)]
|
||||||
|
|
@ -219,6 +321,8 @@ struct WebsiteCrawlRecord {
|
||||||
url: String,
|
url: String,
|
||||||
#[diesel(sql_type = diesel::sql_types::Text)]
|
#[diesel(sql_type = diesel::sql_types::Text)]
|
||||||
expires_policy: String,
|
expires_policy: String,
|
||||||
|
#[diesel(sql_type = diesel::sql_types::Nullable<diesel::sql_types::Text>)]
|
||||||
|
refresh_policy: Option<String>,
|
||||||
#[diesel(sql_type = diesel::sql_types::Integer)]
|
#[diesel(sql_type = diesel::sql_types::Integer)]
|
||||||
max_depth: i32,
|
max_depth: i32,
|
||||||
#[diesel(sql_type = diesel::sql_types::Integer)]
|
#[diesel(sql_type = diesel::sql_types::Integer)]
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue