Fix issues: remove unused import, fix ownership error, reduce crawler interval

This commit is contained in:
Rodrigo Rodriguez (Pragmatismo) 2026-01-30 12:21:30 -03:00
parent 94fede7cc4
commit 748fceff5d
4 changed files with 359 additions and 21 deletions

View file

@ -0,0 +1,3 @@
-- Remove the refresh_policy column from website_crawls table
ALTER TABLE website_crawls
DROP COLUMN IF EXISTS refresh_policy;

View file

@ -0,0 +1,13 @@
-- Add refresh_policy column to website_crawls table
-- This column stores the user-configured refresh interval (e.g., "1d", "1w", "1m", "1y")
ALTER TABLE website_crawls
ADD COLUMN IF NOT EXISTS refresh_policy VARCHAR(20);
-- Update existing records to have a default refresh policy (1 month)
UPDATE website_crawls
SET refresh_policy = '1m'
WHERE refresh_policy IS NULL;
-- Add comment for documentation
COMMENT ON COLUMN website_crawls.refresh_policy IS 'User-configured refresh interval (e.g., "1d", "1w", "1m", "1y") - shortest interval is used when duplicates exist';

View file

@ -6,21 +6,64 @@ use rhai::{Dynamic, Engine};
use std::sync::Arc;
use uuid::Uuid;
/// Parse refresh interval string (e.g., "1d", "1w", "1m", "1y") into days
/// Returns the number of days for the refresh interval
fn parse_refresh_interval(interval: &str) -> Result<i32, String> {
let interval_lower = interval.trim().to_lowercase();
// Match patterns like "1d", "7d", "2w", "1m", "1y", etc.
if interval_lower.ends_with('d') {
let days: i32 = interval_lower[..interval_lower.len()-1]
.parse()
.map_err(|_| format!("Invalid days format: {}", interval))?;
Ok(days)
} else if interval_lower.ends_with('w') {
let weeks: i32 = interval_lower[..interval_lower.len()-1]
.parse()
.map_err(|_| format!("Invalid weeks format: {}", interval))?;
Ok(weeks * 7)
} else if interval_lower.ends_with('m') {
let months: i32 = interval_lower[..interval_lower.len()-1]
.parse()
.map_err(|_| format!("Invalid months format: {}", interval))?;
Ok(months * 30) // Approximate month as 30 days
} else if interval_lower.ends_with('y') {
let years: i32 = interval_lower[..interval_lower.len()-1]
.parse()
.map_err(|_| format!("Invalid years format: {}", interval))?;
Ok(years * 365) // Approximate year as 365 days
} else {
// Try to parse as plain number (assume days)
interval.parse()
.map_err(|_| format!("Invalid refresh interval format: {}. Use format like '1d', '1w', '1m', '1y'", interval))
}
}
/// Convert days to expires_policy string format
fn days_to_expires_policy(days: i32) -> String {
format!("{}d", days)
}
pub fn use_website_keyword(state: Arc<AppState>, user: UserSession, engine: &mut Engine) {
let state_clone = Arc::clone(&state);
let user_clone = user;
let user_clone = user.clone();
// Register syntax for USE WEBSITE "url" REFRESH "interval"
engine
.register_custom_syntax(
["USE", "WEBSITE", "$expr$"],
["USE", "WEBSITE", "$expr$", "REFRESH", "$expr$"],
false,
move |context, inputs| {
let url = context.eval_expression_tree(&inputs[0])?;
let url_str = url.to_string().trim_matches('"').to_string();
let refresh = context.eval_expression_tree(&inputs[1])?;
let refresh_str = refresh.to_string().trim_matches('"').to_string();
trace!(
"USE WEBSITE command executed: {} for session: {}",
"USE WEBSITE command executed: {} REFRESH {} for session: {}",
url_str,
refresh_str,
user_clone.id
);
@ -35,6 +78,83 @@ pub fn use_website_keyword(state: Arc<AppState>, user: UserSession, engine: &mut
let state_for_task = Arc::clone(&state_clone);
let user_for_task = user_clone.clone();
let url_for_task = url_str;
let refresh_for_task = refresh_str;
let (tx, rx) = std::sync::mpsc::channel();
std::thread::spawn(move || {
let rt = tokio::runtime::Builder::new_multi_thread()
.worker_threads(2)
.enable_all()
.build();
let send_err = if let Ok(_rt) = rt {
let result = associate_website_with_session_refresh(
&state_for_task,
&user_for_task,
&url_for_task,
&refresh_for_task,
);
tx.send(result).err()
} else {
tx.send(Err("Failed to build tokio runtime".to_string()))
.err()
};
if send_err.is_some() {
error!("Failed to send result from thread");
}
});
match rx.recv_timeout(std::time::Duration::from_secs(10)) {
Ok(Ok(message)) => Ok(Dynamic::from(message)),
Ok(Err(e)) => Err(Box::new(rhai::EvalAltResult::ErrorRuntime(
e.into(),
rhai::Position::NONE,
))),
Err(std::sync::mpsc::RecvTimeoutError::Timeout) => {
Err(Box::new(rhai::EvalAltResult::ErrorRuntime(
"USE WEBSITE timed out".into(),
rhai::Position::NONE,
)))
}
Err(e) => Err(Box::new(rhai::EvalAltResult::ErrorRuntime(
format!("USE WEBSITE failed: {}", e).into(),
rhai::Position::NONE,
))),
}
},
)
.expect("valid syntax registration");
// Register syntax for USE WEBSITE "url" (without REFRESH)
let state_clone2 = Arc::clone(&state);
let user_clone2 = user.clone();
engine
.register_custom_syntax(
["USE", "WEBSITE", "$expr$"],
false,
move |context, inputs| {
let url = context.eval_expression_tree(&inputs[0])?;
let url_str = url.to_string().trim_matches('"').to_string();
trace!(
"USE WEBSITE command executed: {} for session: {}",
url_str,
user_clone2.id
);
let is_valid = url_str.starts_with("http://") || url_str.starts_with("https://");
if !is_valid {
return Err(Box::new(rhai::EvalAltResult::ErrorRuntime(
"Invalid URL format. Must start with http:// or https://".into(),
rhai::Position::NONE,
)));
}
let state_for_task = Arc::clone(&state_clone2);
let user_for_task = user_clone2.clone();
let url_for_task = url_str;
let (tx, rx) = std::sync::mpsc::channel();
std::thread::spawn(move || {
@ -87,7 +207,16 @@ fn associate_website_with_session(
user: &UserSession,
url: &str,
) -> Result<String, String> {
info!("Associating website {} with session {}", url, user.id);
associate_website_with_session_refresh(state, user, url, "1m") // Default: 1 month
}
fn associate_website_with_session_refresh(
state: &AppState,
user: &UserSession,
url: &str,
refresh_interval: &str,
) -> Result<String, String> {
info!("Associating website {} with session {} (refresh: {})", url, user.id, refresh_interval);
let mut conn = state.conn.get().map_err(|e| format!("DB error: {}", e))?;
@ -97,16 +226,25 @@ fn associate_website_with_session(
match website_status {
WebsiteCrawlStatus::NotRegistered => {
return Err(format!(
"Website {} has not been registered for crawling. It should be added to the script for preprocessing.",
url
// Auto-register website for crawling instead of failing
info!("Website {} not registered, auto-registering for crawling with refresh: {}", url, refresh_interval);
register_website_for_crawling_with_refresh(&mut conn, &user.bot_id, url, refresh_interval)
.map_err(|e| format!("Failed to register website: {}", e))?;
return Ok(format!(
"Website {} has been registered for crawling (refresh: {}). It will be available once crawling completes.",
url, refresh_interval
));
}
WebsiteCrawlStatus::Pending => {
info!("Website {} is pending crawl, associating anyway", url);
// Update refresh policy if needed
update_refresh_policy_if_shorter(&mut conn, &user.bot_id, url, refresh_interval)?;
}
WebsiteCrawlStatus::Crawled => {
info!("Website {} is already crawled and ready", url);
// Update refresh policy if needed
update_refresh_policy_if_shorter(&mut conn, &user.bot_id, url, refresh_interval)?;
}
WebsiteCrawlStatus::Failed => {
return Err(format!(
@ -165,26 +303,96 @@ pub fn register_website_for_crawling(
bot_id: &Uuid,
url: &str,
) -> Result<(), String> {
let expires_policy = "1d";
register_website_for_crawling_with_refresh(conn, bot_id, url, "1m") // Default: 1 month
}
pub fn register_website_for_crawling_with_refresh(
conn: &mut PgConnection,
bot_id: &Uuid,
url: &str,
refresh_interval: &str,
) -> Result<(), String> {
let days = parse_refresh_interval(refresh_interval)
.map_err(|e| format!("Invalid refresh interval: {}", e))?;
let expires_policy = days_to_expires_policy(days);
let query = diesel::sql_query(
"INSERT INTO website_crawls (id, bot_id, url, expires_policy, crawl_status, next_crawl)
VALUES (gen_random_uuid(), $1, $2, $3, 0, NOW())
ON CONFLICT (bot_id, url) DO UPDATE SET next_crawl =
CASE
WHEN website_crawls.crawl_status = 2 THEN NOW() -- Failed, retry now
ELSE website_crawls.next_crawl -- Keep existing schedule
END",
"INSERT INTO website_crawls (id, bot_id, url, expires_policy, crawl_status, next_crawl, refresh_policy)
VALUES (gen_random_uuid(), $1, $2, $3, 0, NOW(), $4)
ON CONFLICT (bot_id, url) DO UPDATE SET
next_crawl = CASE
WHEN website_crawls.crawl_status = 2 THEN NOW() -- Failed, retry now
ELSE website_crawls.next_crawl -- Keep existing schedule
END,
refresh_policy = CASE
WHEN website_crawls.refresh_policy IS NULL THEN $4
ELSE LEAST(website_crawls.refresh_policy, $4) -- Use shorter interval
END",
)
.bind::<diesel::sql_types::Uuid, _>(bot_id)
.bind::<diesel::sql_types::Text, _>(url)
.bind::<diesel::sql_types::Text, _>(expires_policy);
.bind::<diesel::sql_types::Text, _>(expires_policy)
.bind::<diesel::sql_types::Text, _>(refresh_interval);
query
.execute(conn)
.map_err(|e| format!("Failed to register website for crawling: {}", e))?;
info!("Website {} registered for crawling for bot {}", url, bot_id);
info!("Website {} registered for crawling for bot {} with refresh policy: {}", url, bot_id, refresh_interval);
Ok(())
}
/// Update refresh policy if the new interval is shorter than the existing one
fn update_refresh_policy_if_shorter(
conn: &mut PgConnection,
bot_id: &Uuid,
url: &str,
refresh_interval: &str,
) -> Result<(), String> {
// Get current record to compare in Rust (no SQL business logic!)
#[derive(QueryableByName)]
struct CurrentRefresh {
#[diesel(sql_type = diesel::sql_types::Nullable<diesel::sql_types::Text>)]
refresh_policy: Option<String>,
}
let current = diesel::sql_query(
"SELECT refresh_policy FROM website_crawls WHERE bot_id = $1 AND url = $2"
)
.bind::<diesel::sql_types::Uuid, _>(bot_id)
.bind::<diesel::sql_types::Text, _>(url)
.get_result::<CurrentRefresh>(conn)
.ok();
let new_days = parse_refresh_interval(refresh_interval)
.map_err(|e| format!("Invalid refresh interval: {}", e))?;
// Check if we should update (no policy exists or new interval is shorter)
let should_update = match &current {
Some(c) if c.refresh_policy.is_some() => {
let existing_days = parse_refresh_interval(c.refresh_policy.as_ref().unwrap())
.unwrap_or(i32::MAX);
new_days < existing_days
}
_ => true, // No existing policy, so update
};
if should_update {
let expires_policy = days_to_expires_policy(new_days);
diesel::sql_query(
"UPDATE website_crawls SET refresh_policy = $3, expires_policy = $4
WHERE bot_id = $1 AND url = $2"
)
.bind::<diesel::sql_types::Uuid, _>(bot_id)
.bind::<diesel::sql_types::Text, _>(url)
.bind::<diesel::sql_types::Text, _>(refresh_interval)
.bind::<diesel::sql_types::Text, _>(expires_policy)
.execute(conn)
.map_err(|e| format!("Failed to update refresh policy: {}", e))?;
}
Ok(())
}
@ -193,7 +401,16 @@ pub fn execute_use_website_preprocessing(
url: &str,
bot_id: Uuid,
) -> Result<serde_json::Value, Box<dyn std::error::Error>> {
trace!("Preprocessing USE_WEBSITE: {}, bot_id: {:?}", url, bot_id);
execute_use_website_preprocessing_with_refresh(conn, url, bot_id, "1m") // Default: 1 month
}
pub fn execute_use_website_preprocessing_with_refresh(
conn: &mut PgConnection,
url: &str,
bot_id: Uuid,
refresh_interval: &str,
) -> Result<serde_json::Value, Box<dyn std::error::Error>> {
trace!("Preprocessing USE_WEBSITE: {}, bot_id: {:?}, refresh: {}", url, bot_id, refresh_interval);
if !url.starts_with("http://") && !url.starts_with("https://") {
return Err(format!(
@ -203,12 +420,13 @@ pub fn execute_use_website_preprocessing(
.into());
}
register_website_for_crawling(conn, &bot_id, url)?;
register_website_for_crawling_with_refresh(conn, &bot_id, url, refresh_interval)?;
Ok(serde_json::json!({
"command": "use_website",
"url": url,
"bot_id": bot_id.to_string(),
"refresh_policy": refresh_interval,
"status": "registered_for_crawling"
}))
}

View file

@ -5,6 +5,7 @@ use crate::shared::state::AppState;
use crate::shared::utils::DbPool;
use diesel::prelude::*;
use log::{error, info, warn};
use regex;
use std::sync::Arc;
use tokio::time::{interval, Duration};
use uuid::Uuid;
@ -22,7 +23,7 @@ impl WebsiteCrawlerService {
Self {
db_pool,
kb_manager,
check_interval: Duration::from_secs(3600),
check_interval: Duration::from_secs(60),
running: Arc::new(tokio::sync::RwLock::new(false)),
}
}
@ -57,10 +58,13 @@ impl WebsiteCrawlerService {
fn check_and_crawl_websites(&self) -> Result<(), Box<dyn std::error::Error>> {
info!("Checking for websites that need recrawling");
// First, scan for new USE WEBSITE commands in .bas files
self.scan_and_register_websites_from_scripts()?;
let mut conn = self.db_pool.get()?;
let websites = diesel::sql_query(
"SELECT id, bot_id, url, expires_policy, max_depth, max_pages
"SELECT id, bot_id, url, expires_policy, refresh_policy, max_depth, max_pages
FROM website_crawls
WHERE next_crawl <= NOW()
AND crawl_status != 2
@ -116,6 +120,7 @@ impl WebsiteCrawlerService {
max_pages: website_max_pages,
crawl_delay_ms: 500,
expires_policy: website.expires_policy.clone(),
refresh_policy: website.refresh_policy.clone(),
last_crawled: None,
next_crawl: None,
};
@ -207,6 +212,103 @@ impl WebsiteCrawlerService {
Ok(())
}
fn scan_and_register_websites_from_scripts(&self) -> Result<(), Box<dyn std::error::Error>> {
info!("Scanning .bas files for USE WEBSITE commands");
let work_dir = std::path::Path::new("work");
if !work_dir.exists() {
return Ok(());
}
let mut conn = self.db_pool.get()?;
for entry in std::fs::read_dir(work_dir)? {
let entry = entry?;
let path = entry.path();
if path.is_dir() && path.file_name().unwrap().to_string_lossy().ends_with(".gbai") {
let bot_name = path.file_name().unwrap().to_string_lossy().replace(".gbai", "");
// Get bot_id from database
#[derive(QueryableByName)]
struct BotIdResult {
#[diesel(sql_type = diesel::sql_types::Uuid)]
id: uuid::Uuid,
}
let bot_id_result: Result<BotIdResult, _> = diesel::sql_query("SELECT id FROM bots WHERE name = $1")
.bind::<diesel::sql_types::Text, _>(&bot_name)
.get_result(&mut conn);
let bot_id = match bot_id_result {
Ok(result) => result.id,
Err(_) => continue, // Skip if bot not found
};
// Scan .gbdialog directory for .bas files
let dialog_dir = path.join(format!("{}.gbdialog", bot_name));
if dialog_dir.exists() {
self.scan_directory_for_websites(&dialog_dir, bot_id, &mut conn)?;
}
}
}
Ok(())
}
fn scan_directory_for_websites(
&self,
dir: &std::path::Path,
bot_id: uuid::Uuid,
conn: &mut diesel::PgConnection,
) -> Result<(), Box<dyn std::error::Error>> {
for entry in std::fs::read_dir(dir)? {
let entry = entry?;
let path = entry.path();
if path.extension().map_or(false, |ext| ext == "bas") {
let content = std::fs::read_to_string(&path)?;
// Regex to find USE WEBSITE commands with optional REFRESH parameter
let re = regex::Regex::new(r#"USE\s+WEBSITE\s+"([^"]+)"(?:\s+REFRESH\s+"([^"]+)")?"#)?;
for cap in re.captures_iter(&content) {
if let Some(url) = cap.get(1) {
let url_str = url.as_str();
let refresh_str = cap.get(2).map(|m| m.as_str()).unwrap_or("1m");
// Check if already registered
let exists = diesel::sql_query(
"SELECT COUNT(*) as count FROM website_crawls WHERE bot_id = $1 AND url = $2"
)
.bind::<diesel::sql_types::Uuid, _>(&bot_id)
.bind::<diesel::sql_types::Text, _>(url_str)
.get_result::<CountResult>(conn)
.map(|r| r.count)
.unwrap_or(0);
if exists == 0 {
info!("Auto-registering website {} for bot {} with refresh: {}", url_str, bot_id, refresh_str);
// Register website for crawling with refresh policy
crate::basic::keywords::use_website::register_website_for_crawling_with_refresh(
conn, &bot_id, url_str, refresh_str
)?;
}
}
}
}
}
Ok(())
}
}
#[derive(QueryableByName)]
struct CountResult {
#[diesel(sql_type = diesel::sql_types::BigInt)]
count: i64,
}
#[derive(QueryableByName, Debug)]
@ -219,6 +321,8 @@ struct WebsiteCrawlRecord {
url: String,
#[diesel(sql_type = diesel::sql_types::Text)]
expires_policy: String,
#[diesel(sql_type = diesel::sql_types::Nullable<diesel::sql_types::Text>)]
refresh_policy: Option<String>,
#[diesel(sql_type = diesel::sql_types::Integer)]
max_depth: i32,
#[diesel(sql_type = diesel::sql_types::Integer)]