Fix issues: remove unused import, fix ownership error, reduce crawler interval

2026-01-30 12:21:30 -03:00 · 2026-01-30 12:21:30 -03:00 · 748fceff5d
commit 748fceff5d
parent 94fede7cc4
4 changed files with 359 additions and 21 deletions
--- a/migrations/core/6.2.1/down.sql
+++ b/migrations/core/6.2.1/down.sql
@ -0,0 +1,3 @@
 -- Rollback for migration 6.2.1: remove the refresh_policy column from the website_crawls table.
 -- IF EXISTS keeps the statement from failing when the column is already absent.
 ALTER TABLE website_crawls
 DROP COLUMN IF EXISTS refresh_policy;
--- a/migrations/core/6.2.1/up.sql
+++ b/migrations/core/6.2.1/up.sql
@ -0,0 +1,13 @@
 -- Migration 6.2.1 (up): add the refresh_policy column to the website_crawls table.
 -- This column stores the user-configured refresh interval as text (e.g., "1d", "1w", "1m", "1y").
 -- IF NOT EXISTS keeps the statement from failing when the column is already present.
 -- NOTE(review): no DEFAULT is declared, so rows inserted later without a value stay NULL.
 ALTER TABLE website_crawls
 ADD COLUMN IF NOT EXISTS refresh_policy VARCHAR(20);
 -- Backfill: give every existing record without a policy the default refresh policy (1 month).
 UPDATE website_crawls
 SET refresh_policy = '1m'
 WHERE refresh_policy IS NULL;
 -- Attach a description to the column for documentation.
 COMMENT ON COLUMN website_crawls.refresh_policy IS 'User-configured refresh interval (e.g., "1d", "1w", "1m", "1y") - shortest interval is used when duplicates exist';
--- a/src/basic/keywords/use_website.rs
+++ b/src/basic/keywords/use_website.rs
@ -6,21 +6,64 @@ use rhai::{Dynamic, Engine};
 use std::sync::Arc;
 use uuid::Uuid;
 /// Parses a refresh interval such as `"1d"`, `"2w"`, `"1m"` or `"1y"` into a number of days.
 ///
 /// Surrounding whitespace is ignored and the unit letter is case-insensitive. A value with
 /// no unit letter is interpreted as a number of days. Months are approximated as 30 days
 /// and years as 365 days.
 ///
 /// # Errors
 ///
 /// Returns a descriptive message when the numeric part is not a valid integer, when the
 /// interval is shorter than one day (zero or negative), or when the resulting number of
 /// days does not fit in an `i32`.
 fn parse_refresh_interval(interval: &str) -> Result<i32, String> {
    let normalized = interval.trim().to_lowercase();

    // Split the optional trailing unit letter from the numeric part and remember how many
    // days one unit is worth. An empty label marks the "plain number of days" form.
    let (count_text, days_per_unit, unit_label) =
        if let Some(rest) = normalized.strip_suffix('d') {
            (rest, 1, "days")
        } else if let Some(rest) = normalized.strip_suffix('w') {
            (rest, 7, "weeks")
        } else if let Some(rest) = normalized.strip_suffix('m') {
            (rest, 30, "months") // Approximate month as 30 days
        } else if let Some(rest) = normalized.strip_suffix('y') {
            (rest, 365, "years") // Approximate year as 365 days
        } else {
            (normalized.as_str(), 1, "")
        };

    let count: i32 = count_text.parse().map_err(|_| {
        if unit_label.is_empty() {
            format!(
                "Invalid refresh interval format: {}. Use format like '1d', '1w', '1m', '1y'",
                interval
            )
        } else {
            format!("Invalid {} format: {}", unit_label, interval)
        }
    })?;

    // A zero or negative interval would make the website permanently due for a re-crawl.
    if count < 1 {
        return Err(format!(
            "Refresh interval must be at least one day: {}",
            interval
        ));
    }

    // Checked multiplication: a huge count must be reported, not wrap around or panic.
    count
        .checked_mul(days_per_unit)
        .ok_or_else(|| format!("Refresh interval is too large: {}", interval))
 }
 /// Renders a day count in the `expires_policy` notation: the number followed by `d`.
 fn days_to_expires_policy(days: i32) -> String {
    let mut policy = days.to_string();
    policy.push('d');
    policy
 }
 pub fn use_website_keyword(state: Arc<AppState>, user: UserSession, engine: &mut Engine) {
    let state_clone = Arc::clone(&state);
-    let user_clone = user;
+    let user_clone = user.clone();
    // Register syntax for USE WEBSITE "url" REFRESH "interval"
    engine
        .register_custom_syntax(
-            ["USE", "WEBSITE", "$expr$"],
+            ["USE", "WEBSITE", "$expr$", "REFRESH", "$expr$"],
            false,
            move |context, inputs| {
                let url = context.eval_expression_tree(&inputs[0])?;
                let url_str = url.to_string().trim_matches('"').to_string();
                let refresh = context.eval_expression_tree(&inputs[1])?;
                let refresh_str = refresh.to_string().trim_matches('"').to_string();
                trace!(
-                    "USE WEBSITE command executed: {} for session: {}",
+                    "USE WEBSITE command executed: {} REFRESH {} for session: {}",
                    url_str,
                    refresh_str,
                    user_clone.id
                );
@ -35,6 +78,83 @@ pub fn use_website_keyword(state: Arc<AppState>, user: UserSession, engine: &mut
                let state_for_task = Arc::clone(&state_clone);
                let user_for_task = user_clone.clone();
                let url_for_task = url_str;
                let refresh_for_task = refresh_str;
                let (tx, rx) = std::sync::mpsc::channel();
                std::thread::spawn(move || {
                    let rt = tokio::runtime::Builder::new_multi_thread()
                        .worker_threads(2)
                        .enable_all()
                        .build();
                    let send_err = if let Ok(_rt) = rt {
                        let result = associate_website_with_session_refresh(
                            &state_for_task,
                            &user_for_task,
                            &url_for_task,
                            &refresh_for_task,
                        );
                        tx.send(result).err()
                    } else {
                        tx.send(Err("Failed to build tokio runtime".to_string()))
                            .err()
                    };
                    if send_err.is_some() {
                        error!("Failed to send result from thread");
                    }
                });
                match rx.recv_timeout(std::time::Duration::from_secs(10)) {
                    Ok(Ok(message)) => Ok(Dynamic::from(message)),
                    Ok(Err(e)) => Err(Box::new(rhai::EvalAltResult::ErrorRuntime(
                        e.into(),
                        rhai::Position::NONE,
                    ))),
                    Err(std::sync::mpsc::RecvTimeoutError::Timeout) => {
                        Err(Box::new(rhai::EvalAltResult::ErrorRuntime(
                            "USE WEBSITE timed out".into(),
                            rhai::Position::NONE,
                        )))
                    }
                    Err(e) => Err(Box::new(rhai::EvalAltResult::ErrorRuntime(
                        format!("USE WEBSITE failed: {}", e).into(),
                        rhai::Position::NONE,
                    ))),
                }
            },
        )
        .expect("valid syntax registration");
    // Register syntax for USE WEBSITE "url" (without REFRESH)
    let state_clone2 = Arc::clone(&state);
    let user_clone2 = user.clone();
    engine
        .register_custom_syntax(
            ["USE", "WEBSITE", "$expr$"],
            false,
            move |context, inputs| {
                let url = context.eval_expression_tree(&inputs[0])?;
                let url_str = url.to_string().trim_matches('"').to_string();
                trace!(
                    "USE WEBSITE command executed: {} for session: {}",
                    url_str,
                    user_clone2.id
                );
                let is_valid = url_str.starts_with("http://") || url_str.starts_with("https://");
                if !is_valid {
                    return Err(Box::new(rhai::EvalAltResult::ErrorRuntime(
                        "Invalid URL format. Must start with http:// or https://".into(),
                        rhai::Position::NONE,
                    )));
                }
                let state_for_task = Arc::clone(&state_clone2);
                let user_for_task = user_clone2.clone();
                let url_for_task = url_str;
                let (tx, rx) = std::sync::mpsc::channel();
                std::thread::spawn(move || {
@ -87,7 +207,16 @@ fn associate_website_with_session(
    user: &UserSession,
    url: &str,
 ) -> Result<String, String> {
-    info!("Associating website {} with session {}", url, user.id);
+    associate_website_with_session_refresh(state, user, url, "1m") // Default: 1 month
 }
 fn associate_website_with_session_refresh(
    state: &AppState,
    user: &UserSession,
    url: &str,
    refresh_interval: &str,
 ) -> Result<String, String> {
    info!("Associating website {} with session {} (refresh: {})", url, user.id, refresh_interval);
    let mut conn = state.conn.get().map_err(|e| format!("DB error: {}", e))?;
@ -97,16 +226,25 @@ fn associate_website_with_session(
    match website_status {
        WebsiteCrawlStatus::NotRegistered => {
-            return Err(format!(
+            // Auto-register website for crawling instead of failing
-                "Website {} has not been registered for crawling. It should be added to the script for preprocessing.",
+            info!("Website {} not registered, auto-registering for crawling with refresh: {}", url, refresh_interval);
-                url
+            register_website_for_crawling_with_refresh(&mut conn, &user.bot_id, url, refresh_interval)
                .map_err(|e| format!("Failed to register website: {}", e))?;
            return Ok(format!(
                "Website {} has been registered for crawling (refresh: {}). It will be available once crawling completes.",
                url, refresh_interval
            ));
        }
        WebsiteCrawlStatus::Pending => {
            info!("Website {} is pending crawl, associating anyway", url);
            // Update refresh policy if needed
            update_refresh_policy_if_shorter(&mut conn, &user.bot_id, url, refresh_interval)?;
        }
        WebsiteCrawlStatus::Crawled => {
            info!("Website {} is already crawled and ready", url);
            // Update refresh policy if needed
            update_refresh_policy_if_shorter(&mut conn, &user.bot_id, url, refresh_interval)?;
        }
        WebsiteCrawlStatus::Failed => {
            return Err(format!(
@ -165,26 +303,96 @@ pub fn register_website_for_crawling(
    bot_id: &Uuid,
    url: &str,
 ) -> Result<(), String> {
-    let expires_policy = "1d";
+    register_website_for_crawling_with_refresh(conn, bot_id, url, "1m") // Default: 1 month
 }
 /// Registers `url` for crawling on behalf of `bot_id` with the given refresh interval.
 ///
 /// `refresh_interval` is parsed into days with `parse_refresh_interval` and stored twice:
 /// as the derived `expires_policy` (`"<days>d"`) and verbatim as `refresh_policy`.
 /// The statement is an upsert keyed on `(bot_id, url)`.
 ///
 /// # Errors
 ///
 /// Returns a message when the interval cannot be parsed or the statement fails.
 pub fn register_website_for_crawling_with_refresh(
    conn: &mut PgConnection,
    bot_id: &Uuid,
    url: &str,
    refresh_interval: &str,
 ) -> Result<(), String> {
    let days = parse_refresh_interval(refresh_interval)
        .map_err(|e| format!("Invalid refresh interval: {}", e))?;
    let expires_policy = days_to_expires_policy(days);
    // NOTE(review): in the conflict branch below, LEAST() receives two text values
    // (`refresh_policy` and `$4` are both bound/stored as text), so it compares by string
    // ordering rather than by duration -- e.g. "1y" sorts before "2d". Confirm whether the
    // shorter interval should instead be chosen in Rust, as update_refresh_policy_if_shorter does.
    // NOTE(review): the conflict branch updates next_crawl and refresh_policy only;
    // expires_policy keeps its previous value -- verify this is intended.
    let query = diesel::sql_query(
-        "INSERT INTO website_crawls (id, bot_id, url, expires_policy, crawl_status, next_crawl)
+        "INSERT INTO website_crawls (id, bot_id, url, expires_policy, crawl_status, next_crawl, refresh_policy)
-         VALUES (gen_random_uuid(), $1, $2, $3, 0, NOW())
+         VALUES (gen_random_uuid(), $1, $2, $3, 0, NOW(), $4)
-         ON CONFLICT (bot_id, url) DO UPDATE SET next_crawl =
+         ON CONFLICT (bot_id, url) DO UPDATE SET
-         CASE
+            next_crawl = CASE
-            WHEN website_crawls.crawl_status = 2 THEN NOW()  -- Failed, retry now
+                WHEN website_crawls.crawl_status = 2 THEN NOW()  -- Failed, retry now
-            ELSE website_crawls.next_crawl  -- Keep existing schedule
+                ELSE website_crawls.next_crawl  -- Keep existing schedule
-         END",
+            END,
            refresh_policy = CASE
                WHEN website_crawls.refresh_policy IS NULL THEN $4
                ELSE LEAST(website_crawls.refresh_policy, $4)  -- Use shorter interval
            END",
    )
    .bind::<diesel::sql_types::Uuid, _>(bot_id)
    .bind::<diesel::sql_types::Text, _>(url)
-    .bind::<diesel::sql_types::Text, _>(expires_policy);
+    .bind::<diesel::sql_types::Text, _>(expires_policy)
    .bind::<diesel::sql_types::Text, _>(refresh_interval);
    query
        .execute(conn)
        .map_err(|e| format!("Failed to register website for crawling: {}", e))?;
-    info!("Website {} registered for crawling for bot {}", url, bot_id);
+    info!("Website {} registered for crawling for bot {} with refresh policy: {}", url, bot_id, refresh_interval);
    Ok(())
 }
 /// Update refresh policy if the new interval is shorter than the existing one
 fn update_refresh_policy_if_shorter(
    conn: &mut PgConnection,
    bot_id: &Uuid,
    url: &str,
    refresh_interval: &str,
 ) -> Result<(), String> {
    // Get current record to compare in Rust (no SQL business logic!)
    #[derive(QueryableByName)]
    struct CurrentRefresh {
        #[diesel(sql_type = diesel::sql_types::Nullable<diesel::sql_types::Text>)]
        refresh_policy: Option<String>,
    }
    let current = diesel::sql_query(
        "SELECT refresh_policy FROM website_crawls WHERE bot_id = $1 AND url = $2"
    )
    .bind::<diesel::sql_types::Uuid, _>(bot_id)
    .bind::<diesel::sql_types::Text, _>(url)
    .get_result::<CurrentRefresh>(conn)
    .ok();
    let new_days = parse_refresh_interval(refresh_interval)
        .map_err(|e| format!("Invalid refresh interval: {}", e))?;
    // Check if we should update (no policy exists or new interval is shorter)
    let should_update = match &current {
        Some(c) if c.refresh_policy.is_some() => {
            let existing_days = parse_refresh_interval(c.refresh_policy.as_ref().unwrap())
                .unwrap_or(i32::MAX);
            new_days < existing_days
        }
        _ => true, // No existing policy, so update
    };
    if should_update {
        let expires_policy = days_to_expires_policy(new_days);
        diesel::sql_query(
            "UPDATE website_crawls SET refresh_policy = $3, expires_policy = $4
             WHERE bot_id = $1 AND url = $2"
        )
        .bind::<diesel::sql_types::Uuid, _>(bot_id)
        .bind::<diesel::sql_types::Text, _>(url)
        .bind::<diesel::sql_types::Text, _>(refresh_interval)
        .bind::<diesel::sql_types::Text, _>(expires_policy)
        .execute(conn)
        .map_err(|e| format!("Failed to update refresh policy: {}", e))?;
    }
    Ok(())
 }
@ -193,7 +401,16 @@ pub fn execute_use_website_preprocessing(
    url: &str,
    bot_id: Uuid,
 ) -> Result<serde_json::Value, Box<dyn std::error::Error>> {
-    trace!("Preprocessing USE_WEBSITE: {}, bot_id: {:?}", url, bot_id);
+    execute_use_website_preprocessing_with_refresh(conn, url, bot_id, "1m") // Default: 1 month
 }
 pub fn execute_use_website_preprocessing_with_refresh(
    conn: &mut PgConnection,
    url: &str,
    bot_id: Uuid,
    refresh_interval: &str,
 ) -> Result<serde_json::Value, Box<dyn std::error::Error>> {
    trace!("Preprocessing USE_WEBSITE: {}, bot_id: {:?}, refresh: {}", url, bot_id, refresh_interval);
    if !url.starts_with("http://") && !url.starts_with("https://") {
        return Err(format!(
@ -203,12 +420,13 @@ pub fn execute_use_website_preprocessing(
        .into());
    }
-    register_website_for_crawling(conn, &bot_id, url)?;
+    register_website_for_crawling_with_refresh(conn, &bot_id, url, refresh_interval)?;
    Ok(serde_json::json!({
        "command": "use_website",
        "url": url,
        "bot_id": bot_id.to_string(),
        "refresh_policy": refresh_interval,
        "status": "registered_for_crawling"
    }))
 }
--- a/src/core/kb/website_crawler_service.rs
+++ b/src/core/kb/website_crawler_service.rs
@ -5,6 +5,7 @@ use crate::shared::state::AppState;
 use crate::shared::utils::DbPool;
 use diesel::prelude::*;
 use log::{error, info, warn};
 use regex;
 use std::sync::Arc;
 use tokio::time::{interval, Duration};
 use uuid::Uuid;
@ -22,7 +23,7 @@ impl WebsiteCrawlerService {
        Self {
            db_pool,
            kb_manager,
-            check_interval: Duration::from_secs(3600),
+            check_interval: Duration::from_secs(60),
            running: Arc::new(tokio::sync::RwLock::new(false)),
        }
    }
@ -57,10 +58,13 @@ impl WebsiteCrawlerService {
    fn check_and_crawl_websites(&self) -> Result<(), Box<dyn std::error::Error>> {
        info!("Checking for websites that need recrawling");
        // First, scan for new USE WEBSITE commands in .bas files
        self.scan_and_register_websites_from_scripts()?;
        let mut conn = self.db_pool.get()?;
        let websites = diesel::sql_query(
-            "SELECT id, bot_id, url, expires_policy, max_depth, max_pages
+            "SELECT id, bot_id, url, expires_policy, refresh_policy, max_depth, max_pages
             FROM website_crawls
             WHERE next_crawl <= NOW()
             AND crawl_status != 2
@ -116,6 +120,7 @@ impl WebsiteCrawlerService {
            max_pages: website_max_pages,
            crawl_delay_ms: 500,
            expires_policy: website.expires_policy.clone(),
            refresh_policy: website.refresh_policy.clone(),
            last_crawled: None,
            next_crawl: None,
        };
@ -207,6 +212,103 @@ impl WebsiteCrawlerService {
        Ok(())
    }
    fn scan_and_register_websites_from_scripts(&self) -> Result<(), Box<dyn std::error::Error>> {
        info!("Scanning .bas files for USE WEBSITE commands");
        let work_dir = std::path::Path::new("work");
        if !work_dir.exists() {
            return Ok(());
        }
        let mut conn = self.db_pool.get()?;
        for entry in std::fs::read_dir(work_dir)? {
            let entry = entry?;
            let path = entry.path();
            if path.is_dir() && path.file_name().unwrap().to_string_lossy().ends_with(".gbai") {
                let bot_name = path.file_name().unwrap().to_string_lossy().replace(".gbai", "");
                // Get bot_id from database
                #[derive(QueryableByName)]
                struct BotIdResult {
                    #[diesel(sql_type = diesel::sql_types::Uuid)]
                    id: uuid::Uuid,
                }
                let bot_id_result: Result<BotIdResult, _> = diesel::sql_query("SELECT id FROM bots WHERE name = $1")
                    .bind::<diesel::sql_types::Text, _>(&bot_name)
                    .get_result(&mut conn);
                let bot_id = match bot_id_result {
                    Ok(result) => result.id,
                    Err(_) => continue, // Skip if bot not found
                };
                // Scan .gbdialog directory for .bas files
                let dialog_dir = path.join(format!("{}.gbdialog", bot_name));
                if dialog_dir.exists() {
                    self.scan_directory_for_websites(&dialog_dir, bot_id, &mut conn)?;
                }
            }
        }
        Ok(())
    }
    fn scan_directory_for_websites(
        &self,
        dir: &std::path::Path,
        bot_id: uuid::Uuid,
        conn: &mut diesel::PgConnection,
    ) -> Result<(), Box<dyn std::error::Error>> {
        for entry in std::fs::read_dir(dir)? {
            let entry = entry?;
            let path = entry.path();
            if path.extension().map_or(false, |ext| ext == "bas") {
                let content = std::fs::read_to_string(&path)?;
                // Regex to find USE WEBSITE commands with optional REFRESH parameter
                let re = regex::Regex::new(r#"USE\s+WEBSITE\s+"([^"]+)"(?:\s+REFRESH\s+"([^"]+)")?"#)?;
                for cap in re.captures_iter(&content) {
                    if let Some(url) = cap.get(1) {
                        let url_str = url.as_str();
                        let refresh_str = cap.get(2).map(|m| m.as_str()).unwrap_or("1m");
                        // Check if already registered
                        let exists = diesel::sql_query(
                            "SELECT COUNT(*) as count FROM website_crawls WHERE bot_id = $1 AND url = $2"
                        )
                        .bind::<diesel::sql_types::Uuid, _>(&bot_id)
                        .bind::<diesel::sql_types::Text, _>(url_str)
                        .get_result::<CountResult>(conn)
                        .map(|r| r.count)
                        .unwrap_or(0);
                        if exists == 0 {
                            info!("Auto-registering website {} for bot {} with refresh: {}", url_str, bot_id, refresh_str);
                            // Register website for crawling with refresh policy
                            crate::basic::keywords::use_website::register_website_for_crawling_with_refresh(
                                conn, &bot_id, url_str, refresh_str
                            )?;
                        }
                    }
                }
            }
        }
        Ok(())
    }
 }
 /// Row type for raw SQL queries that select a single `COUNT(*) as count` column.
 #[derive(QueryableByName)]
 struct CountResult {
    /// Value of the column aliased as `count`; read as a SQL `BigInt`.
    #[diesel(sql_type = diesel::sql_types::BigInt)]
    count: i64,
 }
 #[derive(QueryableByName, Debug)]
@ -219,6 +321,8 @@ struct WebsiteCrawlRecord {
    url: String,
    #[diesel(sql_type = diesel::sql_types::Text)]
    expires_policy: String,
    #[diesel(sql_type = diesel::sql_types::Nullable<diesel::sql_types::Text>)]
    refresh_policy: Option<String>,
    #[diesel(sql_type = diesel::sql_types::Integer)]
    max_depth: i32,
    #[diesel(sql_type = diesel::sql_types::Integer)]