botserver/src/basic/keywords/add_website.rs

166 lines
5.7 KiB
Rust
Raw Normal View History

2025-10-18 18:20:02 -03:00
use crate::shared::models::UserSession;
use crate::shared::state::AppState;
#[cfg(feature = "web_automation")]
use crate::web_automation::WebCrawler;
use log::{error, info};
use rhai::{Dynamic, Engine};
use std::sync::Arc;
/// Registers the `ADD_WEBSITE <expr>` custom syntax on the given Rhai engine.
///
/// When a script executes the keyword, the expression is evaluated to a URL,
/// the URL is validated, and `crawl_and_index_website` is run for it on a
/// dedicated thread with its own tokio runtime. The script call blocks until
/// that work finishes or 120 seconds elapse. On success the keyword evaluates
/// to the success message; every failure is surfaced to the script as a Rhai
/// runtime error.
///
/// # Panics
///
/// Panics if the engine rejects the custom syntax registration.
pub fn add_website_keyword(state: Arc<AppState>, user: UserSession, engine: &mut Engine) {
    use std::sync::mpsc::{self, RecvTimeoutError};

    /// Upper bound on how long a script waits for the crawl to finish.
    const CRAWL_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(120);

    /// Wraps a message in the runtime error type handed back to the script.
    fn runtime_error(message: String) -> Box<rhai::EvalAltResult> {
        Box::new(rhai::EvalAltResult::ErrorRuntime(
            message.into(),
            rhai::Position::NONE,
        ))
    }

    engine
        .register_custom_syntax(&["ADD_WEBSITE", "$expr$"], false, move |context, inputs| {
            let url = context.eval_expression_tree(&inputs[0])?;
            let url_str = url.to_string().trim_matches('"').to_string();
            info!(
                "ADD_WEBSITE command executed: {} for user: {}",
                url_str, user.user_id
            );

            // Validate the URL; without the crawler only the scheme can be checked.
            #[cfg(feature = "web_automation")]
            let is_valid = WebCrawler::is_valid_url(&url_str);
            #[cfg(not(feature = "web_automation"))]
            let is_valid = url_str.starts_with("http://") || url_str.starts_with("https://");
            if !is_valid {
                return Err(runtime_error(
                    "Invalid URL format. Must start with http:// or https://".to_string(),
                ));
            }

            // The closure can run many times, so each invocation hands its own
            // copies of the shared data to the worker thread.
            let state_for_task = Arc::clone(&state);
            let user_for_task = user.clone();
            let url_for_task = url_str;

            // Run the async crawl on a separate thread with a private runtime
            // and wait for its result over a channel.
            let (tx, rx) = mpsc::channel();
            std::thread::spawn(move || {
                let runtime = tokio::runtime::Builder::new_multi_thread()
                    .worker_threads(2)
                    .enable_all()
                    .build();
                let send_failed = match runtime {
                    Ok(rt) => {
                        let result = rt.block_on(crawl_and_index_website(
                            &state_for_task,
                            &user_for_task,
                            &url_for_task,
                        ));
                        // Reply while `rt` is still alive so that runtime
                        // shutdown cannot delay the answer to the script.
                        tx.send(result).is_err()
                    }
                    // Keep the underlying cause instead of a bare message.
                    Err(e) => tx
                        .send(Err(format!("Failed to build tokio runtime: {}", e)))
                        .is_err(),
                };
                // Sending fails only when the receiver is gone, e.g. after the
                // script side has already timed out.
                if send_failed {
                    error!("Failed to send result from thread");
                }
            });

            match rx.recv_timeout(CRAWL_TIMEOUT) {
                Ok(Ok(message)) => {
                    info!("ADD_WEBSITE completed: {}", message);
                    Ok(Dynamic::from(message))
                }
                Ok(Err(e)) => Err(runtime_error(e)),
                // The worker thread is detached; it keeps running after the
                // script has been told about the timeout.
                Err(RecvTimeoutError::Timeout) => {
                    Err(runtime_error("ADD_WEBSITE timed out".to_string()))
                }
                // The sender was dropped without a reply (worker thread ended
                // before sending).
                Err(e @ RecvTimeoutError::Disconnected) => {
                    Err(runtime_error(format!("ADD_WEBSITE failed: {}", e)))
                }
            }
        })
        .expect("registering the ADD_WEBSITE custom syntax must succeed");
}
/// Crawls `url`, indexes its text into a per-user collection, and associates
/// the resulting knowledge base with the user.
///
/// The knowledge-base name is `website_` followed by a slug of the URL: a
/// leading `https://` or `http://` is dropped, every character other than a
/// letter, digit, `-` or `_` is replaced by `_`, and the slug is cut to 50
/// characters. The collection name is `kb_{bot_id}_{user_id}_{kb_name}`.
/// Because of the truncation and replacement, distinct URLs can map to the
/// same knowledge-base name.
///
/// Returns a human-readable success message containing the URL and the number
/// of bytes of extracted text.
///
/// # Errors
///
/// Returns an error message when the `web_automation` feature is disabled,
/// when crawling fails or yields only whitespace, or when creating the
/// collection, indexing the document, or associating the knowledge base with
/// the user fails.
async fn crawl_and_index_website(
    _state: &AppState,
    user: &UserSession,
    url: &str,
) -> Result<String, String> {
    info!("Crawling website: {} for user: {}", url, user.user_id);

    // Without the crawler there is nothing to do; report how to enable it.
    #[cfg(not(feature = "web_automation"))]
    {
        return Err(
            "Web automation feature not enabled. Recompile with --features web_automation"
                .to_string(),
        );
    }

    // Fetch and index the website content (only compiled if feature enabled).
    #[cfg(feature = "web_automation")]
    {
        let crawler = WebCrawler::new();
        let text_content = crawler
            .crawl(url)
            .await
            .map_err(|e| format!("Failed to crawl website: {}", e))?;
        if text_content.trim().is_empty() {
            return Err("No text content found on website".to_string());
        }
        info!(
            "Extracted {} characters of text from website",
            text_content.len()
        );

        // Strip only the leading scheme; a scheme appearing later in the URL
        // (e.g. inside a query string) is part of the address and is kept.
        let without_scheme = url
            .strip_prefix("https://")
            .or_else(|| url.strip_prefix("http://"))
            .unwrap_or(url);

        // Build the slug from a conservative character set so that ports,
        // query strings and fragments cannot leak `:`, `?`, `#`, `%`, ... into
        // the collection name. URLs made only of letters, digits, `/`, `.`,
        // `-` and `_` produce the same name as before.
        // NOTE(review): Qdrant is understood to reject collection names that
        // contain characters such as `:` or `?` — confirm against the deployed
        // version.
        let kb_slug: String = without_scheme
            .chars()
            .map(|c| {
                if c.is_alphanumeric() || c == '-' || c == '_' {
                    c
                } else {
                    '_'
                }
            })
            .take(50)
            .collect();
        let kb_name = format!("website_{}", kb_slug);

        // Collection name for this user's website KB.
        let collection_name = format!("kb_{}_{}_{}", user.bot_id, user.user_id, kb_name);

        // Ensure the collection exists in Qdrant before indexing into it.
        crate::kb::qdrant_client::ensure_collection_exists(_state, &collection_name)
            .await
            .map_err(|e| format!("Failed to create Qdrant collection: {}", e))?;

        // Index the crawled text under the original URL.
        crate::kb::embeddings::index_document(_state, &collection_name, url, &text_content)
            .await
            .map_err(|e| format!("Failed to index document: {}", e))?;

        // Associate the KB with the user (not the session).
        add_website_kb_to_user(_state, user, &kb_name, url)
            .await
            .map_err(|e| format!("Failed to associate KB with user: {}", e))?;

        info!(
            "Website indexed successfully to collection: {}",
            collection_name
        );
        Ok(format!(
            "Website '{}' crawled and indexed successfully ({} characters)",
            url,
            text_content.len()
        ))
    }
}