diff --git a/src/prompts/business/data-enrichment.bas b/src/prompts/business/data-enrichment.bas index 8c20be4..07e166d 100644 --- a/src/prompts/business/data-enrichment.bas +++ b/src/prompts/business/data-enrichment.bas @@ -9,8 +9,8 @@ FOR EACH item IN items let page = GET website let prompt = "Create a website for " + item.company + " with the following details: " + page - - let alias = LLM "Return a single word for {item.company} like a token, no spaces, no special characters, no numbers, no uppercase letters." + + let alias = LLM "Return a single word for " + item.company + " like a token, no spaces, no special characters, no numbers, no uppercase letters." CREATE SITE item.company + "bot", item.company, website, "site", prompt diff --git a/src/services/keywords/create_site.rs b/src/services/keywords/create_site.rs index 4bf1d8a..82ddfdb 100644 --- a/src/services/keywords/create_site.rs +++ b/src/services/keywords/create_site.rs @@ -1,11 +1,14 @@ use rhai::Dynamic; use rhai::Engine; +use std::error::Error; use std::fs; use std::path::Path; use crate::services::state::AppState; +use crate::services::utils; -pub fn create_site_keyword(_state: &AppState, engine: &mut Engine) { +pub fn create_site_keyword(state: &AppState, engine: &mut Engine) { + let state_clone = state.clone(); engine .register_custom_syntax( &[ @@ -19,29 +22,44 @@ pub fn create_site_keyword(_state: &AppState, engine: &mut Engine) { } let _name = context.eval_expression_tree(&inputs[0])?; - + let _website = context.eval_expression_tree(&inputs[2])?; let _template = context.eval_expression_tree(&inputs[3])?; let prompt = context.eval_expression_tree(&inputs[4])?; + let ai_config = state_clone.config.as_ref().expect("Config must be initialized").ai.clone(); + // Use the same pattern as find_keyword + let fut = create_site(&ai_config, _name, prompt); + let result = + tokio::task::block_in_place(|| tokio::runtime::Handle::current().block_on(fut)) + .map_err(|e| format!("HTTP request failed: {}", e))?; - // Call the LLM to generate the HTML content - let llm_result = context.call_fn::("chat", (prompt.to_string(),))?; - - // Create the directory structure - let base_path = "/opt/gbo/tenants/pragmatismo/proxy/data/websites/sites.pragmatismo.com.br"; - let site_name = format!("{}", _name.to_string()); - let full_path = format!("{}/{}", base_path, site_name); - - // Create directory if it doesn't exist - fs::create_dir_all(&full_path).map_err(|e| e.to_string())?; - - // Write the HTML file - let index_path = Path::new(&full_path).join("index.html"); - fs::write(index_path, llm_result).map_err(|e| e.to_string())?; - - println!("Site created at: {}", full_path); - Ok(Dynamic::UNIT) + Ok(Dynamic::from(result)) }, ) .unwrap(); -} \ No newline at end of file +} + +async fn create_site( + ai_config: &crate::services::config::AIConfig, + _name: Dynamic, + prompt: Dynamic, +) -> Result> { + + // Call the LLM to generate the HTML contents + let llm_result = utils::call_llm(&prompt.to_string(), &ai_config).await?; + + // Create the directory structure + let base_path = "/opt/gbo/tenants/pragmatismo/proxy/data/websites/sites.pragmatismo.com.br"; + let site_name = format!("{}", _name.to_string()); + let full_path = format!("{}/{}", base_path, site_name); + + // Create directory if it doesn't exist + fs::create_dir_all(&full_path).map_err(|e| e.to_string())?; + + // Write the HTML file + let index_path = Path::new(&full_path).join("index.html"); + fs::write(index_path, llm_result).map_err(|e| e.to_string())?; + + println!("Site created at: {}", full_path); + Ok(full_path) +} diff --git a/src/services/keywords/get_website.rs b/src/services/keywords/get_website.rs index 2ac0ac0..8e53ae3 100644 --- a/src/services/keywords/get_website.rs +++ b/src/services/keywords/get_website.rs @@ -57,35 +57,28 @@ pub async fn execute_headless_browser_search( Ok(result) } - async fn perform_search( driver: WebDriver, search_term: &str) -> Result> { - // Configure the search query - let query = search_term.to_string(); - // Navigate to DuckDuckGo - println!("Navigating to DuckDuckGo..."); driver.goto("https://duckduckgo.com").await?; // Wait for search box and type query - println!("Searching for: {}", query); - let search_input = driver.find(By::Name("q")).await?; + let search_input = driver.find(By::Id("searchbox_input")).await?; search_input.click().await?; - search_input.send_keys(&query).await?; + search_input.send_keys(search_term).await?; // Submit search by pressing Enter search_input.send_keys("\n").await?; - // Wait for results to load - driver.find(By::Css(".result")).await?; - sleep(Duration::from_millis(2000)).await; // Give extra time for JS + // Wait for results to load - using a modern result selector + driver.find(By::Css("[data-testid='result']")).await?; + sleep(Duration::from_millis(2000)).await; - // Extract first result link + // Extract results let results = extract_search_results(&driver).await?; if !results.is_empty() { - println!("Found {} results", results.len()); Ok(results[0].clone()) } else { Ok("No results found".to_string()) @@ -97,20 +90,42 @@ async fn extract_search_results( ) -> Result, Box> { let mut results = Vec::new(); - // Try different selectors for search results + // Try different selectors for search results, ordered by most specific to most general let selectors = [ - "a[data-testid='result-title-a']", // Modern DuckDuckGo - ".result__a", // Classic DuckDuckGo - "a.result-link", // Alternative - ".result a[href]", // Generic result links + // Modern DuckDuckGo (as seen in the HTML) + "a[data-testid='result-title-a']", // Primary result links + "a[data-testid='result-extras-url-link']", // URL links in results + "a.eVNpHGjtxRBq_gLOfGDr", // Class-based selector for result titles + "a.Rn_JXVtoPVAFyGkcaXyK", // Class-based selector for URL links + ".ikg2IXiCD14iVX7AdZo1 a", // Heading container links + ".OQ_6vPwNhCeusNiEDcGp a", // URL container links + // Fallback selectors + ".result__a", // Classic DuckDuckGo + "a.result-link", // Alternative + ".result a[href]", // Generic result links ]; for selector in &selectors { if let Ok(elements) = driver.find_all(By::Css(selector)).await { for element in elements { if let Ok(Some(href)) = element.attr("href").await { - if href.starts_with("http") && !href.contains("duckduckgo.com") { - results.push(href); + // Filter out internal and non-http links + if href.starts_with("http") + && !href.contains("duckduckgo.com") + && !href.contains("duck.co") + && !results.contains(&href) { + + // Get the display URL for verification + let display_url = if let Ok(text) = element.text().await { + text.trim().to_string() + } else { + String::new() + }; + + // Only add if it looks like a real result (not an ad or internal link) + if !display_url.is_empty() && !display_url.contains("Ad") { + results.push(href); + } } } } @@ -120,5 +135,8 @@ async fn extract_search_results( } } + // Deduplicate results + results.dedup(); + Ok(results) -} +} \ No newline at end of file diff --git a/src/services/web_automation.rs b/src/services/web_automation.rs index b213bb7..f3f8ac3 100644 --- a/src/services/web_automation.rs +++ b/src/services/web_automation.rs @@ -47,7 +47,7 @@ impl BrowserPool { let mut caps = DesiredCapabilities::chrome(); caps.set_binary(&self.brave_path)?; - caps.add_chrome_arg("--headless=new")?; + //caps.add_chrome_arg("--headless=new")?; caps.add_chrome_arg("--disable-gpu")?; caps.add_chrome_arg("--no-sandbox")?; @@ -149,7 +149,7 @@ async fn setup_chromedriver() -> Result> { // Extract the zip to a temporary directory first let mut temp_extract_dir = std::env::temp_dir(); temp_extract_dir.push("chromedriver_extract"); - let mut temp_extract_dir1 = temp_extract_dir.clone(); + let temp_extract_dir1 = temp_extract_dir.clone(); // Clean up any previous extraction let _ = fs::remove_dir_all(&temp_extract_dir).await;