From c374f27567144e6ba8dec7a62ebdbf31d9dd58d9 Mon Sep 17 00:00:00 2001 From: Rodrigo Rodriguez Date: Wed, 22 May 2024 13:21:29 -0300 Subject: [PATCH] new(all): Auto import for logo, colors and website content. --- packages/kb.gbapp/services/KBService.ts | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/packages/kb.gbapp/services/KBService.ts b/packages/kb.gbapp/services/KBService.ts index 91ba8c4f..f21c56c2 100644 --- a/packages/kb.gbapp/services/KBService.ts +++ b/packages/kb.gbapp/services/KBService.ts @@ -879,7 +879,7 @@ export class KBService implements IGBKBService { return null; } - async crawl(min, url: string, visited: Set, depth: number, maxDepth: number, page: Page): Promise { + async crawl(min, url: string, visited: Set, depth: number, maxDepth: number, page: Page, websiteIgnoreUrls): Promise { try { if ( depth > maxDepth || @@ -904,10 +904,18 @@ export class KBService implements IGBKBService { return []; } const currentDomain = new URL(page.url()).hostname; - let links = await page.evaluate(currentDomain => { + + + let links = await page.evaluate(({currentDomain, websiteIgnoreUrls}) => { const anchors = Array.from(document.querySelectorAll('a')).filter(p => { try { - return currentDomain == new URL(p.href).hostname; + + // Skip anchors whose href contains any non-empty entry of the ';'-separated ignore list. An unset (null) or empty list ignores nothing; blank entries are dropped because ''.includes-style matching would otherwise reject every link. + + const isIgnored = (websiteIgnoreUrls || '').split(';').map(ignoredUrl => ignoredUrl.trim()).filter(ignoredUrl => ignoredUrl !== '').some(ignoredUrl => p.href.includes(ignoredUrl)); + + return !isIgnored && currentDomain == new URL(p.href).hostname; + } catch (err) { return false; } @@ -916,7 +924,7 @@ export class KBService implements IGBKBService { return anchors.map(anchor => { return anchor.href.replace(/#.*/, ''); }); - }, currentDomain); + }, {currentDomain, websiteIgnoreUrls}); if (!Array.isArray(links)) { links = []; @@ -1009,7 +1017,8 @@ export class KBService implements IGBKBService { const website = min.core.getParam(min.instance, 'Website', null); - + const websiteIgnoreUrls = min.core.getParam(min.instance, 
'Website Ignore URLs', null); + if (website) { Fs.rmSync(min['vectorStorePath'], { recursive: true, force: true }); @@ -1063,10 +1072,10 @@ export class KBService implements IGBKBService { page.setDefaultTimeout(15000); page.setCacheEnabled(false); - + const maxDepth = 2; // Maximum depth of recursion const visited = new Set(); - files = files.concat(await this.crawl(min, website, visited, 0, maxDepth, page, websiteIgnoreUrls)); + files = files.concat(await this.crawl(min, website, visited, 0, maxDepth, page, websiteIgnoreUrls)); await browser.close();