new(all): Auto import for logo, colors and website content.
parent 7e6b2807a1
commit c374f27567

1 changed file with 16 additions and 7 deletions
@@ -879,7 +879,7 @@ export class KBService implements IGBKBService {
     return null;
   }
 
-  async crawl(min, url: string, visited: Set<string>, depth: number, maxDepth: number, page: Page): Promise<string[]> {
+  async crawl(min, url: string, visited: Set<string>, depth: number, maxDepth: number, page: Page, websiteIgnoreUrls): Promise<string[]> {
     try {
       if (
         depth > maxDepth ||
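`crawl` gains an untyped `websiteIgnoreUrls` parameter. Since `crawl` is recursive, any recursive call site (not visible in this diff) has to forward the new argument as well, or deeper levels see `undefined`. A minimal sketch of that threading pattern, with hypothetical names (`CrawlOptions`, `fetchLinks`) standing in for the Puppeteer-specific pieces of the real method:

```ts
// Sketch only, not the project's code: threading crawl options through recursion.
interface CrawlOptions {
  maxDepth: number;
  ignoreUrls: string[]; // parsed once, up front
}

async function crawl(
  url: string,
  visited: Set<string>,
  depth: number,
  opts: CrawlOptions,
  fetchLinks: (url: string) => Promise<string[]>
): Promise<string[]> {
  if (depth > opts.maxDepth || visited.has(url)) return [];
  visited.add(url);

  // Skip any link that contains one of the ignored substrings.
  const links = (await fetchLinks(url)).filter(
    l => !opts.ignoreUrls.some(ignored => l.includes(ignored))
  );

  const collected: string[] = [url];
  for (const link of links) {
    // Forward opts on every recursive call; dropping it here is the
    // easiest way to lose the ignore list at depth >= 1.
    collected.push(...(await crawl(link, visited, depth + 1, opts, fetchLinks)));
  }
  return collected;
}
```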
@@ -904,10 +904,18 @@ export class KBService implements IGBKBService {
         return [];
       }
       const currentDomain = new URL(page.url()).hostname;
-      let links = await page.evaluate(currentDomain => {
+
+
+      let links = await page.evaluate(({currentDomain, websiteIgnoreUrls}) => {
         const anchors = Array.from(document.querySelectorAll('a')).filter(p => {
           try {
-            return currentDomain == new URL(p.href).hostname;
+
+            // Check whether the candidate href contains any of the ignored URLs
+
+            const isIgnored = websiteIgnoreUrls.split(";").some(ignoredUrl => p.href.includes(ignoredUrl));
+
+            return !isIgnored && currentDomain == new URL(p.href).hostname;
+
           } catch (err) {
             return false;
           }
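Note that `websiteIgnoreUrls.split(";")` runs on whatever value was configured, and the later hunk reads the parameter with a `null` default. A `null` here throws inside the `try`, the `catch` returns `false` for every anchor, and the crawler collects no links at all. A null-safe variant of the predicate, as a sketch (names are illustrative):

```ts
// Sketch: null-safe version of the filter predicate from the hunk above.
const isIgnored = (href: string, websiteIgnoreUrls: string | null): boolean =>
  (websiteIgnoreUrls ?? '')
    .split(';')
    .filter(s => s.trim().length > 0) // drop empty entries from 'a;;b' or a trailing ';'
    .some(ignoredUrl => href.includes(ignoredUrl));

console.log(isIgnored('https://example.com/admin/users', '/login;/admin')); // true
console.log(isIgnored('https://example.com/docs', null));                   // false
```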
@@ -916,7 +924,7 @@ export class KBService implements IGBKBService {
         return anchors.map(anchor => {
           return anchor.href.replace(/#.*/, '');
         });
-      }, currentDomain);
+      }, {currentDomain, websiteIgnoreUrls});
 
       if (!Array.isArray(links)) {
         links = [];
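The closing `}, {currentDomain, websiteIgnoreUrls});` completes the pattern: Puppeteer serializes the argument passed after the `page.evaluate` callback into the browser context, so sending two values means bundling them into one object and destructuring it in the callback. A self-contained sketch of the same pattern (assuming Puppeteer; `collectLinks` is a hypothetical helper, not the project's code):

```ts
import puppeteer from 'puppeteer';

// Sketch: several values travel into page.evaluate() as one serializable object.
async function collectLinks(startUrl: string, websiteIgnoreUrls: string): Promise<string[]> {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto(startUrl);

  const currentDomain = new URL(page.url()).hostname;
  const links = await page.evaluate(
    ({ currentDomain, websiteIgnoreUrls }) =>
      Array.from(document.querySelectorAll('a'))
        .map(a => a.href.replace(/#.*/, '')) // strip fragments, as the diff does
        .filter(href => {
          try {
            const ignored = websiteIgnoreUrls.split(';').some(u => u && href.includes(u));
            return !ignored && new URL(href).hostname === currentDomain;
          } catch {
            return false; // malformed href
          }
        }),
    { currentDomain, websiteIgnoreUrls } // the single serialized argument
  );

  await browser.close();
  return links;
}
```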
@@ -1009,7 +1017,8 @@ export class KBService implements IGBKBService {
 
 
     const website = min.core.getParam<string>(min.instance, 'Website', null);
+    const websiteIgnoreUrls = min.core.getParam<string>(min.instance, 'Website Ignore URLs', null);
 
     if (website) {
 
       Fs.rmSync(min['vectorStorePath'], { recursive: true, force: true });
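The new 'Website Ignore URLs' parameter is read the same way as 'Website' and, per the filter above, is interpreted as a semicolon-separated list of substrings. A hypothetical configuration and how it parses:

```ts
// Hypothetical values for illustration only.
const website = 'https://example.com';
const websiteIgnoreUrls = 'https://example.com/login;/admin;?lang=';

// Any crawled href containing one of these substrings is skipped.
console.log(websiteIgnoreUrls.split(';'));
// [ 'https://example.com/login', '/admin', '?lang=' ]
```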
@@ -1063,10 +1072,10 @@ export class KBService implements IGBKBService {
 
       page.setDefaultTimeout(15000);
       page.setCacheEnabled(false);
 
       const maxDepth = 2; // Maximum depth of recursion
       const visited = new Set<string>();
-      files = files.concat(await this.crawl(min, website, visited, 0, maxDepth, page));
+      files = files.concat(await this.crawl(min, website, visited, 0, maxDepth, page, websiteIgnoreUrls));
 
       await browser.close();
 
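The top-level call forwards the raw parameter. Worth noting: with the predicate as committed, defaulting `null` to `''` at this call site would not be a fix either, because `''.split(';')` yields `['']` and every href contains the empty string, so all links would be treated as ignored. A short demonstration of why empty entries must be filtered out (as in the earlier sketch):

```ts
// Why both null and '' misbehave with the committed predicate:
const href = 'https://example.com/docs';
console.log(''.split(';'));                    // [ '' ]
console.log([''].some(u => href.includes(u))); // true: every string includes ''
```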