new(all): Auto import for logo, colors and website content.

This commit is contained in:
Rodrigo Rodriguez 2024-05-23 14:11:33 -03:00
parent f42e42c5e2
commit fc1bfa8e18

View file

@ -859,7 +859,7 @@ export class KBService implements IGBKBService {
if (response.headers && response.status() === 200) { if (response.headers && response.status() === 200) {
const contentType = response.headers()['content-type']; const contentType = response.headers()['content-type'];
if (contentType && contentType.includes('text/html')) { if (contentType && contentType.includes('text/html')) {
const buffer = await page.$eval('*', el => el[ 'innerText']); const buffer = await page.$eval('*', el => el['innerText']);
const urlObj = new URL(url); const urlObj = new URL(url);
const urlPath = urlObj.pathname.endsWith('/') ? urlObj.pathname.slice(0, -1) : urlObj.pathname; // Remove trailing slash if present const urlPath = urlObj.pathname.endsWith('/') ? urlObj.pathname.slice(0, -1) : urlObj.pathname; // Remove trailing slash if present
let filename = urlPath.split('/').pop() || 'index'; // Get the filename from the URL path or set it to 'index.html' as default let filename = urlPath.split('/').pop() || 'index'; // Get the filename from the URL path or set it to 'index.html' as default
@ -879,7 +879,15 @@ export class KBService implements IGBKBService {
return null; return null;
} }
async crawl(min, url: string, visited: Set<string>, depth: number, maxDepth: number, page: Page, websiteIgnoreUrls): Promise<string[]> { async crawl(
min,
url: string,
visited: Set<string>,
depth: number,
maxDepth: number,
page: Page,
websiteIgnoreUrls
): Promise<string[]> {
try { try {
if ( if (
depth > maxDepth || depth > maxDepth ||
@ -905,17 +913,15 @@ export class KBService implements IGBKBService {
} }
const currentDomain = new URL(page.url()).hostname; const currentDomain = new URL(page.url()).hostname;
let links = await page.evaluate(
let links = await page.evaluate(({currentDomain, websiteIgnoreUrls}) => { ({ currentDomain, websiteIgnoreUrls }) => {
const anchors = Array.from(document.querySelectorAll('a')).filter(p => { const anchors = Array.from(document.querySelectorAll('a')).filter(p => {
try { try {
// Check if urlToCheck contains any of the ignored URLs // Check if urlToCheck contains any of the ignored URLs
const isIgnored = websiteIgnoreUrls.split(";").some(ignoredUrl => p.href.includes(ignoredUrl)); const isIgnored = websiteIgnoreUrls.split(';').some(ignoredUrl => p.href.includes(ignoredUrl));
return !isIgnored && currentDomain == new URL(p.href).hostname; return !isIgnored && currentDomain == new URL(p.href).hostname;
} catch (err) { } catch (err) {
return false; return false;
} }
@ -924,7 +930,9 @@ export class KBService implements IGBKBService {
return anchors.map(anchor => { return anchors.map(anchor => {
return anchor.href.replace(/#.*/, ''); return anchor.href.replace(/#.*/, '');
}); });
}, {currentDomain, websiteIgnoreUrls}); },
{ currentDomain, websiteIgnoreUrls }
);
if (!Array.isArray(links)) { if (!Array.isArray(links)) {
links = []; links = [];
@ -1015,12 +1023,10 @@ export class KBService implements IGBKBService {
): Promise<any> { ): Promise<any> {
let files = []; let files = [];
const website = min.core.getParam<string>(min.instance, 'Website', null); const website = min.core.getParam<string>(min.instance, 'Website', null);
const websiteIgnoreUrls = min.core.getParam<string>(min.instance, 'Website Ignore URLs', null); const websiteIgnoreUrls = min.core.getParam<string>(min.instance, 'Website Ignore URLs', null);
if (website) { if (website) {
Fs.rmSync(min['vectorStorePath'], { recursive: true, force: true }); Fs.rmSync(min['vectorStorePath'], { recursive: true, force: true });
let path = DialogKeywords.getGBAIPath(min.botId, `gbot`); let path = DialogKeywords.getGBAIPath(min.botId, `gbot`);
const directoryPath = Path.join(process.env.PWD, 'work', path, 'Website'); const directoryPath = Path.join(process.env.PWD, 'work', path, 'Website');
@ -1030,11 +1036,12 @@ export class KBService implements IGBKBService {
const page = await this.getFreshPage(browser, website); const page = await this.getFreshPage(browser, website);
let logo = await this.getLogoByPage(page); let logo = await this.getLogoByPage(page);
if (logo){ if (logo) {
path = DialogKeywords.getGBAIPath(min.botId); path = DialogKeywords.getGBAIPath(min.botId);
const logoPath = Path.join(process.env.PWD, 'work', path, 'cache'); const logoPath = Path.join(process.env.PWD, 'work', path, 'cache');
const baseUrl = page.url().split('/').slice(0, 3).join('/'); const baseUrl = page.url().split('/').slice(0, 3).join('/');
const logoBinary = await page.goto(urlJoin(baseUrl, logo)); logo = logo.startsWith('https') ? logo : urlJoin(baseUrl, logo);
const logoBinary = await page.goto(logo);
const buffer = await logoBinary.buffer(); const buffer = await logoBinary.buffer();
const logoFilename = Path.basename(logo); const logoFilename = Path.basename(logo);
sharp(buffer) sharp(buffer)