From d73bf50a2d9c1a102baf5e833e6af9960ac78b6d Mon Sep 17 00:00:00 2001 From: Rodrigo Rodriguez Date: Fri, 17 May 2024 19:19:58 -0300 Subject: [PATCH] new(all): Website indexing. --- package.json | 2 + .../basic.gblib/services/SystemKeywords.ts | 4 +- packages/core.gbapp/services/GBMinService.ts | 18 +- packages/gpt.gblib/services/ChatServices.ts | 188 ++--- packages/kb.gbapp/services/KBService.ts | 698 +++++++++++------- 5 files changed, 499 insertions(+), 411 deletions(-) diff --git a/package.json b/package.json index bec9d334..cd6adbdd 100644 --- a/package.json +++ b/package.json @@ -133,8 +133,10 @@ "express": "4.18.2", "express-remove-route": "1.0.0", "ffmpeg-static": "5.1.0", + "get-image-colors": "^4.0.1", "google-libphonenumber": "3.2.31", "googleapis": "126.0.1", + "hnswlib-node": "^1.4.2", "http-proxy": "1.18.1", "ibm-watson": "7.1.2", "iso-639-1": "3.1.1", diff --git a/packages/basic.gblib/services/SystemKeywords.ts b/packages/basic.gblib/services/SystemKeywords.ts index db28ac77..89c7d8a6 100644 --- a/packages/basic.gblib/services/SystemKeywords.ts +++ b/packages/basic.gblib/services/SystemKeywords.ts @@ -2055,7 +2055,9 @@ export class SystemKeywords { return (orientation || 0) >= 5 ? [height, width] : [width, height]; }; - const size = getNormalSize(await sharp(buf).metadata()); + const metadata = await sharp(buf).metadata(); + const size = getNormalSize({width:metadata['width'], + height:metadata['height'], orientation: metadata['orientation'] }); url = urlJoin(GBServer.globals.publicAddress, min.botId, 'cache', Path.basename(imageName)); images[index++] = { url: url, size: size, buf: buf }; } diff --git a/packages/core.gbapp/services/GBMinService.ts b/packages/core.gbapp/services/GBMinService.ts index 4a51f3f2..e037a0e9 100644 --- a/packages/core.gbapp/services/GBMinService.ts +++ b/packages/core.gbapp/services/GBMinService.ts @@ -169,36 +169,20 @@ export class GBMinService { let i = 1; if (instances.length > 1) { - this.bar1 = new cliProgress.SingleBar( - { - format: '[{bar}] ({value}/{total}) Loading {botId} ...', - barsize: 40, - forceRedraw: true - }, - cliProgress.Presets.rect - ); - this.bar1.start(instances.length, i, { botId: 'Boot' }); } await CollectionUtil.asyncForEach( instances, (async instance => { try { + GBLog.info(`Mounting ${instance.botId}...`) await this['mountBot'](instance); } catch (error) { GBLog.error(`Error mounting bot ${instance.botId}: ${error.message}\n${error.stack}`); - } finally { - if (this.bar1) { - this.bar1.update(i++, { botId: instance.botId }); - } } }).bind(this) ); - if (this.bar1) { - this.bar1.stop(); - } - // Loads API. await this.ensureAPI(); diff --git a/packages/gpt.gblib/services/ChatServices.ts b/packages/gpt.gblib/services/ChatServices.ts index 664736c2..2f93cc19 100644 --- a/packages/gpt.gblib/services/ChatServices.ts +++ b/packages/gpt.gblib/services/ChatServices.ts @@ -31,14 +31,19 @@ 'use strict'; import { HNSWLib } from '@langchain/community/vectorstores/hnswlib'; -import { StringOutputParser } from "@langchain/core/output_parsers"; -import { AIMessagePromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate, MessagesPlaceholder } from '@langchain/core/prompts'; -import { RunnableSequence } from "@langchain/core/runnables"; -import { convertToOpenAITool } from "@langchain/core/utils/function_calling"; -import { ChatOpenAI } from "@langchain/openai"; +import { StringOutputParser } from '@langchain/core/output_parsers'; +import { + AIMessagePromptTemplate, + ChatPromptTemplate, + HumanMessagePromptTemplate, + MessagesPlaceholder +} from '@langchain/core/prompts'; +import { RunnableSequence } from '@langchain/core/runnables'; +import { convertToOpenAITool } from '@langchain/core/utils/function_calling'; +import { ChatOpenAI } from '@langchain/openai'; import { GBLog, GBMinInstance } from 'botlib'; import * as Fs from 'fs'; -import { jsonSchemaToZod } from "json-schema-to-zod"; +import { jsonSchemaToZod } from 'json-schema-to-zod'; import { BufferWindowMemory } from 'langchain/memory'; import Path from 'path'; import { CollectionUtil } from 'pragmatismo-io-framework'; @@ -46,35 +51,28 @@ import { DialogKeywords } from '../../basic.gblib/services/DialogKeywords.js'; import { GBVMService } from '../../basic.gblib/services/GBVMService.js'; import { GBConfigService } from '../../core.gbapp/services/GBConfigService.js'; import { GuaribasSubject } from '../../kb.gbapp/models/index.js'; -import { Serialized } from "@langchain/core/load/serializable"; -import { BaseCallbackHandler } from "@langchain/core/callbacks/base"; +import { Serialized } from '@langchain/core/load/serializable'; +import { BaseCallbackHandler } from '@langchain/core/callbacks/base'; import { pdfToPng, PngPageOutput } from 'pdf-to-png-converter'; -import { DynamicStructuredTool } from "@langchain/core/tools"; -import { WikipediaQueryRun } from "@langchain/community/tools/wikipedia_query_run"; -import { - BaseLLMOutputParser, - OutputParserException, -} from "@langchain/core/output_parsers"; -import { ChatGeneration, Generation } from "@langchain/core/outputs"; +import { DynamicStructuredTool } from '@langchain/core/tools'; +import { WikipediaQueryRun } from '@langchain/community/tools/wikipedia_query_run'; +import { BaseLLMOutputParser, OutputParserException } from '@langchain/core/output_parsers'; +import { ChatGeneration, Generation } from '@langchain/core/outputs'; import { GBAdminService } from '../../admin.gbapp/services/GBAdminService.js'; import { GBServer } from '../../../src/app.js'; import urlJoin from 'url-join'; -import { getDocument } from "pdfjs-dist/legacy/build/pdf.mjs"; +import { getDocument } from 'pdfjs-dist/legacy/build/pdf.mjs'; import { GBLogEx } from '../../core.gbapp/services/GBLogEx.js'; - -export interface CustomOutputParserFields { } +export interface CustomOutputParserFields {} export type ExpectedOutput = any; -function isChatGeneration( - llmOutput: ChatGeneration | Generation -): llmOutput is ChatGeneration { - return "message" in llmOutput; +function isChatGeneration(llmOutput: ChatGeneration | Generation): llmOutput is ChatGeneration { + return 'message' in llmOutput; } class CustomHandler extends BaseCallbackHandler { - name = "custom_handler"; - + name = 'custom_handler'; handleLLMNewToken(token: string) { GBLogEx.info(0, `LLM: token: ${JSON.stringify(token)}`); @@ -95,11 +93,10 @@ class CustomHandler extends BaseCallbackHandler { const logHandler = new CustomHandler(); -export class GBLLMOutputParser extends - BaseLLMOutputParser { - lc_namespace = ["langchain", "output_parsers"]; +export class GBLLMOutputParser extends BaseLLMOutputParser { + lc_namespace = ['langchain', 'output_parsers']; - private toolChain: RunnableSequence + private toolChain: RunnableSequence; private min; constructor(min, toolChain: RunnableSequence, documentChain: RunnableSequence) { @@ -108,14 +105,9 @@ export class GBLLMOutputParser extends this.toolChain = toolChain; } - async parseResult( - llmOutputs: ChatGeneration[] | Generation[] - ): Promise { - + async parseResult(llmOutputs: ChatGeneration[] | Generation[]): Promise { if (!llmOutputs.length) { - throw new OutputParserException( - "Output parser did not receive any generations." - ); + throw new OutputParserException('Output parser did not receive any generations.'); } let result; @@ -140,10 +132,9 @@ export class GBLLMOutputParser extends let { sources, text } = res; - await CollectionUtil.asyncForEach(sources, async (source) => { + await CollectionUtil.asyncForEach(sources, async source => { let found = false; - if (source) { - + if (source && source.file.endsWith('.pdf')) { const gbaiName = DialogKeywords.getGBAIPath(this.min.botId, 'gbkb'); const localName = Path.join(process.env.PWD, 'work', gbaiName, 'docs', source.file); @@ -166,9 +157,7 @@ export class GBLLMOutputParser extends } export class ChatServices { - public static async pdfPageAsImage(min, filename, pageNumber) { - // Converts the PDF to PNG. GBLogEx.info(min, `Converting ${filename}, page: ${pageNumber}...`); @@ -181,7 +170,6 @@ export class ChatServices { verbosityLevel: 0 }); - // Prepare an image on cache and return the GBFILE information. if (pngPages.length > 0) { @@ -199,7 +187,6 @@ export class ChatServices { sanitizedQuestion: string, numDocuments: number = 100 ): Promise { - if (sanitizedQuestion === '') { return ''; } @@ -219,10 +206,12 @@ export class ChatServices { const doc = uniqueDocuments[filePaths]; const metadata = doc.metadata; const filename = Path.basename(metadata.source); - const page = await ChatServices.findPageForText(metadata.source, - doc.pageContent); + let page = 0; + if (metadata.source.endsWith('.pdf')) { + page = await ChatServices.findPageForText(metadata.source, doc.pageContent); + } - output = `${output}\n\n\n\nUse also the following context which is coming from Source Document: ${filename} at page: ${page} + output = `${output}\n\n\n\nUse also the following context which is coming from Source Document: ${filename} at page: ${page?page:'entire document'} (you will fill the JSON sources collection field later), memorize this block among document information and return when you are refering this part of content:\n\n\n\n ${doc.pageContent} \n\n\n\n.`; } @@ -233,12 +222,15 @@ export class ChatServices { const data = new Uint8Array(Fs.readFileSync(pdfPath)); const pdf = await getDocument({ data }).promise; - searchText = searchText.replace(/\s/g, '') + searchText = searchText.replace(/\s/g, ''); for (let i = 1; i <= pdf.numPages; i++) { const page = await pdf.getPage(i); const textContent = await page.getTextContent(); - const text = textContent.items.map(item => item['str']).join('').replace(/\s/g, ''); + const text = textContent.items + .map(item => item['str']) + .join('') + .replace(/\s/g, ''); if (text.includes(searchText)) return i; } @@ -247,34 +239,31 @@ export class ChatServices { } /** - * Generate text - * - * CONTINUE keword. - * - * result = CONTINUE text - * - */ - public static async continue(min: GBMinInstance, question: string, chatId) { - - } + * Generate text + * + * CONTINUE keword. + * + * result = CONTINUE text + * + */ + public static async continue(min: GBMinInstance, question: string, chatId) {} private static memoryMap = {}; public static userSystemPrompt = {}; - public static async answerByGPT(min: GBMinInstance, user, pid, + public static async answerByGPT( + min: GBMinInstance, + user, + pid, question: string, searchScore: number, subjects: GuaribasSubject[] ) { - if (!process.env.OPENAI_API_KEY) { return { answer: undefined, questionId: 0 }; } - const LLMMode = min.core.getParam( - min.instance, - 'Answer Mode', 'direct' - ); + const LLMMode = min.core.getParam(min.instance, 'Answer Mode', 'direct'); const docsContext = min['vectorStore']; @@ -283,20 +272,19 @@ export class ChatServices { returnMessages: true, memoryKey: 'chat_history', inputKey: 'input', - k: 2, - }) + k: 2 + }); } const memory = this.memoryMap[user.userSystemId]; const systemPrompt = this.userSystemPrompt[user.userSystemId]; const model = new ChatOpenAI({ openAIApiKey: process.env.OPENAI_API_KEY, - modelName: "gpt-3.5-turbo-0125", + modelName: 'gpt-3.5-turbo-0125', temperature: 0, - callbacks: [logHandler], + callbacks: [logHandler] }); - let tools = await ChatServices.getTools(min); let toolsAsText = ChatServices.getToolsAsText(tools); @@ -316,9 +304,9 @@ export class ChatServices { Do not use any previous tools output in the chat_history. ` ), - new MessagesPlaceholder("chat_history"), + new MessagesPlaceholder('chat_history'), AIMessagePromptTemplate.fromTemplate(`Follow Up Input: {question} - Standalone question:`), + Standalone question:`) ]); const toolsResultPrompt = ChatPromptTemplate.fromMessages([ @@ -327,9 +315,9 @@ export class ChatServices { rephrase the answer to the user using this tool output. ` ), - new MessagesPlaceholder("chat_history"), + new MessagesPlaceholder('chat_history'), AIMessagePromptTemplate.fromTemplate(`Tool output: {tool_output} - Standalone question:`), + Standalone question:`) ]); const combineDocumentsPrompt = ChatPromptTemplate.fromMessages([ @@ -355,14 +343,13 @@ export class ChatServices { Double check if the output is a valid JSON with brackets. all fields are required: text, file, page. ` ), - new MessagesPlaceholder("chat_history"), - HumanMessagePromptTemplate.fromTemplate("Question: {question}"), + new MessagesPlaceholder('chat_history'), + HumanMessagePromptTemplate.fromTemplate('Question: {question}') ]); const callToolChain = RunnableSequence.from([ { tool_output: async (output: object) => { - const name = output['func'][0].function.name; const args = JSON.parse(output['func'][0].function.arguments); GBLogEx.info(min, `Running .gbdialog '${name}' as GPT tool...`); @@ -373,8 +360,7 @@ export class ChatServices { chat_history: async () => { const { chat_history } = await memory.loadMemoryVariables({}); return chat_history; - }, - + } }, toolsResultPrompt, model, @@ -391,8 +377,7 @@ export class ChatServices { context: async (output: string) => { const c = await ChatServices.getRelevantContext(docsContext, output); return `${systemPrompt} \n ${c ? 'Use this context to answer:\n' + c : 'answer just with user question.'}`; - - }, + } }, combineDocumentsPrompt, model, @@ -405,7 +390,7 @@ export class ChatServices { chat_history: async () => { const { chat_history } = await memory.loadMemoryVariables({}); return chat_history; - }, + } }, questionGeneratorTemplate, modelWithTools, @@ -416,45 +401,36 @@ export class ChatServices { let result, sources; let text, file, page; - - // Choose the operation mode of answer generation, based on + // Choose the operation mode of answer generation, based on // .gbot switch LLMMode and choose the corresponding chain. - if (LLMMode === "direct") { + if (LLMMode === 'direct') { result = await (tools.length > 0 ? modelWithTools : model).invoke(` ${systemPrompt} ${question}`); result = result.content; - } - else if (LLMMode === "document") { - + } else if (LLMMode === 'document') { const res = await combineDocumentsChain.invoke(question); result = res.text; sources = res.sources; - - } else if (LLMMode === "function") { - + } else if (LLMMode === 'function') { result = await conversationalToolChain.invoke({ - question, + question }); - } - else if (LLMMode === "full") { - + } else if (LLMMode === 'full') { throw new Error('Not implemented.'); // TODO: #407. - } - - else { + } else { GBLogEx.info(min, `Invalid Answer Mode in Config.xlsx: ${LLMMode}.`); } await memory.saveContext( { - input: question, + input: question }, { - output: result.replace(/\!\[.*\)/gi, '') // Removes .MD url beforing adding to history. + output: result?result.replace(/\!\[.*\)/gi, ''): 'no answer' // Removes .MD url beforing adding to history. } ); @@ -464,40 +440,34 @@ export class ChatServices { private static getToolsAsText(tools) { return Object.keys(tools) - .map((toolname) => `- ${tools[toolname].name}: ${tools[toolname].description}`) - .join("\n"); + .map(toolname => `- ${tools[toolname].name}: ${tools[toolname].description}`) + .join('\n'); } private static async getTools(min: GBMinInstance) { let functions = []; // Adds .gbdialog as functions if any to GPT Functions. - await CollectionUtil.asyncForEach(Object.keys(min.scriptMap), async (script) => { - - - const path = DialogKeywords.getGBAIPath(min.botId, "gbdialog", null); + await CollectionUtil.asyncForEach(Object.keys(min.scriptMap), async script => { + const path = DialogKeywords.getGBAIPath(min.botId, 'gbdialog', null); const jsonFile = Path.join('work', path, `${script}.json`); if (Fs.existsSync(jsonFile) && script.toLowerCase() !== 'start.vbs') { - const funcJSON = JSON.parse(Fs.readFileSync(jsonFile, 'utf8')); const funcObj = funcJSON?.function; if (funcObj) { - // TODO: Use ajv. funcObj.schema = eval(jsonSchemaToZod(funcObj.parameters)); functions.push(new DynamicStructuredTool(funcObj)); } } - }); if (process.env.WIKIPEDIA_TOOL) { - const tool = new WikipediaQueryRun({ topKResults: 3, - maxDocContentLength: 4000, + maxDocContentLength: 4000 }); functions.push(tool); } diff --git a/packages/kb.gbapp/services/KBService.ts b/packages/kb.gbapp/services/KBService.ts index 217a82d0..58a7a5dc 100644 --- a/packages/kb.gbapp/services/KBService.ts +++ b/packages/kb.gbapp/services/KBService.ts @@ -48,8 +48,11 @@ import { DocxLoader } from 'langchain/document_loaders/fs/docx'; import { EPubLoader } from 'langchain/document_loaders/fs/epub'; import { CSVLoader } from 'langchain/document_loaders/fs/csv'; import path from 'path'; +import puppeteer, { Page } from 'puppeteer'; import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; import { Document } from 'langchain/document'; +import getColors from 'get-image-colors'; + import { GBDialogStep, @@ -81,7 +84,6 @@ import { DialogKeywords } from '../../basic.gblib/services/DialogKeywords.js'; import { GBMinService } from '../../core.gbapp/services/GBMinService.js'; import { ChatServices } from '../../gpt.gblib/services/ChatServices.js'; - /** * Result for quey on KB data. */ @@ -271,7 +273,7 @@ export class KBService implements IGBKBService { min: GBMinInstance, user, step, - pid, + pid, query: string, searchScore: number, subjects: GuaribasSubject[] @@ -293,13 +295,9 @@ export class KBService implements IGBKBService { 'Default Content Language', GBConfigService.get('DEFAULT_CONTENT_LANGUAGE') ); - - query = await min.conversationalService.translate( - min, - query, - contentLocale - ); - + + query = await min.conversationalService.translate(min, query, contentLocale); + GBLogEx.info(min, `Translated query (prompt): ${query}.`); // Try simple search first. @@ -317,10 +315,8 @@ export class KBService implements IGBKBService { } } let returnedScore = 0; - const key = instance.searchKey ? instance.searchKey : - GBServer.globals.minBoot.instance.searchKey; - const host = instance.searchHost ? instance.searchHost : - GBServer.globals.minBoot.instance.searchHost; + const key = instance.searchKey ? instance.searchKey : GBServer.globals.minBoot.instance.searchKey; + const host = instance.searchHost ? instance.searchHost : GBServer.globals.minBoot.instance.searchHost; // No direct match found, so Search is used. @@ -348,8 +344,6 @@ export class KBService implements IGBKBService { top: 1 }); - - // Searches via Search (Azure Search). let found = false; @@ -359,11 +353,15 @@ export class KBService implements IGBKBService { if (returnedScore >= searchScore) { const value = await this.getAnswerById(instance.instanceId, result.document.answerId); if (value !== null) { - GBLogEx.info(min, `SEARCH WILL BE USED with score: ${returnedScore} > required (searchScore): ${searchScore}`); + GBLogEx.info( + min, + `SEARCH WILL BE USED with score: ${returnedScore} > required (searchScore): ${searchScore}` + ); return { answer: value, questionId: result.document.questionId }; } else { - GBLogEx.info(min, + GBLogEx.info( + min, `Index problem. SEARCH WILL NOT be used as answerId ${result.document.answerId} was not found in database, returnedScore: ${returnedScore} < required (searchScore): ${searchScore}` ); @@ -373,17 +371,13 @@ export class KBService implements IGBKBService { } } } - GBLogEx.info(min, + GBLogEx.info( + min, `SEARCH returned LOW level score, calling NLP if any, returnedScore: ${returnedScore} < required (searchScore): ${searchScore}` ); - return await ChatServices.answerByGPT(min, user, pid, - query, - searchScore, - subjects - ); - + return await ChatServices.answerByGPT(min, user, pid, query, searchScore, subjects); } public async getSubjectItems(instanceId: number, parentId: number): Promise { @@ -626,7 +620,7 @@ export class KBService implements IGBKBService { } public async sendAnswer(min: GBMinInstance, channel: string, step: GBDialogStep, answer) { - answer = typeof (answer) === 'string' ? answer : answer.content; + answer = typeof answer === 'string' ? answer : answer.content; if (answer.endsWith('.mp4')) { await this.playVideo(min, min.conversationalService, step, answer, channel); } else if ( @@ -646,14 +640,11 @@ export class KBService implements IGBKBService { const url = urlJoin('kb', path, 'assets', answer); await this.playUrl(min, min.conversationalService, step, url, channel); } else if (answer.format === '.md') { - await min.conversationalService['playMarkdown'](min, answer, channel, step, - GBMinService.userMobile(step)); + await min.conversationalService['playMarkdown'](min, answer, channel, step, GBMinService.userMobile(step)); } else if (answer.endsWith('.ogg') && process.env.AUDIO_DISABLED !== 'true') { await this.playAudio(min, answer, channel, step, min.conversationalService); } else { - await min.conversationalService.sendText(min, step, answer); - } } @@ -685,7 +676,6 @@ export class KBService implements IGBKBService { const a = await GuaribasAnswer.create(answer); question['answerId'] = a.answerId; const q = await GuaribasQuestion.create(question); - } public async importKbPackage( @@ -747,7 +737,7 @@ export class KBService implements IGBKBService { const localName = Path.join('work', path, 'articles', file.name); let loader = new DocxLoader(localName); let doc = await loader.load(); - + const answer = { instanceId: instance.instanceId, content: doc[0].pageContent, @@ -758,10 +748,6 @@ export class KBService implements IGBKBService { }; data.answers.push(answer); - - - - } else if (file !== null && file.name.endsWith('.toc.docx')) { const path = DialogKeywords.getGBAIPath(instance.botId, `gbkb`); const localName = Path.join('work', path, 'articles', file.name); @@ -866,6 +852,103 @@ export class KBService implements IGBKBService { }); } + async saveHtmlPage(min, url: string, page: Page): Promise { + const response = await page.goto(url); + + + if (response.headers && response.status() === 200) { + const contentType = response.headers()['content-type']; + if (contentType && contentType.includes('text/html')) { + const buffer = await response.buffer(); + const urlObj = new URL(url); + const urlPath = urlObj.pathname.endsWith('/') ? urlObj.pathname.slice(0, -1) : urlObj.pathname; // Remove trailing slash if present + let filename = urlPath.split('/').pop() || 'index'; // Get the filename from the URL path or set it to 'index.html' as default + filename = `${filename}.html`; + let path = DialogKeywords.getGBAIPath(min.botId, `gbot`); + const directoryPath = Path.join(process.env.PWD, 'work', path, 'Website'); + const filePath = Path.join(directoryPath, filename); + + GBLogEx.info(min, `[GBDeployer] Saving Website file in ${filePath}.`); + + Fs.mkdirSync(directoryPath, { recursive: true }); // Create directory recursively if it doesn't exist + Fs.writeFileSync(filePath, buffer); + return filePath; + } + } + return null; + } + + async crawl(min, url: string, visited: Set, depth: number, maxDepth: number, page: Page): Promise { + try { + if ( + depth > maxDepth || + (visited.has(url) || + url.endsWith('.jpg') || + url.endsWith('.pdf') || + url.endsWith('.jpg') || + url.endsWith('.png') || + url.endsWith('.mp4')) + ) { + return []; + } + + await GBLogEx.info(min, `Processing URL: ${url}.`); + + visited.add(url); + + const filename = await this.saveHtmlPage(min, url, page); + + if (!filename) { + // If the URL doesn't represent an HTML page, skip crawling its links + return []; + } + const currentDomain = new URL(page.url()).hostname; + let links = await page.evaluate(currentDomain => { + const anchors = Array.from(document.querySelectorAll('a')).filter(p => { + try { + return currentDomain == new URL(p.href).hostname; + } catch (err) { + return false; + } + }); + + return anchors.map(anchor => { + return anchor.href.replace(/#.*/, ''); + }); + }, currentDomain); + + if (!Array.isArray(links)) { + links = []; + } + + let filteredLinks = []; + + if (links && typeof links[Symbol.iterator] === 'function') { + filteredLinks = links.filter(l => { + try { + new URL(l); // Check if the link is a valid URL + return !visited.has(l); + } catch (error) { + // Ignore invalid URLs + return false; + } + }); + } + + const childLinks = []; + for (const link of filteredLinks) { + const links = await this.crawl(min, link, visited, depth + 1, maxDepth, page); + if (links){ + childLinks.push(...links); + } + } + + return [filename, ...childLinks]; // Include the filename of the cached file + } catch (error) { + await GBLogEx.info(min, error); + } + } + /** * Import all .docx files in reading comprehension folder. */ @@ -875,11 +958,52 @@ export class KBService implements IGBKBService { instance: IGBInstance, packageId: number ): Promise { - const files = await walkPromise(urlJoin(localPath, 'docs')); + let files = []; + + const website = min.core.getParam(min.instance, 'Website', null); + + if (website) { + const browser = await puppeteer.launch({ headless: false }); + const page = await browser.newPage(); + const response = await page.goto(website); + + await page.screenshot({ path: 'screenshot.png' }); + + // Extract dominant colors from the screenshot + const colors = await getColors('screenshot.png'); + + // Assuming you want the two most dominant colors + const mainColor1 = colors[0].hex(); + const mainColor2 = colors[1].hex(); + + console.log('Main Color 1:', mainColor1); + console.log('Main Color 2:', mainColor2); + + + const maxDepth = 1; // Maximum depth of recursion + const visited = new Set(); + files = files.concat(await this.crawl(min, website, visited, 0, maxDepth, page)); + + await browser.close(); + + files.shift(); + + await CollectionUtil.asyncForEach(files, async file => { + let content = null; + + const document = await this.loadAndSplitFile(file); + const flattenedDocuments = document.reduce((acc, val) => acc.concat(val), []); + const vectorStore = min['vectorStore']; + await vectorStore.addDocuments(flattenedDocuments); + await vectorStore.save(min['vectorStorePath']); + }); + + } + + files = await walkPromise(urlJoin(localPath, 'docs')); + if (!files[0]) { - GBLogEx.info(min, - `[GBDeployer] docs folder not created yet in .gbkb. To use Reading Comprehension, create this folder at root and put a document to get read by the.` - ); + GBLogEx.info(min, `[GBDeployer] docs folder not created yet in .gbkb neither a website in .gbot.`); } else { await CollectionUtil.asyncForEach(files, async file => { let content = null; @@ -894,211 +1018,217 @@ export class KBService implements IGBKBService { } } - defaultRecursiveCharacterTextSplitter = new RecursiveCharacterTextSplitter({ + defaultRecursiveCharacterTextSplitter = new RecursiveCharacterTextSplitter({ chunkSize: 700, - chunkOverlap: 50, + chunkOverlap: 50 }); - - markdownRecursiveCharacterTextSplitter = RecursiveCharacterTextSplitter.fromLanguage('markdown', { + + markdownRecursiveCharacterTextSplitter = RecursiveCharacterTextSplitter.fromLanguage('markdown', { chunkSize: 700, - chunkOverlap: 50, + chunkOverlap: 50 }); - private async loadAndSplitFile(filePath: string): Promise>[]> { - const fileExtension = path.extname(filePath); - let loader; - let documents: Document>[]; - switch (fileExtension) { - case '.json': - loader = new JSONLoader(filePath); - documents = await loader.loadAndSplit(this.defaultRecursiveCharacterTextSplitter); - break; - case '.txt': - loader = new TextLoader(filePath); - documents = await loader.loadAndSplit(this.defaultRecursiveCharacterTextSplitter); - break; - case '.md': - loader = new TextLoader(filePath); - documents = await loader.loadAndSplit(this.markdownRecursiveCharacterTextSplitter); - break; - case '.pdf': - loader = new PDFLoader(filePath, { splitPages: false }); - documents = await loader.loadAndSplit(this.defaultRecursiveCharacterTextSplitter); - break; - case '.docx': - loader = new DocxLoader(filePath); - documents = await loader.loadAndSplit(this.defaultRecursiveCharacterTextSplitter); - break; - case '.csv': - loader = new CSVLoader(filePath); - documents = await loader.loadAndSplit(this.defaultRecursiveCharacterTextSplitter); - break; - case '.epub': - loader = new EPubLoader(filePath, { splitChapters: false }); - documents = await loader.loadAndSplit(this.defaultRecursiveCharacterTextSplitter); - break; - default: - throw new Error(`Unsupported file extension: ${fileExtension}`); - } - return documents; -} - - - public async importKbTabularDirectory(localPath: string, min: GBMinInstance, packageId: number): Promise < any > { - const files = await walkPromise(localPath); - - await CollectionUtil.asyncForEach(files, async file => { - if (file !== null && file.name.endsWith('.xlsx')) { - return await this.importKbTabularFile(urlJoin(file.root, file.name), min, packageId); + const fileExtension = path.extname(filePath); + let loader; + let documents: Document>[]; + switch (fileExtension) { + case '.json': + loader = new JSONLoader(filePath); + documents = await loader.loadAndSplit(this.defaultRecursiveCharacterTextSplitter); + break; + case '.txt': + loader = new TextLoader(filePath); + documents = await loader.loadAndSplit(this.defaultRecursiveCharacterTextSplitter); + break; + case '.txt': + loader = new TextLoader(filePath); + documents = await loader.loadAndSplit(this.defaultRecursiveCharacterTextSplitter); + break; + case '.html': + loader = new TextLoader(filePath); + documents = await loader.loadAndSplit(this.defaultRecursiveCharacterTextSplitter); + break; + case '.md': + loader = new TextLoader(filePath); + documents = await loader.loadAndSplit(this.markdownRecursiveCharacterTextSplitter); + break; + case '.pdf': + loader = new PDFLoader(filePath, { splitPages: false }); + documents = await loader.loadAndSplit(this.defaultRecursiveCharacterTextSplitter); + break; + case '.docx': + loader = new DocxLoader(filePath); + documents = await loader.loadAndSplit(this.defaultRecursiveCharacterTextSplitter); + break; + case '.csv': + loader = new CSVLoader(filePath); + documents = await loader.loadAndSplit(this.defaultRecursiveCharacterTextSplitter); + break; + case '.epub': + loader = new EPubLoader(filePath, { splitChapters: false }); + documents = await loader.loadAndSplit(this.defaultRecursiveCharacterTextSplitter); + break; + default: + throw new Error(`Unsupported file extension: ${fileExtension}`); } - }); -} + return documents; + } + + public async importKbTabularDirectory(localPath: string, min: GBMinInstance, packageId: number): Promise { + const files = await walkPromise(localPath); + + await CollectionUtil.asyncForEach(files, async file => { + if (file !== null && file.name.endsWith('.xlsx')) { + return await this.importKbTabularFile(urlJoin(file.root, file.name), min, packageId); + } + }); + } public async importSubjectFile( - packageId: number, - filename: string, - menuFile: string, - instance: IGBInstance -): Promise < any > { - let subjectsLoaded; - if(menuFile) { - // Loads menu.xlsx and finds worksheet. + packageId: number, + filename: string, + menuFile: string, + instance: IGBInstance + ): Promise { + let subjectsLoaded; + if (menuFile) { + // Loads menu.xlsx and finds worksheet. - const workbook = new Excel.Workbook(); - const data = await workbook.xlsx.readFile(menuFile); - let worksheet: any; - for (let t = 0; t < data.worksheets.length; t++) { - worksheet = data.worksheets[t]; - if (worksheet) { - break; - } - } - - const MAX_LEVEL = 4; // Max column level to reach menu items in plan. - // Iterates over all items. - - let rows = worksheet._rows; - rows.length = 24; - let lastLevel = 0; - let subjects = { children: [] }; - let childrenNode = subjects.children; - let activeObj = null; - - let activeChildrenGivenLevel = [childrenNode]; - - await asyncPromise.eachSeries(rows, async row => { - if (!row) return; - let menu; - - // Detect menu level by skipping blank cells on left. - - let level; - for (level = 0; level < MAX_LEVEL; level++) { - const cell = row._cells[level]; - if (cell && cell.text) { - menu = cell.text; + const workbook = new Excel.Workbook(); + const data = await workbook.xlsx.readFile(menuFile); + let worksheet: any; + for (let t = 0; t < data.worksheets.length; t++) { + worksheet = data.worksheets[t]; + if (worksheet) { break; } } - // Tree hierarchy calculation. + const MAX_LEVEL = 4; // Max column level to reach menu items in plan. + // Iterates over all items. - if (level > lastLevel) { - childrenNode = activeObj.children; - } else if (level < lastLevel) { - childrenNode = activeChildrenGivenLevel[level]; - } + let rows = worksheet._rows; + rows.length = 24; + let lastLevel = 0; + let subjects = { children: [] }; + let childrenNode = subjects.children; + let activeObj = null; - /// Keeps the record of last subroots for each level, to - // changel levels greater than one (return to main menu), - // can exists between leaf nodes and roots. + let activeChildrenGivenLevel = [childrenNode]; - activeChildrenGivenLevel[level] = childrenNode; + await asyncPromise.eachSeries(rows, async row => { + if (!row) return; + let menu; - // Insert the object into JSON. - const description = row._cells[level + 1] ? row._cells[level + 1].text : null; - activeObj = { - title: menu, - description: description, - id: menu, - children: [] - }; - activeChildrenGivenLevel[level].push(activeObj); + // Detect menu level by skipping blank cells on left. - lastLevel = level; - }); + let level; + for (level = 0; level < MAX_LEVEL; level++) { + const cell = row._cells[level]; + if (cell && cell.text) { + menu = cell.text; + break; + } + } - subjectsLoaded = subjects; - } else { - subjectsLoaded = JSON.parse(Fs.readFileSync(filename, 'utf8')); - } + // Tree hierarchy calculation. - const doIt = async (subjects: GuaribasSubject[], parentSubjectId: number) => { - return asyncPromise.eachSeries(subjects, async item => { - const value = await GuaribasSubject.create({ - internalId: item.id, - parentSubjectId: parentSubjectId, - instanceId: instance.instanceId, - from: item.from, - to: item.to, - title: item.title, - description: item.description, - packageId: packageId + if (level > lastLevel) { + childrenNode = activeObj.children; + } else if (level < lastLevel) { + childrenNode = activeChildrenGivenLevel[level]; + } + + /// Keeps the record of last subroots for each level, to + // changel levels greater than one (return to main menu), + // can exists between leaf nodes and roots. + + activeChildrenGivenLevel[level] = childrenNode; + + // Insert the object into JSON. + const description = row._cells[level + 1] ? row._cells[level + 1].text : null; + activeObj = { + title: menu, + description: description, + id: menu, + children: [] + }; + activeChildrenGivenLevel[level].push(activeObj); + + lastLevel = level; }); - if (item.children) { - return doIt(item.children, value.subjectId); - } else { - return item; - } - }); - }; + subjectsLoaded = subjects; + } else { + subjectsLoaded = JSON.parse(Fs.readFileSync(filename, 'utf8')); + } - return doIt(subjectsLoaded.children, undefined); -} + const doIt = async (subjects: GuaribasSubject[], parentSubjectId: number) => { + return asyncPromise.eachSeries(subjects, async item => { + const value = await GuaribasSubject.create({ + internalId: item.id, + parentSubjectId: parentSubjectId, + instanceId: instance.instanceId, + from: item.from, + to: item.to, + title: item.title, + description: item.description, + packageId: packageId + }); + + if (item.children) { + return doIt(item.children, value.subjectId); + } else { + return item; + } + }); + }; + + return doIt(subjectsLoaded.children, undefined); + } public async undeployKbFromStorage(instance: IGBInstance, deployer: GBDeployer, packageId: number) { - await GuaribasQuestion.destroy({ - where: { instanceId: instance.instanceId, packageId: packageId } - }); - await GuaribasAnswer.destroy({ - where: { instanceId: instance.instanceId, packageId: packageId } - }); - await GuaribasSubject.destroy({ - where: { instanceId: instance.instanceId, packageId: packageId } - }); - await this.undeployPackageFromStorage(instance, packageId); -} + await GuaribasQuestion.destroy({ + where: { instanceId: instance.instanceId, packageId: packageId } + }); + await GuaribasAnswer.destroy({ + where: { instanceId: instance.instanceId, packageId: packageId } + }); + await GuaribasSubject.destroy({ + where: { instanceId: instance.instanceId, packageId: packageId } + }); + await this.undeployPackageFromStorage(instance, packageId); + } public static async RefreshNER(min: GBMinInstance) { - const questions = await KBService.getQuestionsNER(min.instance.instanceId); - const contentLocale = min.core.getParam( - min.instance, - 'Default Content Language', - GBConfigService.get('DEFAULT_CONTENT_LANGUAGE') - ); + const questions = await KBService.getQuestionsNER(min.instance.instanceId); + const contentLocale = min.core.getParam( + min.instance, + 'Default Content Language', + GBConfigService.get('DEFAULT_CONTENT_LANGUAGE') + ); - await CollectionUtil.asyncForEach(questions, async question => { - const text = question.content; + await CollectionUtil.asyncForEach(questions, async question => { + const text = question.content; - const categoryReg = /.*\((.*)\).*/gi.exec(text); - const nameReg = /(\w+)\(.*\).*/gi.exec(text); + const categoryReg = /.*\((.*)\).*/gi.exec(text); + const nameReg = /(\w+)\(.*\).*/gi.exec(text); - if (categoryReg) { - let category = categoryReg[1]; + if (categoryReg) { + let category = categoryReg[1]; - if (category === 'number') { - min['nerEngine'].addRegexEntity('number', 'pt', '/d+/gi'); + if (category === 'number') { + min['nerEngine'].addRegexEntity('number', 'pt', '/d+/gi'); + } + if (nameReg) { + let name = nameReg[1]; + + min['nerEngine'].addNamedEntityText(category, name, [contentLocale], [name]); + } } - if (nameReg) { - let name = nameReg[1]; - - min['nerEngine'].addNamedEntityText(category, name, [contentLocale], [name]); - } - } - }); -} + }); + } /** * Deploys a knowledge base to the storage using the .gbkb format. @@ -1106,90 +1236,90 @@ export class KBService implements IGBKBService { * @param localPath Path to the .gbkb folder. */ public async deployKb(core: IGBCoreService, deployer: GBDeployer, localPath: string, min: GBMinInstance) { - const packageName = Path.basename(localPath); - const instance = await core.loadInstanceByBotId(min.botId); - GBLogEx.info(min, `[GBDeployer] Importing: ${localPath}`); + const packageName = Path.basename(localPath); + const instance = await core.loadInstanceByBotId(min.botId); + GBLogEx.info(min, `[GBDeployer] Importing: ${localPath}`); - const p = await deployer.deployPackageToStorage(instance.instanceId, packageName); - await this.importKbPackage(min, localPath, p, instance); - GBDeployer.mountGBKBAssets(packageName, min.botId, localPath); - const service = await AzureDeployerService.createInstance(deployer); - const searchIndex = instance.searchIndex ? instance.searchIndex : GBServer.globals.minBoot.instance.searchIndex; - await deployer.rebuildIndex(instance, service.getKBSearchSchema(searchIndex)); + const p = await deployer.deployPackageToStorage(instance.instanceId, packageName); + await this.importKbPackage(min, localPath, p, instance); + GBDeployer.mountGBKBAssets(packageName, min.botId, localPath); + const service = await AzureDeployerService.createInstance(deployer); + const searchIndex = instance.searchIndex ? instance.searchIndex : GBServer.globals.minBoot.instance.searchIndex; + await deployer.rebuildIndex(instance, service.getKBSearchSchema(searchIndex)); - min['groupCache'] = await KBService.getGroupReplies(instance.instanceId); - await KBService.RefreshNER(min); + min['groupCache'] = await KBService.getGroupReplies(instance.instanceId); + await KBService.RefreshNER(min); - GBLogEx.info(min, `[GBDeployer] Start Bot Server Side Rendering... ${localPath}`); - const html = await GBSSR.getHTML(min); - let path = DialogKeywords.getGBAIPath(min.botId, `gbui`); - path = Path.join(process.env.PWD, 'work', path, 'index.html'); - GBLogEx.info(min, `[GBDeployer] Saving SSR HTML in ${path}.`); - Fs.writeFileSync(path, html, 'utf8'); + GBLogEx.info(min, `[GBDeployer] Start Bot Server Side Rendering... ${localPath}`); + const html = await GBSSR.getHTML(min); + let path = DialogKeywords.getGBAIPath(min.botId, `gbui`); + path = Path.join(process.env.PWD, 'work', path, 'index.html'); + GBLogEx.info(min, `[GBDeployer] Saving SSR HTML in ${path}.`); + Fs.writeFileSync(path, html, 'utf8'); - GBLogEx.info(min, `[GBDeployer] Finished import of ${localPath}`); -} + GBLogEx.info(min, `[GBDeployer] Finished import of ${localPath}`); + } private async playAudio( - min: GBMinInstance, - answer: GuaribasAnswer, - channel: string, - step: GBDialogStep, - conversationalService: IGBConversationalService -) { - conversationalService.sendAudio(min, step, answer.content); -} + min: GBMinInstance, + answer: GuaribasAnswer, + channel: string, + step: GBDialogStep, + conversationalService: IGBConversationalService + ) { + conversationalService.sendAudio(min, step, answer.content); + } private async playUrl( - min, - conversationalService: IGBConversationalService, - step: GBDialogStep, - url: string, - channel: string -) { - if (channel === 'whatsapp') { - await min.conversationalService.sendFile(min, step, null, url, ''); - } else { - await conversationalService.sendEvent(min, step, 'play', { - playerType: 'url', - data: url - }); + min, + conversationalService: IGBConversationalService, + step: GBDialogStep, + url: string, + channel: string + ) { + if (channel === 'whatsapp') { + await min.conversationalService.sendFile(min, step, null, url, ''); + } else { + await conversationalService.sendEvent(min, step, 'play', { + playerType: 'url', + data: url + }); + } } -} private async playVideo( - min, - conversationalService: IGBConversationalService, - step: GBDialogStep, - answer: GuaribasAnswer, - channel: string -) { - if (channel === 'whatsapp') { - await min.conversationalService.sendFile(min, step, null, answer.content, ''); - } else { - const path = DialogKeywords.getGBAIPath(min.botId, `gbkb`); - await conversationalService.sendEvent(min, step, 'play', { - playerType: 'video', - data: urlJoin(path, 'videos', answer.content) + min, + conversationalService: IGBConversationalService, + step: GBDialogStep, + answer: GuaribasAnswer, + channel: string + ) { + if (channel === 'whatsapp') { + await min.conversationalService.sendFile(min, step, null, answer.content, ''); + } else { + const path = DialogKeywords.getGBAIPath(min.botId, `gbkb`); + await conversationalService.sendEvent(min, step, 'play', { + playerType: 'video', + data: urlJoin(path, 'videos', answer.content) + }); + } + } + + private async undeployPackageFromStorage(instance: any, packageId: number) { + await GuaribasPackage.destroy({ + where: { instanceId: instance.instanceId, packageId: packageId } + }); + } + + private async getTextFromFile(filename: string) { + return new Promise(async (resolve, reject) => { + textract.fromFileWithPath(filename, { preserveLineBreaks: true }, (error, text) => { + if (error) { + reject(error); + } else { + resolve(text); + } + }); }); } -} - - private async undeployPackageFromStorage(instance: any, packageId: number) { - await GuaribasPackage.destroy({ - where: { instanceId: instance.instanceId, packageId: packageId } - }); -} - - private async getTextFromFile(filename: string) { - return new Promise(async (resolve, reject) => { - textract.fromFileWithPath(filename, { preserveLineBreaks: true }, (error, text) => { - if (error) { - reject(error); - } else { - resolve(text); - } - }); - }); -} }