From 145406cab3efe1a78c7d01838ae88e37760de5af Mon Sep 17 00:00:00 2001 From: Rodrigo Rodriguez Date: Sun, 8 Sep 2024 16:48:26 -0300 Subject: [PATCH] fix (all): path and fs normalization. --- packages/core.gbapp/services/GBMinService.ts | 55 ++++--- packages/kb.gbapp/services/KBService.ts | 145 ++++++++++++++----- src/util.ts | 75 ++-------- 3 files changed, 146 insertions(+), 129 deletions(-) diff --git a/packages/core.gbapp/services/GBMinService.ts b/packages/core.gbapp/services/GBMinService.ts index d5a88c2c..4a4147c7 100644 --- a/packages/core.gbapp/services/GBMinService.ts +++ b/packages/core.gbapp/services/GBMinService.ts @@ -33,21 +33,9 @@ */ 'use strict'; -import cliProgress from 'cli-progress'; -import { DialogSet, TextPrompt } from 'botbuilder-dialogs'; -import SwaggerClient from 'swagger-client'; -import removeRoute from 'express-remove-route'; -import AuthenticationContext from 'adal-node'; -import { FacebookAdapter } from 'botbuilder-adapter-facebook'; -import mkdirp from 'mkdirp'; -import fs from 'fs/promises'; -import arrayBufferToBuffer from 'arraybuffer-to-buffer'; -import { NlpManager } from 'node-nlp'; -import Koa from 'koa'; -import { v2 as webdav } from 'webdav-server'; import { createRpcServer } from '@push-rpc/core'; -import { start as startRouter } from '../../../packages/core.gbapp/services/router/bridge.js'; -import wash from 'washyourmouthoutwithsoap'; +import AuthenticationContext from 'adal-node'; +import arrayBufferToBuffer from 'arraybuffer-to-buffer'; import { AutoSaveStateMiddleware, BotFrameworkAdapter, @@ -56,7 +44,9 @@ import { TurnContext, UserState } from 'botbuilder'; -import { AttachmentPrompt, ConfirmPrompt, OAuthPrompt, WaterfallDialog } from 'botbuilder-dialogs'; +import { FacebookAdapter } from 'botbuilder-adapter-facebook'; +import { AttachmentPrompt, ConfirmPrompt, DialogSet, OAuthPrompt, TextPrompt, WaterfallDialog } from 'botbuilder-dialogs'; +import { MicrosoftAppCredentials } from 'botframework-connector'; import { GBDialogStep, GBLog, @@ -67,13 +57,33 @@ import { IGBInstance, IGBPackage } from 'botlib'; +import cliProgress from 'cli-progress'; +import removeRoute from 'express-remove-route'; +import fs from 'fs/promises'; +import Koa from 'koa'; +import mkdirp from 'mkdirp'; +import { NlpManager } from 'node-nlp'; +import path from 'path'; import { CollectionUtil } from 'pragmatismo-io-framework'; -import { MicrosoftAppCredentials } from 'botframework-connector'; +import SwaggerClient from 'swagger-client'; +import urlJoin from 'url-join'; +import wash from 'washyourmouthoutwithsoap'; +import { v2 as webdav } from 'webdav-server'; +import { start as startRouter } from '../../../packages/core.gbapp/services/router/bridge.js'; import { GBServer } from '../../../src/app.js'; +import { GBUtil } from '../../../src/util.js'; import { GBAdminService } from '../../admin.gbapp/services/GBAdminService.js'; import { GuaribasConversationMessage } from '../../analytics.gblib/models/index.js'; import { AnalyticsService } from '../../analytics.gblib/services/AnalyticsService.js'; +import { createKoaHttpServer } from '../../basic.gblib/index.js'; +import { DebuggerService } from '../../basic.gblib/services/DebuggerService.js'; +import { DialogKeywords } from '../../basic.gblib/services/DialogKeywords.js'; import { GBVMService } from '../../basic.gblib/services/GBVMService.js'; +import { ImageProcessingServices } from '../../basic.gblib/services/ImageProcessingServices.js'; +import { ScheduleServices } from '../../basic.gblib/services/ScheduleServices.js'; +import { SystemKeywords } from '../../basic.gblib/services/SystemKeywords.js'; +import { WebAutomationServices } from '../../basic.gblib/services/WebAutomationServices.js'; +import { GoogleChatDirectLine } from '../../google-chat.gblib/services/GoogleChatDirectLine.js'; import { AskDialogArgs } from '../../kb.gbapp/dialogs/AskDialog.js'; import { KBService } from '../../kb.gbapp/services/KBService.js'; import { SecService } from '../../security.gbapp/services/SecService.js'; @@ -82,19 +92,8 @@ import { Messages } from '../strings.js'; import { GBConfigService } from './GBConfigService.js'; import { GBConversationalService } from './GBConversationalService.js'; import { GBDeployer } from './GBDeployer.js'; -import urlJoin from 'url-join'; -import { GoogleChatDirectLine } from '../../google-chat.gblib/services/GoogleChatDirectLine.js'; -import { SystemKeywords } from '../../basic.gblib/services/SystemKeywords.js'; -import path from 'path'; -import { GBSSR } from './GBSSR.js'; -import { DialogKeywords } from '../../basic.gblib/services/DialogKeywords.js'; import { GBLogEx } from './GBLogEx.js'; -import { WebAutomationServices } from '../../basic.gblib/services/WebAutomationServices.js'; -import { createKoaHttpServer } from '../../basic.gblib/index.js'; -import { DebuggerService } from '../../basic.gblib/services/DebuggerService.js'; -import { ImageProcessingServices } from '../../basic.gblib/services/ImageProcessingServices.js'; -import { ScheduleServices } from '../../basic.gblib/services/ScheduleServices.js'; -import { GBUtil } from '../../../src/util.js'; +import { GBSSR } from './GBSSR.js'; /** * Minimal service layer for a bot and encapsulation of BOT Framework calls. diff --git a/packages/kb.gbapp/services/KBService.ts b/packages/kb.gbapp/services/KBService.ts index a731d104..5fbbfeb0 100644 --- a/packages/kb.gbapp/services/KBService.ts +++ b/packages/kb.gbapp/services/KBService.ts @@ -31,26 +31,27 @@ /** * @fileoverview Knowledge base services and logic. */ -import path from 'path'; -import fs from 'fs/promises'; -import urlJoin from 'url-join'; -import asyncPromise from 'async-promises'; -import walkPromise from 'walk-promise'; import { SearchClient } from '@azure/search-documents'; +import asyncPromise from 'async-promises'; import Excel from 'exceljs'; -import getSlug from 'speakingurl'; -import { GBServer } from '../../../src/app.js'; +import fs from 'fs/promises'; +import html2md from 'html-to-md'; import { JSONLoader } from 'langchain/document_loaders/fs/json'; import { TextLoader } from 'langchain/document_loaders/fs/text'; -import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf'; +import path from 'path'; +import getSlug from 'speakingurl'; +import urlJoin from 'url-join'; +import walkPromise from 'walk-promise'; +import { GBServer } from '../../../src/app.js'; +import { CSVLoader } from '@langchain/community/document_loaders/fs/csv'; import { DocxLoader } from '@langchain/community/document_loaders/fs/docx'; import { EPubLoader } from '@langchain/community/document_loaders/fs/epub'; -import { CSVLoader } from '@langchain/community/document_loaders/fs/csv'; +import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf'; -import puppeteer, { Page } from 'puppeteer'; -import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; -import { Document } from 'langchain/document'; import getColors from 'get-image-colors'; +import { Document } from 'langchain/document'; +import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; +import puppeteer, { Page } from 'puppeteer'; import { GBDialogStep, @@ -61,27 +62,27 @@ import { IGBInstance, IGBKBService } from 'botlib'; +import mammoth from 'mammoth'; +import { parse } from 'node-html-parser'; +import pdf from 'pdf-extraction'; import { CollectionUtil } from 'pragmatismo-io-framework'; import { Op } from 'sequelize'; import { Sequelize } from 'sequelize-typescript'; +import textract from 'textract'; +import { GBUtil } from '../../../src/util.js'; +import { GBAdminService } from '../../admin.gbapp/services/GBAdminService.js'; import { AzureDeployerService } from '../../azuredeployer.gbapp/services/AzureDeployerService.js'; +import { DialogKeywords } from '../../basic.gblib/services/DialogKeywords.js'; +import { GBVMService } from '../../basic.gblib/services/GBVMService.js'; import { GuaribasPackage } from '../../core.gbapp/models/GBModel.js'; import { GBDeployer } from '../../core.gbapp/services/GBDeployer.js'; +import { GBLogEx } from '../../core.gbapp/services/GBLogEx.js'; +import { GBMinService } from '../../core.gbapp/services/GBMinService.js'; +import { GBSSR } from '../../core.gbapp/services/GBSSR.js'; import { CSService } from '../../customer-satisfaction.gbapp/services/CSService.js'; +import { ChatServices } from '../../llm.gblib/services/ChatServices.js'; import { GuaribasAnswer, GuaribasQuestion, GuaribasSubject } from '../models/index.js'; import { GBConfigService } from './../../core.gbapp/services/GBConfigService.js'; -import { parse } from 'node-html-parser'; -import textract from 'textract'; -import pdf from 'pdf-extraction'; -import { GBSSR } from '../../core.gbapp/services/GBSSR.js'; -import { GBLogEx } from '../../core.gbapp/services/GBLogEx.js'; -import mammoth from 'mammoth'; -import { GBAdminService } from '../../admin.gbapp/services/GBAdminService.js'; -import { GBVMService } from '../../basic.gblib/services/GBVMService.js'; -import { DialogKeywords } from '../../basic.gblib/services/DialogKeywords.js'; -import { GBMinService } from '../../core.gbapp/services/GBMinService.js'; -import { ChatServices } from '../../llm.gblib/services/ChatServices.js'; -import { GBUtil } from '../../../src/util.js'; /** * Result for quey on KB data. @@ -690,7 +691,7 @@ export class KBService implements IGBKBService { // Imports menu.xlsx if any. - if (await GBUtil.exists(subjectFile) || await GBUtil.exists(menuFile)) { + if ((await GBUtil.exists(subjectFile)) || (await GBUtil.exists(menuFile))) { await this.importSubjectFile(packageStorage.packageId, subjectFile, menuFile, instance); } @@ -881,18 +882,16 @@ export class KBService implements IGBKBService { return []; } - await GBLogEx.info(min, `Processing URL: ${url}.`); + await GBLogEx.info(min, `Crawling: ${url}.`); visited.add(url); - + const packagePath = GBUtil.getGBAIPath(min.botId, `gbot`); const directoryPath = path.join(process.env.PWD, 'work', packagePath, 'Website'); - const filename = await GBUtil.savePage(url, page, directoryPath); + const filename = await KBService.savePage(min, url, page, directoryPath); if (!filename) { - // If the URL doesn't represent an HTML/PDF page, skip crawling its links return []; - } const currentDomain = new URL(page.url()).hostname; @@ -1085,14 +1084,21 @@ export class KBService implements IGBKBService { files.shift(); + GBLogEx.info(min, `Vectorizing ${files.length} file(s)...`); + await CollectionUtil.asyncForEach(files, async file => { let content = null; - const document = await this.loadAndSplitFile(file); - const flattenedDocuments = document.reduce((acc, val) => acc.concat(val), []); - const vectorStore = min['vectorStore']; - await vectorStore.addDocuments(flattenedDocuments); - await vectorStore.save(min['vectorStorePath']); + try { + const document = await this.loadAndSplitFile(file); + const flattenedDocuments = document.reduce((acc, val) => acc.concat(val), []); + const vectorStore = min['vectorStore']; + await vectorStore.addDocuments(flattenedDocuments); + await vectorStore.save(min['vectorStorePath']); + } catch (error) { + GBLogEx.info(min, `Ignore processing of ${file}. ${GBUtil.toYAML(error)}`); + } + }); } @@ -1420,4 +1426,73 @@ export class KBService implements IGBKBService { }); }); } + + public static async savePage( + min: GBMinInstance, + url: string, + page: Page, + directoryPath: string + ): Promise { + try { + + // Check if the directory exists, create it if not. + + const directoryExists = await GBUtil.exists(directoryPath); + if (!directoryExists) { + await fs.mkdir(directoryPath, { recursive: true }); // Create directory if it doesn't exist + } + + // Check if the URL is for a downloadable file (e.g., .pdf). + + if ( + url.endsWith('.pdf') || + url.endsWith('.docx') || + url.endsWith('.csv') || + url.endsWith('.epub') || + url.endsWith('.xml') || + url.endsWith('.json') || + url.endsWith('.txt') + ) { + const response = await fetch(url); + + if (!response.ok) { + throw new Error('Failed to download the file'); + } + + const buffer = await response.arrayBuffer(); // Convert response to array buffer + const fileName = path.basename(url); // Extract file name from URL + const filePath = path.join(directoryPath, fileName); // Create file path + + const data = new Uint8Array(buffer); + await fs.writeFile(filePath, data); + + return filePath; // Return the saved file path + } else { + await page.goto(url, { waitUntil: 'networkidle2' }); + + const parsedUrl = new URL(url); + + // Get the last part of the URL path or default to 'index' if empty + const pathParts = parsedUrl.pathname.split('/').filter(Boolean); // Remove empty parts + const lastPath = pathParts.length > 0 ? pathParts[pathParts.length - 1] : 'index'; + const flatLastPath = lastPath.replace(/\W+/g, '-'); // Flatten the last part of the path + + const fileName = `${flatLastPath}.html`; + const filePath = path.join(directoryPath, fileName); + + const htmlContent = await page.content(); + + // Convert HTML to Markdown using html2md + const markdownContent = html2md(htmlContent); + + // Write Markdown content to file + await fs.writeFile(filePath, markdownContent); + + return filePath; + } + } catch (error) { + GBLogEx.info(min, `Cannot save: ${url}. ${GBUtil.toYAML(error)}`); + return null; + } + } } diff --git a/src/util.ts b/src/util.ts index ab9130fd..ad014fac 100644 --- a/src/util.ts +++ b/src/util.ts @@ -44,7 +44,6 @@ VerbosityLevel.WARNINGS=0; VerbosityLevel.INFOS=0; import { Page } from 'puppeteer'; import urljoin from 'url-join'; -import html2md from 'html-to-md'; export class GBUtil { public static repeat(chr, count) { @@ -103,11 +102,17 @@ export class GBUtil { return acc; }, {}); }; - + const extractedError = extractProps(data); - return YAML.stringify(extractedError); + + // Inline formatting for logs + return YAML.stringify(extractedError, { + indent: 2, // Defines the indentation + flowLevel: -1, // Forces inline formatting + styles: { '!!null': 'canonical' } // Optional: Customize null display + } as any); } - + public static sleep(ms) { return new Promise(resolve => { setTimeout(resolve, ms); @@ -141,68 +146,6 @@ export class GBUtil { return false; // File does not exist } } - - public static async savePage(url: string, page: Page, directoryPath: string): Promise { - try { - // Check if the directory exists, create it if not - const directoryExists = await this.fileExists(directoryPath); - if (!directoryExists) { - await fs.mkdir(directoryPath, { recursive: true }); // Create directory if it doesn't exist - } - - // Check if the URL is for a downloadable file (e.g., .pdf) - if (url.endsWith('.pdf')) { - const response = await fetch(url); - - if (!response.ok) { - throw new Error('Failed to download the file'); - } - - const buffer = await response.arrayBuffer(); // Convert response to array buffer - const fileName = path.basename(url); // Extract file name from URL - const filePath = path.join(directoryPath, fileName); // Create file path - - const data = new Uint8Array(buffer); - const text = await GBUtil.getPdfText(data); - - // Write the buffer to the file asynchronously - await fs.writeFile(filePath, text); - - return filePath; // Return the saved file path - } else { - // Use Puppeteer for non-downloadable pages - - const parsedUrl = new URL(url); - - // Get the last part of the URL path or default to 'index' if empty - const pathParts = parsedUrl.pathname.split('/').filter(Boolean); // Remove empty parts - const lastPath = pathParts.length > 0 ? pathParts[pathParts.length - 1] : 'index'; - const flatLastPath = lastPath.replace(/\W+/g, '-'); // Flatten the last part of the path - - const fileName = `${flatLastPath}.html`; - const filePath = path.join(directoryPath, fileName); - - const htmlContent = await page.content(); - - // Write HTML content asynchronously - await fs.writeFile(filePath, htmlContent); - - return filePath; - } - } catch (error) { - console.error('Error saving page:', error); - return null; - } - } - - public static async fileExists(filePath: string): Promise { - try { - await fs.access(filePath); - return true; - } catch (error) { - return false; - } - } public static async copyIfNewerRecursive(src, dest) {