fix (all): path and fs normalization.

This commit is contained in:
Rodrigo Rodriguez 2024-09-08 16:48:26 -03:00
parent fb348599cf
commit 145406cab3
3 changed files with 146 additions and 129 deletions

View file

@ -33,21 +33,9 @@
*/
'use strict';
import cliProgress from 'cli-progress';
import { DialogSet, TextPrompt } from 'botbuilder-dialogs';
import SwaggerClient from 'swagger-client';
import removeRoute from 'express-remove-route';
import AuthenticationContext from 'adal-node';
import { FacebookAdapter } from 'botbuilder-adapter-facebook';
import mkdirp from 'mkdirp';
import fs from 'fs/promises';
import arrayBufferToBuffer from 'arraybuffer-to-buffer';
import { NlpManager } from 'node-nlp';
import Koa from 'koa';
import { v2 as webdav } from 'webdav-server';
import { createRpcServer } from '@push-rpc/core';
import { start as startRouter } from '../../../packages/core.gbapp/services/router/bridge.js';
import wash from 'washyourmouthoutwithsoap';
import AuthenticationContext from 'adal-node';
import arrayBufferToBuffer from 'arraybuffer-to-buffer';
import {
AutoSaveStateMiddleware,
BotFrameworkAdapter,
@ -56,7 +44,9 @@ import {
TurnContext,
UserState
} from 'botbuilder';
import { AttachmentPrompt, ConfirmPrompt, OAuthPrompt, WaterfallDialog } from 'botbuilder-dialogs';
import { FacebookAdapter } from 'botbuilder-adapter-facebook';
import { AttachmentPrompt, ConfirmPrompt, DialogSet, OAuthPrompt, TextPrompt, WaterfallDialog } from 'botbuilder-dialogs';
import { MicrosoftAppCredentials } from 'botframework-connector';
import {
GBDialogStep,
GBLog,
@ -67,13 +57,33 @@ import {
IGBInstance,
IGBPackage
} from 'botlib';
import cliProgress from 'cli-progress';
import removeRoute from 'express-remove-route';
import fs from 'fs/promises';
import Koa from 'koa';
import mkdirp from 'mkdirp';
import { NlpManager } from 'node-nlp';
import path from 'path';
import { CollectionUtil } from 'pragmatismo-io-framework';
import { MicrosoftAppCredentials } from 'botframework-connector';
import SwaggerClient from 'swagger-client';
import urlJoin from 'url-join';
import wash from 'washyourmouthoutwithsoap';
import { v2 as webdav } from 'webdav-server';
import { start as startRouter } from '../../../packages/core.gbapp/services/router/bridge.js';
import { GBServer } from '../../../src/app.js';
import { GBUtil } from '../../../src/util.js';
import { GBAdminService } from '../../admin.gbapp/services/GBAdminService.js';
import { GuaribasConversationMessage } from '../../analytics.gblib/models/index.js';
import { AnalyticsService } from '../../analytics.gblib/services/AnalyticsService.js';
import { createKoaHttpServer } from '../../basic.gblib/index.js';
import { DebuggerService } from '../../basic.gblib/services/DebuggerService.js';
import { DialogKeywords } from '../../basic.gblib/services/DialogKeywords.js';
import { GBVMService } from '../../basic.gblib/services/GBVMService.js';
import { ImageProcessingServices } from '../../basic.gblib/services/ImageProcessingServices.js';
import { ScheduleServices } from '../../basic.gblib/services/ScheduleServices.js';
import { SystemKeywords } from '../../basic.gblib/services/SystemKeywords.js';
import { WebAutomationServices } from '../../basic.gblib/services/WebAutomationServices.js';
import { GoogleChatDirectLine } from '../../google-chat.gblib/services/GoogleChatDirectLine.js';
import { AskDialogArgs } from '../../kb.gbapp/dialogs/AskDialog.js';
import { KBService } from '../../kb.gbapp/services/KBService.js';
import { SecService } from '../../security.gbapp/services/SecService.js';
@ -82,19 +92,8 @@ import { Messages } from '../strings.js';
import { GBConfigService } from './GBConfigService.js';
import { GBConversationalService } from './GBConversationalService.js';
import { GBDeployer } from './GBDeployer.js';
import urlJoin from 'url-join';
import { GoogleChatDirectLine } from '../../google-chat.gblib/services/GoogleChatDirectLine.js';
import { SystemKeywords } from '../../basic.gblib/services/SystemKeywords.js';
import path from 'path';
import { GBSSR } from './GBSSR.js';
import { DialogKeywords } from '../../basic.gblib/services/DialogKeywords.js';
import { GBLogEx } from './GBLogEx.js';
import { WebAutomationServices } from '../../basic.gblib/services/WebAutomationServices.js';
import { createKoaHttpServer } from '../../basic.gblib/index.js';
import { DebuggerService } from '../../basic.gblib/services/DebuggerService.js';
import { ImageProcessingServices } from '../../basic.gblib/services/ImageProcessingServices.js';
import { ScheduleServices } from '../../basic.gblib/services/ScheduleServices.js';
import { GBUtil } from '../../../src/util.js';
import { GBSSR } from './GBSSR.js';
/**
* Minimal service layer for a bot and encapsulation of BOT Framework calls.

View file

@ -31,26 +31,27 @@
/**
* @fileoverview Knowledge base services and logic.
*/
import path from 'path';
import fs from 'fs/promises';
import urlJoin from 'url-join';
import asyncPromise from 'async-promises';
import walkPromise from 'walk-promise';
import { SearchClient } from '@azure/search-documents';
import asyncPromise from 'async-promises';
import Excel from 'exceljs';
import getSlug from 'speakingurl';
import { GBServer } from '../../../src/app.js';
import fs from 'fs/promises';
import html2md from 'html-to-md';
import { JSONLoader } from 'langchain/document_loaders/fs/json';
import { TextLoader } from 'langchain/document_loaders/fs/text';
import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf';
import path from 'path';
import getSlug from 'speakingurl';
import urlJoin from 'url-join';
import walkPromise from 'walk-promise';
import { GBServer } from '../../../src/app.js';
import { CSVLoader } from '@langchain/community/document_loaders/fs/csv';
import { DocxLoader } from '@langchain/community/document_loaders/fs/docx';
import { EPubLoader } from '@langchain/community/document_loaders/fs/epub';
import { CSVLoader } from '@langchain/community/document_loaders/fs/csv';
import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf';
import puppeteer, { Page } from 'puppeteer';
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { Document } from 'langchain/document';
import getColors from 'get-image-colors';
import { Document } from 'langchain/document';
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import puppeteer, { Page } from 'puppeteer';
import {
GBDialogStep,
@ -61,27 +62,27 @@ import {
IGBInstance,
IGBKBService
} from 'botlib';
import mammoth from 'mammoth';
import { parse } from 'node-html-parser';
import pdf from 'pdf-extraction';
import { CollectionUtil } from 'pragmatismo-io-framework';
import { Op } from 'sequelize';
import { Sequelize } from 'sequelize-typescript';
import textract from 'textract';
import { GBUtil } from '../../../src/util.js';
import { GBAdminService } from '../../admin.gbapp/services/GBAdminService.js';
import { AzureDeployerService } from '../../azuredeployer.gbapp/services/AzureDeployerService.js';
import { DialogKeywords } from '../../basic.gblib/services/DialogKeywords.js';
import { GBVMService } from '../../basic.gblib/services/GBVMService.js';
import { GuaribasPackage } from '../../core.gbapp/models/GBModel.js';
import { GBDeployer } from '../../core.gbapp/services/GBDeployer.js';
import { GBLogEx } from '../../core.gbapp/services/GBLogEx.js';
import { GBMinService } from '../../core.gbapp/services/GBMinService.js';
import { GBSSR } from '../../core.gbapp/services/GBSSR.js';
import { CSService } from '../../customer-satisfaction.gbapp/services/CSService.js';
import { ChatServices } from '../../llm.gblib/services/ChatServices.js';
import { GuaribasAnswer, GuaribasQuestion, GuaribasSubject } from '../models/index.js';
import { GBConfigService } from './../../core.gbapp/services/GBConfigService.js';
import { parse } from 'node-html-parser';
import textract from 'textract';
import pdf from 'pdf-extraction';
import { GBSSR } from '../../core.gbapp/services/GBSSR.js';
import { GBLogEx } from '../../core.gbapp/services/GBLogEx.js';
import mammoth from 'mammoth';
import { GBAdminService } from '../../admin.gbapp/services/GBAdminService.js';
import { GBVMService } from '../../basic.gblib/services/GBVMService.js';
import { DialogKeywords } from '../../basic.gblib/services/DialogKeywords.js';
import { GBMinService } from '../../core.gbapp/services/GBMinService.js';
import { ChatServices } from '../../llm.gblib/services/ChatServices.js';
import { GBUtil } from '../../../src/util.js';
/**
* Result for quey on KB data.
@ -690,7 +691,7 @@ export class KBService implements IGBKBService {
// Imports menu.xlsx if any.
if (await GBUtil.exists(subjectFile) || await GBUtil.exists(menuFile)) {
if ((await GBUtil.exists(subjectFile)) || (await GBUtil.exists(menuFile))) {
await this.importSubjectFile(packageStorage.packageId, subjectFile, menuFile, instance);
}
@ -881,18 +882,16 @@ export class KBService implements IGBKBService {
return [];
}
await GBLogEx.info(min, `Processing URL: ${url}.`);
await GBLogEx.info(min, `Crawling: ${url}.`);
visited.add(url);
const packagePath = GBUtil.getGBAIPath(min.botId, `gbot`);
const directoryPath = path.join(process.env.PWD, 'work', packagePath, 'Website');
const filename = await GBUtil.savePage(url, page, directoryPath);
const filename = await KBService.savePage(min, url, page, directoryPath);
if (!filename) {
// If the URL doesn't represent an HTML/PDF page, skip crawling its links
return [];
}
const currentDomain = new URL(page.url()).hostname;
@ -1085,14 +1084,21 @@ export class KBService implements IGBKBService {
files.shift();
GBLogEx.info(min, `Vectorizing ${files.length} file(s)...`);
await CollectionUtil.asyncForEach(files, async file => {
let content = null;
try {
const document = await this.loadAndSplitFile(file);
const flattenedDocuments = document.reduce((acc, val) => acc.concat(val), []);
const vectorStore = min['vectorStore'];
await vectorStore.addDocuments(flattenedDocuments);
await vectorStore.save(min['vectorStorePath']);
} catch (error) {
GBLogEx.info(min, `Ignore processing of ${file}. ${GBUtil.toYAML(error)}`);
}
});
}
@ -1420,4 +1426,73 @@ export class KBService implements IGBKBService {
});
});
}
public static async savePage(
min: GBMinInstance,
url: string,
page: Page,
directoryPath: string
): Promise<string | null> {
try {
// Check if the directory exists, create it if not.
const directoryExists = await GBUtil.exists(directoryPath);
if (!directoryExists) {
await fs.mkdir(directoryPath, { recursive: true }); // Create directory if it doesn't exist
}
// Check if the URL is for a downloadable file (e.g., .pdf).
if (
url.endsWith('.pdf') ||
url.endsWith('.docx') ||
url.endsWith('.csv') ||
url.endsWith('.epub') ||
url.endsWith('.xml') ||
url.endsWith('.json') ||
url.endsWith('.txt')
) {
const response = await fetch(url);
if (!response.ok) {
throw new Error('Failed to download the file');
}
const buffer = await response.arrayBuffer(); // Convert response to array buffer
const fileName = path.basename(url); // Extract file name from URL
const filePath = path.join(directoryPath, fileName); // Create file path
const data = new Uint8Array(buffer);
await fs.writeFile(filePath, data);
return filePath; // Return the saved file path
} else {
await page.goto(url, { waitUntil: 'networkidle2' });
const parsedUrl = new URL(url);
// Get the last part of the URL path or default to 'index' if empty
const pathParts = parsedUrl.pathname.split('/').filter(Boolean); // Remove empty parts
const lastPath = pathParts.length > 0 ? pathParts[pathParts.length - 1] : 'index';
const flatLastPath = lastPath.replace(/\W+/g, '-'); // Flatten the last part of the path
const fileName = `${flatLastPath}.html`;
const filePath = path.join(directoryPath, fileName);
const htmlContent = await page.content();
// Convert HTML to Markdown using html2md
const markdownContent = html2md(htmlContent);
// Write Markdown content to file
await fs.writeFile(filePath, markdownContent);
return filePath;
}
} catch (error) {
GBLogEx.info(min, `Cannot save: ${url}. ${GBUtil.toYAML(error)}`);
return null;
}
}
}

View file

@ -44,7 +44,6 @@ VerbosityLevel.WARNINGS=0;
VerbosityLevel.INFOS=0;
import { Page } from 'puppeteer';
import urljoin from 'url-join';
import html2md from 'html-to-md';
export class GBUtil {
public static repeat(chr, count) {
@ -105,7 +104,13 @@ export class GBUtil {
};
const extractedError = extractProps(data);
return YAML.stringify(extractedError);
// Inline formatting for logs
return YAML.stringify(extractedError, {
indent: 2, // Defines the indentation
flowLevel: -1, // Forces inline formatting
styles: { '!!null': 'canonical' } // Optional: Customize null display
} as any);
}
public static sleep(ms) {
@ -142,68 +147,6 @@ export class GBUtil {
}
}
public static async savePage(url: string, page: Page, directoryPath: string): Promise<string | null> {
try {
// Check if the directory exists, create it if not
const directoryExists = await this.fileExists(directoryPath);
if (!directoryExists) {
await fs.mkdir(directoryPath, { recursive: true }); // Create directory if it doesn't exist
}
// Check if the URL is for a downloadable file (e.g., .pdf)
if (url.endsWith('.pdf')) {
const response = await fetch(url);
if (!response.ok) {
throw new Error('Failed to download the file');
}
const buffer = await response.arrayBuffer(); // Convert response to array buffer
const fileName = path.basename(url); // Extract file name from URL
const filePath = path.join(directoryPath, fileName); // Create file path
const data = new Uint8Array(buffer);
const text = await GBUtil.getPdfText(data);
// Write the buffer to the file asynchronously
await fs.writeFile(filePath, text);
return filePath; // Return the saved file path
} else {
// Use Puppeteer for non-downloadable pages
const parsedUrl = new URL(url);
// Get the last part of the URL path or default to 'index' if empty
const pathParts = parsedUrl.pathname.split('/').filter(Boolean); // Remove empty parts
const lastPath = pathParts.length > 0 ? pathParts[pathParts.length - 1] : 'index';
const flatLastPath = lastPath.replace(/\W+/g, '-'); // Flatten the last part of the path
const fileName = `${flatLastPath}.html`;
const filePath = path.join(directoryPath, fileName);
const htmlContent = await page.content();
// Write HTML content asynchronously
await fs.writeFile(filePath, htmlContent);
return filePath;
}
} catch (error) {
console.error('Error saving page:', error);
return null;
}
}
public static async fileExists(filePath: string): Promise<boolean> {
try {
await fs.access(filePath);
return true;
} catch (error) {
return false;
}
}
public static async copyIfNewerRecursive(src, dest) {
if (!await GBUtil.exists(src)) {