fix (all): path and fs normalization.
This commit is contained in:
		
							parent
							
								
									fb348599cf
								
							
						
					
					
						commit
						145406cab3
					
				
					 3 changed files with 146 additions and 129 deletions
				
			
		| 
						 | 
					@ -33,21 +33,9 @@
 | 
				
			||||||
 */
 | 
					 */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
'use strict';
 | 
					'use strict';
 | 
				
			||||||
import cliProgress from 'cli-progress';
 | 
					 | 
				
			||||||
import { DialogSet, TextPrompt } from 'botbuilder-dialogs';
 | 
					 | 
				
			||||||
import SwaggerClient from 'swagger-client';
 | 
					 | 
				
			||||||
import removeRoute from 'express-remove-route';
 | 
					 | 
				
			||||||
import AuthenticationContext from 'adal-node';
 | 
					 | 
				
			||||||
import { FacebookAdapter } from 'botbuilder-adapter-facebook';
 | 
					 | 
				
			||||||
import mkdirp from 'mkdirp';
 | 
					 | 
				
			||||||
import fs from 'fs/promises'; 
 | 
					 | 
				
			||||||
import arrayBufferToBuffer from 'arraybuffer-to-buffer';
 | 
					 | 
				
			||||||
import { NlpManager } from 'node-nlp';
 | 
					 | 
				
			||||||
import Koa from 'koa';
 | 
					 | 
				
			||||||
import { v2 as webdav } from 'webdav-server';
 | 
					 | 
				
			||||||
import { createRpcServer } from '@push-rpc/core';
 | 
					import { createRpcServer } from '@push-rpc/core';
 | 
				
			||||||
import { start as startRouter } from '../../../packages/core.gbapp/services/router/bridge.js';
 | 
					import AuthenticationContext from 'adal-node';
 | 
				
			||||||
import wash from 'washyourmouthoutwithsoap';
 | 
					import arrayBufferToBuffer from 'arraybuffer-to-buffer';
 | 
				
			||||||
import {
 | 
					import {
 | 
				
			||||||
  AutoSaveStateMiddleware,
 | 
					  AutoSaveStateMiddleware,
 | 
				
			||||||
  BotFrameworkAdapter,
 | 
					  BotFrameworkAdapter,
 | 
				
			||||||
| 
						 | 
					@ -56,7 +44,9 @@ import {
 | 
				
			||||||
  TurnContext,
 | 
					  TurnContext,
 | 
				
			||||||
  UserState
 | 
					  UserState
 | 
				
			||||||
} from 'botbuilder';
 | 
					} from 'botbuilder';
 | 
				
			||||||
import { AttachmentPrompt, ConfirmPrompt, OAuthPrompt, WaterfallDialog } from 'botbuilder-dialogs';
 | 
					import { FacebookAdapter } from 'botbuilder-adapter-facebook';
 | 
				
			||||||
 | 
					import { AttachmentPrompt, ConfirmPrompt, DialogSet, OAuthPrompt, TextPrompt, WaterfallDialog } from 'botbuilder-dialogs';
 | 
				
			||||||
 | 
					import { MicrosoftAppCredentials } from 'botframework-connector';
 | 
				
			||||||
import {
 | 
					import {
 | 
				
			||||||
  GBDialogStep,
 | 
					  GBDialogStep,
 | 
				
			||||||
  GBLog,
 | 
					  GBLog,
 | 
				
			||||||
| 
						 | 
					@ -67,13 +57,33 @@ import {
 | 
				
			||||||
  IGBInstance,
 | 
					  IGBInstance,
 | 
				
			||||||
  IGBPackage
 | 
					  IGBPackage
 | 
				
			||||||
} from 'botlib';
 | 
					} from 'botlib';
 | 
				
			||||||
 | 
					import cliProgress from 'cli-progress';
 | 
				
			||||||
 | 
					import removeRoute from 'express-remove-route';
 | 
				
			||||||
 | 
					import fs from 'fs/promises';
 | 
				
			||||||
 | 
					import Koa from 'koa';
 | 
				
			||||||
 | 
					import mkdirp from 'mkdirp';
 | 
				
			||||||
 | 
					import { NlpManager } from 'node-nlp';
 | 
				
			||||||
 | 
					import path from 'path';
 | 
				
			||||||
import { CollectionUtil } from 'pragmatismo-io-framework';
 | 
					import { CollectionUtil } from 'pragmatismo-io-framework';
 | 
				
			||||||
import { MicrosoftAppCredentials } from 'botframework-connector';
 | 
					import SwaggerClient from 'swagger-client';
 | 
				
			||||||
 | 
					import urlJoin from 'url-join';
 | 
				
			||||||
 | 
					import wash from 'washyourmouthoutwithsoap';
 | 
				
			||||||
 | 
					import { v2 as webdav } from 'webdav-server';
 | 
				
			||||||
 | 
					import { start as startRouter } from '../../../packages/core.gbapp/services/router/bridge.js';
 | 
				
			||||||
import { GBServer } from '../../../src/app.js';
 | 
					import { GBServer } from '../../../src/app.js';
 | 
				
			||||||
 | 
					import { GBUtil } from '../../../src/util.js';
 | 
				
			||||||
import { GBAdminService } from '../../admin.gbapp/services/GBAdminService.js';
 | 
					import { GBAdminService } from '../../admin.gbapp/services/GBAdminService.js';
 | 
				
			||||||
import { GuaribasConversationMessage } from '../../analytics.gblib/models/index.js';
 | 
					import { GuaribasConversationMessage } from '../../analytics.gblib/models/index.js';
 | 
				
			||||||
import { AnalyticsService } from '../../analytics.gblib/services/AnalyticsService.js';
 | 
					import { AnalyticsService } from '../../analytics.gblib/services/AnalyticsService.js';
 | 
				
			||||||
 | 
					import { createKoaHttpServer } from '../../basic.gblib/index.js';
 | 
				
			||||||
 | 
					import { DebuggerService } from '../../basic.gblib/services/DebuggerService.js';
 | 
				
			||||||
 | 
					import { DialogKeywords } from '../../basic.gblib/services/DialogKeywords.js';
 | 
				
			||||||
import { GBVMService } from '../../basic.gblib/services/GBVMService.js';
 | 
					import { GBVMService } from '../../basic.gblib/services/GBVMService.js';
 | 
				
			||||||
 | 
					import { ImageProcessingServices } from '../../basic.gblib/services/ImageProcessingServices.js';
 | 
				
			||||||
 | 
					import { ScheduleServices } from '../../basic.gblib/services/ScheduleServices.js';
 | 
				
			||||||
 | 
					import { SystemKeywords } from '../../basic.gblib/services/SystemKeywords.js';
 | 
				
			||||||
 | 
					import { WebAutomationServices } from '../../basic.gblib/services/WebAutomationServices.js';
 | 
				
			||||||
 | 
					import { GoogleChatDirectLine } from '../../google-chat.gblib/services/GoogleChatDirectLine.js';
 | 
				
			||||||
import { AskDialogArgs } from '../../kb.gbapp/dialogs/AskDialog.js';
 | 
					import { AskDialogArgs } from '../../kb.gbapp/dialogs/AskDialog.js';
 | 
				
			||||||
import { KBService } from '../../kb.gbapp/services/KBService.js';
 | 
					import { KBService } from '../../kb.gbapp/services/KBService.js';
 | 
				
			||||||
import { SecService } from '../../security.gbapp/services/SecService.js';
 | 
					import { SecService } from '../../security.gbapp/services/SecService.js';
 | 
				
			||||||
| 
						 | 
					@ -82,19 +92,8 @@ import { Messages } from '../strings.js';
 | 
				
			||||||
import { GBConfigService } from './GBConfigService.js';
 | 
					import { GBConfigService } from './GBConfigService.js';
 | 
				
			||||||
import { GBConversationalService } from './GBConversationalService.js';
 | 
					import { GBConversationalService } from './GBConversationalService.js';
 | 
				
			||||||
import { GBDeployer } from './GBDeployer.js';
 | 
					import { GBDeployer } from './GBDeployer.js';
 | 
				
			||||||
import urlJoin from 'url-join';
 | 
					 | 
				
			||||||
import { GoogleChatDirectLine } from '../../google-chat.gblib/services/GoogleChatDirectLine.js';
 | 
					 | 
				
			||||||
import { SystemKeywords } from '../../basic.gblib/services/SystemKeywords.js';
 | 
					 | 
				
			||||||
import path from 'path';
 | 
					 | 
				
			||||||
import { GBSSR } from './GBSSR.js';
 | 
					 | 
				
			||||||
import { DialogKeywords } from '../../basic.gblib/services/DialogKeywords.js';
 | 
					 | 
				
			||||||
import { GBLogEx } from './GBLogEx.js';
 | 
					import { GBLogEx } from './GBLogEx.js';
 | 
				
			||||||
import { WebAutomationServices } from '../../basic.gblib/services/WebAutomationServices.js';
 | 
					import { GBSSR } from './GBSSR.js';
 | 
				
			||||||
import { createKoaHttpServer } from '../../basic.gblib/index.js';
 | 
					 | 
				
			||||||
import { DebuggerService } from '../../basic.gblib/services/DebuggerService.js';
 | 
					 | 
				
			||||||
import { ImageProcessingServices } from '../../basic.gblib/services/ImageProcessingServices.js';
 | 
					 | 
				
			||||||
import { ScheduleServices } from '../../basic.gblib/services/ScheduleServices.js';
 | 
					 | 
				
			||||||
import { GBUtil } from '../../../src/util.js';
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
/**
 | 
					/**
 | 
				
			||||||
 * Minimal service layer for a bot and encapsulation of BOT Framework calls.
 | 
					 * Minimal service layer for a bot and encapsulation of BOT Framework calls.
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -31,26 +31,27 @@
 | 
				
			||||||
/**
 | 
					/**
 | 
				
			||||||
 * @fileoverview Knowledge base services and logic.
 | 
					 * @fileoverview Knowledge base services and logic.
 | 
				
			||||||
 */
 | 
					 */
 | 
				
			||||||
import path from 'path';
 | 
					 | 
				
			||||||
import fs from 'fs/promises'; 
 | 
					 | 
				
			||||||
import urlJoin from 'url-join';
 | 
					 | 
				
			||||||
import asyncPromise from 'async-promises';
 | 
					 | 
				
			||||||
import walkPromise from 'walk-promise';
 | 
					 | 
				
			||||||
import { SearchClient } from '@azure/search-documents';
 | 
					import { SearchClient } from '@azure/search-documents';
 | 
				
			||||||
 | 
					import asyncPromise from 'async-promises';
 | 
				
			||||||
import Excel from 'exceljs';
 | 
					import Excel from 'exceljs';
 | 
				
			||||||
import getSlug from 'speakingurl';
 | 
					import fs from 'fs/promises';
 | 
				
			||||||
import { GBServer } from '../../../src/app.js';
 | 
					import html2md from 'html-to-md';
 | 
				
			||||||
import { JSONLoader } from 'langchain/document_loaders/fs/json';
 | 
					import { JSONLoader } from 'langchain/document_loaders/fs/json';
 | 
				
			||||||
import { TextLoader } from 'langchain/document_loaders/fs/text';
 | 
					import { TextLoader } from 'langchain/document_loaders/fs/text';
 | 
				
			||||||
import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf';
 | 
					import path from 'path';
 | 
				
			||||||
 | 
					import getSlug from 'speakingurl';
 | 
				
			||||||
 | 
					import urlJoin from 'url-join';
 | 
				
			||||||
 | 
					import walkPromise from 'walk-promise';
 | 
				
			||||||
 | 
					import { GBServer } from '../../../src/app.js';
 | 
				
			||||||
 | 
					import { CSVLoader } from '@langchain/community/document_loaders/fs/csv';
 | 
				
			||||||
import { DocxLoader } from '@langchain/community/document_loaders/fs/docx';
 | 
					import { DocxLoader } from '@langchain/community/document_loaders/fs/docx';
 | 
				
			||||||
import { EPubLoader } from '@langchain/community/document_loaders/fs/epub';
 | 
					import { EPubLoader } from '@langchain/community/document_loaders/fs/epub';
 | 
				
			||||||
import { CSVLoader } from '@langchain/community/document_loaders/fs/csv';
 | 
					import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf';
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import puppeteer, { Page } from 'puppeteer';
 | 
					 | 
				
			||||||
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
 | 
					 | 
				
			||||||
import { Document } from 'langchain/document';
 | 
					 | 
				
			||||||
import getColors from 'get-image-colors';
 | 
					import getColors from 'get-image-colors';
 | 
				
			||||||
 | 
					import { Document } from 'langchain/document';
 | 
				
			||||||
 | 
					import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
 | 
				
			||||||
 | 
					import puppeteer, { Page } from 'puppeteer';
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import {
 | 
					import {
 | 
				
			||||||
  GBDialogStep,
 | 
					  GBDialogStep,
 | 
				
			||||||
| 
						 | 
					@ -61,27 +62,27 @@ import {
 | 
				
			||||||
  IGBInstance,
 | 
					  IGBInstance,
 | 
				
			||||||
  IGBKBService
 | 
					  IGBKBService
 | 
				
			||||||
} from 'botlib';
 | 
					} from 'botlib';
 | 
				
			||||||
 | 
					import mammoth from 'mammoth';
 | 
				
			||||||
 | 
					import { parse } from 'node-html-parser';
 | 
				
			||||||
 | 
					import pdf from 'pdf-extraction';
 | 
				
			||||||
import { CollectionUtil } from 'pragmatismo-io-framework';
 | 
					import { CollectionUtil } from 'pragmatismo-io-framework';
 | 
				
			||||||
import { Op } from 'sequelize';
 | 
					import { Op } from 'sequelize';
 | 
				
			||||||
import { Sequelize } from 'sequelize-typescript';
 | 
					import { Sequelize } from 'sequelize-typescript';
 | 
				
			||||||
 | 
					import textract from 'textract';
 | 
				
			||||||
 | 
					import { GBUtil } from '../../../src/util.js';
 | 
				
			||||||
 | 
					import { GBAdminService } from '../../admin.gbapp/services/GBAdminService.js';
 | 
				
			||||||
import { AzureDeployerService } from '../../azuredeployer.gbapp/services/AzureDeployerService.js';
 | 
					import { AzureDeployerService } from '../../azuredeployer.gbapp/services/AzureDeployerService.js';
 | 
				
			||||||
 | 
					import { DialogKeywords } from '../../basic.gblib/services/DialogKeywords.js';
 | 
				
			||||||
 | 
					import { GBVMService } from '../../basic.gblib/services/GBVMService.js';
 | 
				
			||||||
import { GuaribasPackage } from '../../core.gbapp/models/GBModel.js';
 | 
					import { GuaribasPackage } from '../../core.gbapp/models/GBModel.js';
 | 
				
			||||||
import { GBDeployer } from '../../core.gbapp/services/GBDeployer.js';
 | 
					import { GBDeployer } from '../../core.gbapp/services/GBDeployer.js';
 | 
				
			||||||
 | 
					import { GBLogEx } from '../../core.gbapp/services/GBLogEx.js';
 | 
				
			||||||
 | 
					import { GBMinService } from '../../core.gbapp/services/GBMinService.js';
 | 
				
			||||||
 | 
					import { GBSSR } from '../../core.gbapp/services/GBSSR.js';
 | 
				
			||||||
import { CSService } from '../../customer-satisfaction.gbapp/services/CSService.js';
 | 
					import { CSService } from '../../customer-satisfaction.gbapp/services/CSService.js';
 | 
				
			||||||
 | 
					import { ChatServices } from '../../llm.gblib/services/ChatServices.js';
 | 
				
			||||||
import { GuaribasAnswer, GuaribasQuestion, GuaribasSubject } from '../models/index.js';
 | 
					import { GuaribasAnswer, GuaribasQuestion, GuaribasSubject } from '../models/index.js';
 | 
				
			||||||
import { GBConfigService } from './../../core.gbapp/services/GBConfigService.js';
 | 
					import { GBConfigService } from './../../core.gbapp/services/GBConfigService.js';
 | 
				
			||||||
import { parse } from 'node-html-parser';
 | 
					 | 
				
			||||||
import textract from 'textract';
 | 
					 | 
				
			||||||
import pdf from 'pdf-extraction';
 | 
					 | 
				
			||||||
import { GBSSR } from '../../core.gbapp/services/GBSSR.js';
 | 
					 | 
				
			||||||
import { GBLogEx } from '../../core.gbapp/services/GBLogEx.js';
 | 
					 | 
				
			||||||
import mammoth from 'mammoth';
 | 
					 | 
				
			||||||
import { GBAdminService } from '../../admin.gbapp/services/GBAdminService.js';
 | 
					 | 
				
			||||||
import { GBVMService } from '../../basic.gblib/services/GBVMService.js';
 | 
					 | 
				
			||||||
import { DialogKeywords } from '../../basic.gblib/services/DialogKeywords.js';
 | 
					 | 
				
			||||||
import { GBMinService } from '../../core.gbapp/services/GBMinService.js';
 | 
					 | 
				
			||||||
import { ChatServices } from '../../llm.gblib/services/ChatServices.js';
 | 
					 | 
				
			||||||
import { GBUtil } from '../../../src/util.js';
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
/**
 | 
					/**
 | 
				
			||||||
 * Result for query on KB data.
 | 
					 * Result for query on KB data.
 | 
				
			||||||
| 
						 | 
					@ -690,7 +691,7 @@ export class KBService implements IGBKBService {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // Imports menu.xlsx if any.
 | 
					    // Imports menu.xlsx if any.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if (await GBUtil.exists(subjectFile) || await GBUtil.exists(menuFile)) {
 | 
					    if ((await GBUtil.exists(subjectFile)) || (await GBUtil.exists(menuFile))) {
 | 
				
			||||||
      await this.importSubjectFile(packageStorage.packageId, subjectFile, menuFile, instance);
 | 
					      await this.importSubjectFile(packageStorage.packageId, subjectFile, menuFile, instance);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -881,18 +882,16 @@ export class KBService implements IGBKBService {
 | 
				
			||||||
        return [];
 | 
					        return [];
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      await GBLogEx.info(min, `Processing URL: ${url}.`);
 | 
					      await GBLogEx.info(min, `Crawling: ${url}.`);
 | 
				
			||||||
      visited.add(url);
 | 
					      visited.add(url);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      const packagePath = GBUtil.getGBAIPath(min.botId, `gbot`);
 | 
					      const packagePath = GBUtil.getGBAIPath(min.botId, `gbot`);
 | 
				
			||||||
      const directoryPath = path.join(process.env.PWD, 'work', packagePath, 'Website');
 | 
					      const directoryPath = path.join(process.env.PWD, 'work', packagePath, 'Website');
 | 
				
			||||||
      const filename = await GBUtil.savePage(url, page, directoryPath);
 | 
					      const filename = await KBService.savePage(min, url, page, directoryPath);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      if (!filename) {
 | 
					      if (!filename) {
 | 
				
			||||||
 | 
					 | 
				
			||||||
        // If the URL doesn't represent an HTML/PDF page, skip crawling its links
 | 
					        // If the URL doesn't represent an HTML/PDF page, skip crawling its links
 | 
				
			||||||
        return [];
 | 
					        return [];
 | 
				
			||||||
 | 
					 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
      const currentDomain = new URL(page.url()).hostname;
 | 
					      const currentDomain = new URL(page.url()).hostname;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1085,14 +1084,21 @@ export class KBService implements IGBKBService {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      files.shift();
 | 
					      files.shift();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      GBLogEx.info(min, `Vectorizing ${files.length} file(s)...`);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      await CollectionUtil.asyncForEach(files, async file => {
 | 
					      await CollectionUtil.asyncForEach(files, async file => {
 | 
				
			||||||
        let content = null;
 | 
					        let content = null;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        try {
 | 
				
			||||||
          const document = await this.loadAndSplitFile(file);
 | 
					          const document = await this.loadAndSplitFile(file);
 | 
				
			||||||
          const flattenedDocuments = document.reduce((acc, val) => acc.concat(val), []);
 | 
					          const flattenedDocuments = document.reduce((acc, val) => acc.concat(val), []);
 | 
				
			||||||
          const vectorStore = min['vectorStore'];
 | 
					          const vectorStore = min['vectorStore'];
 | 
				
			||||||
          await vectorStore.addDocuments(flattenedDocuments);
 | 
					          await vectorStore.addDocuments(flattenedDocuments);
 | 
				
			||||||
          await vectorStore.save(min['vectorStorePath']);
 | 
					          await vectorStore.save(min['vectorStorePath']);
 | 
				
			||||||
 | 
					        } catch (error) {
 | 
				
			||||||
 | 
					          GBLogEx.info(min, `Ignore processing of ${file}. ${GBUtil.toYAML(error)}`);
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      });
 | 
					      });
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1420,4 +1426,73 @@ export class KBService implements IGBKBService {
 | 
				
			||||||
      });
 | 
					      });
 | 
				
			||||||
    });
 | 
					    });
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  public static async savePage(
 | 
				
			||||||
 | 
					    min: GBMinInstance,
 | 
				
			||||||
 | 
					    url: string,
 | 
				
			||||||
 | 
					    page: Page,
 | 
				
			||||||
 | 
					    directoryPath: string
 | 
				
			||||||
 | 
					  ): Promise<string | null> {
 | 
				
			||||||
 | 
					    try {
 | 
				
			||||||
 | 
					      
 | 
				
			||||||
 | 
					      // Check if the directory exists, create it if not.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      const directoryExists = await GBUtil.exists(directoryPath);
 | 
				
			||||||
 | 
					      if (!directoryExists) {
 | 
				
			||||||
 | 
					        await fs.mkdir(directoryPath, { recursive: true }); // Create directory if it doesn't exist
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      // Check if the URL is for a downloadable file (e.g., .pdf).
 | 
				
			||||||
 | 
					      
 | 
				
			||||||
 | 
					      if (
 | 
				
			||||||
 | 
					        url.endsWith('.pdf') ||
 | 
				
			||||||
 | 
					        url.endsWith('.docx') ||
 | 
				
			||||||
 | 
					        url.endsWith('.csv') ||
 | 
				
			||||||
 | 
					        url.endsWith('.epub') ||
 | 
				
			||||||
 | 
					        url.endsWith('.xml') ||
 | 
				
			||||||
 | 
					        url.endsWith('.json') ||
 | 
				
			||||||
 | 
					        url.endsWith('.txt')
 | 
				
			||||||
 | 
					      ) {
 | 
				
			||||||
 | 
					        const response = await fetch(url);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        if (!response.ok) {
 | 
				
			||||||
 | 
					          throw new Error('Failed to download the file');
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        const buffer = await response.arrayBuffer(); // Convert response to array buffer
 | 
				
			||||||
 | 
					        const fileName = path.basename(url); // Extract file name from URL
 | 
				
			||||||
 | 
					        const filePath = path.join(directoryPath, fileName); // Create file path
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        const data = new Uint8Array(buffer);
 | 
				
			||||||
 | 
					        await fs.writeFile(filePath, data);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        return filePath; // Return the saved file path
 | 
				
			||||||
 | 
					      } else {
 | 
				
			||||||
 | 
					        await page.goto(url, { waitUntil: 'networkidle2' });
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        const parsedUrl = new URL(url);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        // Get the last part of the URL path or default to 'index' if empty
 | 
				
			||||||
 | 
					        const pathParts = parsedUrl.pathname.split('/').filter(Boolean); // Remove empty parts
 | 
				
			||||||
 | 
					        const lastPath = pathParts.length > 0 ? pathParts[pathParts.length - 1] : 'index';
 | 
				
			||||||
 | 
					        const flatLastPath = lastPath.replace(/\W+/g, '-'); // Flatten the last part of the path
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        const fileName = `${flatLastPath}.html`;
 | 
				
			||||||
 | 
					        const filePath = path.join(directoryPath, fileName);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        const htmlContent = await page.content();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        // Convert HTML to Markdown using html2md
 | 
				
			||||||
 | 
					        const markdownContent = html2md(htmlContent);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        // Write Markdown content to file
 | 
				
			||||||
 | 
					        await fs.writeFile(filePath, markdownContent);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        return filePath;
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    } catch (error) {
 | 
				
			||||||
 | 
					      GBLogEx.info(min, `Cannot save: ${url}. ${GBUtil.toYAML(error)}`);
 | 
				
			||||||
 | 
					      return null;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										71
									
								
								src/util.ts
									
										
									
									
									
								
							
							
						
						
									
										71
									
								
								src/util.ts
									
										
									
									
									
								
							| 
						 | 
					@ -44,7 +44,6 @@ VerbosityLevel.WARNINGS=0;
 | 
				
			||||||
VerbosityLevel.INFOS=0;
 | 
					VerbosityLevel.INFOS=0;
 | 
				
			||||||
import { Page } from 'puppeteer';
 | 
					import { Page } from 'puppeteer';
 | 
				
			||||||
import urljoin from 'url-join';
 | 
					import urljoin from 'url-join';
 | 
				
			||||||
import html2md from 'html-to-md';
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
export class GBUtil {
 | 
					export class GBUtil {
 | 
				
			||||||
  public static repeat(chr, count) {
 | 
					  public static repeat(chr, count) {
 | 
				
			||||||
| 
						 | 
					@ -105,7 +104,13 @@ export class GBUtil {
 | 
				
			||||||
    };
 | 
					    };
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
    const extractedError = extractProps(data);
 | 
					    const extractedError = extractProps(data);
 | 
				
			||||||
    return YAML.stringify(extractedError);
 | 
					    
 | 
				
			||||||
 | 
					    // Inline formatting for logs
 | 
				
			||||||
 | 
					    return YAML.stringify(extractedError, {
 | 
				
			||||||
 | 
					      indent: 2,          // Defines the indentation
 | 
				
			||||||
 | 
					      flowLevel: -1,       // Forces inline formatting
 | 
				
			||||||
 | 
					      styles: { '!!null': 'canonical' }  // Optional: Customize null display
 | 
				
			||||||
 | 
					    } as any);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  public static sleep(ms) {
 | 
					  public static sleep(ms) {
 | 
				
			||||||
| 
						 | 
					@ -142,68 +147,6 @@ export class GBUtil {
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
 public static async savePage(url: string, page: Page, directoryPath: string): Promise<string | null> {
 | 
					 | 
				
			||||||
    try {
 | 
					 | 
				
			||||||
      // Check if the directory exists, create it if not
 | 
					 | 
				
			||||||
      const directoryExists = await this.fileExists(directoryPath);
 | 
					 | 
				
			||||||
      if (!directoryExists) {
 | 
					 | 
				
			||||||
        await fs.mkdir(directoryPath, { recursive: true }); // Create directory if it doesn't exist
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
      // Check if the URL is for a downloadable file (e.g., .pdf)
 | 
					 | 
				
			||||||
      if (url.endsWith('.pdf')) {
 | 
					 | 
				
			||||||
        const response = await fetch(url);
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
        if (!response.ok) {
 | 
					 | 
				
			||||||
          throw new Error('Failed to download the file');
 | 
					 | 
				
			||||||
        }
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
        const buffer = await response.arrayBuffer(); // Convert response to array buffer
 | 
					 | 
				
			||||||
        const fileName = path.basename(url); // Extract file name from URL
 | 
					 | 
				
			||||||
        const filePath = path.join(directoryPath, fileName); // Create file path
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
        const data = new Uint8Array(buffer);
 | 
					 | 
				
			||||||
        const text = await GBUtil.getPdfText(data);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        // Write the buffer to the file asynchronously
 | 
					 | 
				
			||||||
        await fs.writeFile(filePath, text);
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
        return filePath; // Return the saved file path
 | 
					 | 
				
			||||||
      } else {
 | 
					 | 
				
			||||||
        // Use Puppeteer for non-downloadable pages
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        const parsedUrl = new URL(url);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        // Get the last part of the URL path or default to 'index' if empty
 | 
					 | 
				
			||||||
        const pathParts = parsedUrl.pathname.split('/').filter(Boolean); // Remove empty parts
 | 
					 | 
				
			||||||
        const lastPath = pathParts.length > 0 ? pathParts[pathParts.length - 1] : 'index';
 | 
					 | 
				
			||||||
        const flatLastPath = lastPath.replace(/\W+/g, '-'); // Flatten the last part of the path
 | 
					 | 
				
			||||||
        
 | 
					 | 
				
			||||||
        const fileName = `${flatLastPath}.html`;
 | 
					 | 
				
			||||||
        const filePath = path.join(directoryPath, fileName);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        const htmlContent = await page.content();
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
        // Write HTML content asynchronously
 | 
					 | 
				
			||||||
        await fs.writeFile(filePath, htmlContent);
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
        return filePath;
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
    } catch (error) {
 | 
					 | 
				
			||||||
      console.error('Error saving page:', error);
 | 
					 | 
				
			||||||
      return null;
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  public static async  fileExists(filePath: string): Promise<boolean> {
 | 
					 | 
				
			||||||
    try {
 | 
					 | 
				
			||||||
      await fs.access(filePath);
 | 
					 | 
				
			||||||
      return true;
 | 
					 | 
				
			||||||
    } catch (error) {
 | 
					 | 
				
			||||||
      return false;
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  public static async copyIfNewerRecursive(src, dest) {
 | 
					  public static async copyIfNewerRecursive(src, dest) {
 | 
				
			||||||
    if (!await GBUtil.exists(src)) {
 | 
					    if (!await GBUtil.exists(src)) {
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
	Add table
		
		Reference in a new issue