From 66e3295f1f9aa43a7ffc327f8e18399e225787ec Mon Sep 17 00:00:00 2001 From: Rodrigo Rodriguez Date: Wed, 26 Jun 2024 21:41:32 -0300 Subject: [PATCH] new(kb.gbapp): New option for website depth during Vector Retrieval. --- package.json | 1 + packages/kb.gbapp/services/KBService.ts | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/package.json b/package.json index aed42f48..1a503395 100644 --- a/package.json +++ b/package.json @@ -136,6 +136,7 @@ "google-libphonenumber": "3.2.34", "googleapis": "126.0.1", "hnswlib-node": "3.0.0", + "html-to-md": "^0.8.5", "http-proxy": "1.18.1", "ibm-watson": "9.1.0", "iso-639-1": "3.1.2", diff --git a/packages/kb.gbapp/services/KBService.ts b/packages/kb.gbapp/services/KBService.ts index 985772de..d1c37766 100644 --- a/packages/kb.gbapp/services/KBService.ts +++ b/packages/kb.gbapp/services/KBService.ts @@ -32,6 +32,7 @@ * @fileoverview Knowledge base services and logic. */ +import html2md from 'html-to-md' import Path from 'path'; import Fs from 'fs'; import urlJoin from 'url-join'; @@ -861,7 +862,7 @@ export class KBService implements IGBKBService { if (response && response.headers && response.status() === 200) { const contentType = response.headers()['content-type']; if (contentType && contentType.includes('text/html')) { - const buffer = await page.$eval('*', el => el['innerText']); + const buffer = html2md(await response.text()); const urlObj = new URL(url); const urlPath = urlObj.pathname.endsWith('/') ? urlObj.pathname.slice(0, -1) : urlObj.pathname; // Remove trailing slash if present let filename = urlPath.split('/').pop() || 'index'; // Get the filename from the URL path or set it to 'index.html' as default