new(kb.gbapp): New option for website depth during Vector Retrieval.
This commit is contained in:
parent
ee1fc0afc9
commit
66e3295f1f
2 changed files with 3 additions and 1 deletion
|
@ -136,6 +136,7 @@
|
||||||
"google-libphonenumber": "3.2.34",
|
"google-libphonenumber": "3.2.34",
|
||||||
"googleapis": "126.0.1",
|
"googleapis": "126.0.1",
|
||||||
"hnswlib-node": "3.0.0",
|
"hnswlib-node": "3.0.0",
|
||||||
|
"html-to-md": "^0.8.5",
|
||||||
"http-proxy": "1.18.1",
|
"http-proxy": "1.18.1",
|
||||||
"ibm-watson": "9.1.0",
|
"ibm-watson": "9.1.0",
|
||||||
"iso-639-1": "3.1.2",
|
"iso-639-1": "3.1.2",
|
||||||
|
|
|
@ -32,6 +32,7 @@
|
||||||
* @fileoverview Knowledge base services and logic.
|
* @fileoverview Knowledge base services and logic.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import html2md from 'html-to-md'
|
||||||
import Path from 'path';
|
import Path from 'path';
|
||||||
import Fs from 'fs';
|
import Fs from 'fs';
|
||||||
import urlJoin from 'url-join';
|
import urlJoin from 'url-join';
|
||||||
|
@ -861,7 +862,7 @@ export class KBService implements IGBKBService {
|
||||||
if (response && response.headers && response.status() === 200) {
|
if (response && response.headers && response.status() === 200) {
|
||||||
const contentType = response.headers()['content-type'];
|
const contentType = response.headers()['content-type'];
|
||||||
if (contentType && contentType.includes('text/html')) {
|
if (contentType && contentType.includes('text/html')) {
|
||||||
const buffer = await page.$eval('*', el => el['innerText']);
|
const buffer = html2md(await response.text());
|
||||||
const urlObj = new URL(url);
|
const urlObj = new URL(url);
|
||||||
const urlPath = urlObj.pathname.endsWith('/') ? urlObj.pathname.slice(0, -1) : urlObj.pathname; // Remove trailing slash if present
|
const urlPath = urlObj.pathname.endsWith('/') ? urlObj.pathname.slice(0, -1) : urlObj.pathname; // Remove trailing slash if present
|
||||||
let filename = urlPath.split('/').pop() || 'index'; // Get the filename from the URL path or set it to 'index.html' as default
|
let filename = urlPath.split('/').pop() || 'index'; // Get the filename from the URL path or set it to 'index.html' as default
|
||||||
|
|
Loading…
Add table
Reference in a new issue