new(all): Website indexing.

This commit is contained in:
Rodrigo Rodriguez 2024-05-17 19:19:58 -03:00
parent c620473fbf
commit d73bf50a2d
5 changed files with 499 additions and 411 deletions

View file

@ -133,8 +133,10 @@
"express": "4.18.2",
"express-remove-route": "1.0.0",
"ffmpeg-static": "5.1.0",
"get-image-colors": "^4.0.1",
"google-libphonenumber": "3.2.31",
"googleapis": "126.0.1",
"hnswlib-node": "^1.4.2",
"http-proxy": "1.18.1",
"ibm-watson": "7.1.2",
"iso-639-1": "3.1.1",

View file

@ -2055,7 +2055,9 @@ export class SystemKeywords {
return (orientation || 0) >= 5 ? [height, width] : [width, height];
};
const size = getNormalSize(await sharp(buf).metadata());
const metadata = await sharp(buf).metadata();
const size = getNormalSize({width:metadata['width'],
height:metadata['height'], orientation: metadata['orientation'] });
url = urlJoin(GBServer.globals.publicAddress, min.botId, 'cache', Path.basename(imageName));
images[index++] = { url: url, size: size, buf: buf };
}

View file

@ -169,36 +169,20 @@ export class GBMinService {
let i = 1;
if (instances.length > 1) {
this.bar1 = new cliProgress.SingleBar(
{
format: '[{bar}] ({value}/{total}) Loading {botId} ...',
barsize: 40,
forceRedraw: true
},
cliProgress.Presets.rect
);
this.bar1.start(instances.length, i, { botId: 'Boot' });
}
await CollectionUtil.asyncForEach(
instances,
(async instance => {
try {
GBLog.info(`Mounting ${instance.botId}...`)
await this['mountBot'](instance);
} catch (error) {
GBLog.error(`Error mounting bot ${instance.botId}: ${error.message}\n${error.stack}`);
} finally {
if (this.bar1) {
this.bar1.update(i++, { botId: instance.botId });
}
}
}).bind(this)
);
if (this.bar1) {
this.bar1.stop();
}
// Loads API.
await this.ensureAPI();

View file

@ -31,14 +31,19 @@
'use strict';
import { HNSWLib } from '@langchain/community/vectorstores/hnswlib';
import { StringOutputParser } from "@langchain/core/output_parsers";
import { AIMessagePromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate, MessagesPlaceholder } from '@langchain/core/prompts';
import { RunnableSequence } from "@langchain/core/runnables";
import { convertToOpenAITool } from "@langchain/core/utils/function_calling";
import { ChatOpenAI } from "@langchain/openai";
import { StringOutputParser } from '@langchain/core/output_parsers';
import {
AIMessagePromptTemplate,
ChatPromptTemplate,
HumanMessagePromptTemplate,
MessagesPlaceholder
} from '@langchain/core/prompts';
import { RunnableSequence } from '@langchain/core/runnables';
import { convertToOpenAITool } from '@langchain/core/utils/function_calling';
import { ChatOpenAI } from '@langchain/openai';
import { GBLog, GBMinInstance } from 'botlib';
import * as Fs from 'fs';
import { jsonSchemaToZod } from "json-schema-to-zod";
import { jsonSchemaToZod } from 'json-schema-to-zod';
import { BufferWindowMemory } from 'langchain/memory';
import Path from 'path';
import { CollectionUtil } from 'pragmatismo-io-framework';
@ -46,35 +51,28 @@ import { DialogKeywords } from '../../basic.gblib/services/DialogKeywords.js';
import { GBVMService } from '../../basic.gblib/services/GBVMService.js';
import { GBConfigService } from '../../core.gbapp/services/GBConfigService.js';
import { GuaribasSubject } from '../../kb.gbapp/models/index.js';
import { Serialized } from "@langchain/core/load/serializable";
import { BaseCallbackHandler } from "@langchain/core/callbacks/base";
import { Serialized } from '@langchain/core/load/serializable';
import { BaseCallbackHandler } from '@langchain/core/callbacks/base';
import { pdfToPng, PngPageOutput } from 'pdf-to-png-converter';
import { DynamicStructuredTool } from "@langchain/core/tools";
import { WikipediaQueryRun } from "@langchain/community/tools/wikipedia_query_run";
import {
BaseLLMOutputParser,
OutputParserException,
} from "@langchain/core/output_parsers";
import { ChatGeneration, Generation } from "@langchain/core/outputs";
import { DynamicStructuredTool } from '@langchain/core/tools';
import { WikipediaQueryRun } from '@langchain/community/tools/wikipedia_query_run';
import { BaseLLMOutputParser, OutputParserException } from '@langchain/core/output_parsers';
import { ChatGeneration, Generation } from '@langchain/core/outputs';
import { GBAdminService } from '../../admin.gbapp/services/GBAdminService.js';
import { GBServer } from '../../../src/app.js';
import urlJoin from 'url-join';
import { getDocument } from "pdfjs-dist/legacy/build/pdf.mjs";
import { getDocument } from 'pdfjs-dist/legacy/build/pdf.mjs';
import { GBLogEx } from '../../core.gbapp/services/GBLogEx.js';
export interface CustomOutputParserFields {}
export type ExpectedOutput = any;
function isChatGeneration(
llmOutput: ChatGeneration | Generation
): llmOutput is ChatGeneration {
return "message" in llmOutput;
function isChatGeneration(llmOutput: ChatGeneration | Generation): llmOutput is ChatGeneration {
return 'message' in llmOutput;
}
class CustomHandler extends BaseCallbackHandler {
name = "custom_handler";
name = 'custom_handler';
handleLLMNewToken(token: string) {
GBLogEx.info(0, `LLM: token: ${JSON.stringify(token)}`);
@ -95,11 +93,10 @@ class CustomHandler extends BaseCallbackHandler {
const logHandler = new CustomHandler();
export class GBLLMOutputParser extends
BaseLLMOutputParser<ExpectedOutput> {
lc_namespace = ["langchain", "output_parsers"];
export class GBLLMOutputParser extends BaseLLMOutputParser<ExpectedOutput> {
lc_namespace = ['langchain', 'output_parsers'];
private toolChain: RunnableSequence
private toolChain: RunnableSequence;
private min;
constructor(min, toolChain: RunnableSequence, documentChain: RunnableSequence) {
@ -108,14 +105,9 @@ export class GBLLMOutputParser extends
this.toolChain = toolChain;
}
async parseResult(
llmOutputs: ChatGeneration[] | Generation[]
): Promise<ExpectedOutput> {
async parseResult(llmOutputs: ChatGeneration[] | Generation[]): Promise<ExpectedOutput> {
if (!llmOutputs.length) {
throw new OutputParserException(
"Output parser did not receive any generations."
);
throw new OutputParserException('Output parser did not receive any generations.');
}
let result;
@ -140,10 +132,9 @@ export class GBLLMOutputParser extends
let { sources, text } = res;
await CollectionUtil.asyncForEach(sources, async (source) => {
await CollectionUtil.asyncForEach(sources, async source => {
let found = false;
if (source) {
if (source && source.file.endsWith('.pdf')) {
const gbaiName = DialogKeywords.getGBAIPath(this.min.botId, 'gbkb');
const localName = Path.join(process.env.PWD, 'work', gbaiName, 'docs', source.file);
@ -166,9 +157,7 @@ export class GBLLMOutputParser extends
}
export class ChatServices {
public static async pdfPageAsImage(min, filename, pageNumber) {
// Converts the PDF to PNG.
GBLogEx.info(min, `Converting ${filename}, page: ${pageNumber}...`);
@ -181,7 +170,6 @@ export class ChatServices {
verbosityLevel: 0
});
// Prepare an image on cache and return the GBFILE information.
if (pngPages.length > 0) {
@ -199,7 +187,6 @@ export class ChatServices {
sanitizedQuestion: string,
numDocuments: number = 100
): Promise<string> {
if (sanitizedQuestion === '') {
return '';
}
@ -219,10 +206,12 @@ export class ChatServices {
const doc = uniqueDocuments[filePaths];
const metadata = doc.metadata;
const filename = Path.basename(metadata.source);
const page = await ChatServices.findPageForText(metadata.source,
doc.pageContent);
let page = 0;
if (metadata.source.endsWith('.pdf')) {
page = await ChatServices.findPageForText(metadata.source, doc.pageContent);
}
output = `${output}\n\n\n\nUse also the following context which is coming from Source Document: ${filename} at page: ${page}
output = `${output}\n\n\n\nUse also the following context which is coming from Source Document: ${filename} at page: ${page?page:'entire document'}
(you will fill the JSON sources collection field later),
memorize this block among document information and return when you are refering this part of content:\n\n\n\n ${doc.pageContent} \n\n\n\n.`;
}
@ -233,12 +222,15 @@ export class ChatServices {
const data = new Uint8Array(Fs.readFileSync(pdfPath));
const pdf = await getDocument({ data }).promise;
searchText = searchText.replace(/\s/g, '')
searchText = searchText.replace(/\s/g, '');
for (let i = 1; i <= pdf.numPages; i++) {
const page = await pdf.getPage(i);
const textContent = await page.getTextContent();
const text = textContent.items.map(item => item['str']).join('').replace(/\s/g, '');
const text = textContent.items
.map(item => item['str'])
.join('')
.replace(/\s/g, '');
if (text.includes(searchText)) return i;
}
@ -254,27 +246,24 @@ export class ChatServices {
* result = CONTINUE text
*
*/
public static async continue(min: GBMinInstance, question: string, chatId) {
}
public static async continue(min: GBMinInstance, question: string, chatId) {}
private static memoryMap = {};
public static userSystemPrompt = {};
public static async answerByGPT(min: GBMinInstance, user, pid,
public static async answerByGPT(
min: GBMinInstance,
user,
pid,
question: string,
searchScore: number,
subjects: GuaribasSubject[]
) {
if (!process.env.OPENAI_API_KEY) {
return { answer: undefined, questionId: 0 };
}
const LLMMode = min.core.getParam(
min.instance,
'Answer Mode', 'direct'
);
const LLMMode = min.core.getParam(min.instance, 'Answer Mode', 'direct');
const docsContext = min['vectorStore'];
@ -283,20 +272,19 @@ export class ChatServices {
returnMessages: true,
memoryKey: 'chat_history',
inputKey: 'input',
k: 2,
})
k: 2
});
}
const memory = this.memoryMap[user.userSystemId];
const systemPrompt = this.userSystemPrompt[user.userSystemId];
const model = new ChatOpenAI({
openAIApiKey: process.env.OPENAI_API_KEY,
modelName: "gpt-3.5-turbo-0125",
modelName: 'gpt-3.5-turbo-0125',
temperature: 0,
callbacks: [logHandler],
callbacks: [logHandler]
});
let tools = await ChatServices.getTools(min);
let toolsAsText = ChatServices.getToolsAsText(tools);
@ -316,9 +304,9 @@ export class ChatServices {
Do not use any previous tools output in the chat_history.
`
),
new MessagesPlaceholder("chat_history"),
new MessagesPlaceholder('chat_history'),
AIMessagePromptTemplate.fromTemplate(`Follow Up Input: {question}
Standalone question:`),
Standalone question:`)
]);
const toolsResultPrompt = ChatPromptTemplate.fromMessages([
@ -327,9 +315,9 @@ export class ChatServices {
rephrase the answer to the user using this tool output.
`
),
new MessagesPlaceholder("chat_history"),
new MessagesPlaceholder('chat_history'),
AIMessagePromptTemplate.fromTemplate(`Tool output: {tool_output}
Standalone question:`),
Standalone question:`)
]);
const combineDocumentsPrompt = ChatPromptTemplate.fromMessages([
@ -355,14 +343,13 @@ export class ChatServices {
Double check if the output is a valid JSON with brackets. all fields are required: text, file, page.
`
),
new MessagesPlaceholder("chat_history"),
HumanMessagePromptTemplate.fromTemplate("Question: {question}"),
new MessagesPlaceholder('chat_history'),
HumanMessagePromptTemplate.fromTemplate('Question: {question}')
]);
const callToolChain = RunnableSequence.from([
{
tool_output: async (output: object) => {
const name = output['func'][0].function.name;
const args = JSON.parse(output['func'][0].function.arguments);
GBLogEx.info(min, `Running .gbdialog '${name}' as GPT tool...`);
@ -373,8 +360,7 @@ export class ChatServices {
chat_history: async () => {
const { chat_history } = await memory.loadMemoryVariables({});
return chat_history;
},
}
},
toolsResultPrompt,
model,
@ -391,8 +377,7 @@ export class ChatServices {
context: async (output: string) => {
const c = await ChatServices.getRelevantContext(docsContext, output);
return `${systemPrompt} \n ${c ? 'Use this context to answer:\n' + c : 'answer just with user question.'}`;
},
}
},
combineDocumentsPrompt,
model,
@ -405,7 +390,7 @@ export class ChatServices {
chat_history: async () => {
const { chat_history } = await memory.loadMemoryVariables({});
return chat_history;
},
}
},
questionGeneratorTemplate,
modelWithTools,
@ -416,45 +401,36 @@ export class ChatServices {
let result, sources;
let text, file, page;
// Choose the operation mode of answer generation, based on
// .gbot switch LLMMode and choose the corresponding chain.
if (LLMMode === "direct") {
if (LLMMode === 'direct') {
result = await (tools.length > 0 ? modelWithTools : model).invoke(`
${systemPrompt}
${question}`);
result = result.content;
}
else if (LLMMode === "document") {
} else if (LLMMode === 'document') {
const res = await combineDocumentsChain.invoke(question);
result = res.text;
sources = res.sources;
} else if (LLMMode === "function") {
} else if (LLMMode === 'function') {
result = await conversationalToolChain.invoke({
question,
question
});
}
else if (LLMMode === "full") {
} else if (LLMMode === 'full') {
throw new Error('Not implemented.'); // TODO: #407.
}
else {
} else {
GBLogEx.info(min, `Invalid Answer Mode in Config.xlsx: ${LLMMode}.`);
}
await memory.saveContext(
{
input: question,
input: question
},
{
output: result.replace(/\!\[.*\)/gi, '') // Removes .MD url beforing adding to history.
output: result?result.replace(/\!\[.*\)/gi, ''): 'no answer' // Removes .MD url beforing adding to history.
}
);
@ -464,40 +440,34 @@ export class ChatServices {
private static getToolsAsText(tools) {
return Object.keys(tools)
.map((toolname) => `- ${tools[toolname].name}: ${tools[toolname].description}`)
.join("\n");
.map(toolname => `- ${tools[toolname].name}: ${tools[toolname].description}`)
.join('\n');
}
private static async getTools(min: GBMinInstance) {
let functions = [];
// Adds .gbdialog as functions if any to GPT Functions.
await CollectionUtil.asyncForEach(Object.keys(min.scriptMap), async (script) => {
const path = DialogKeywords.getGBAIPath(min.botId, "gbdialog", null);
await CollectionUtil.asyncForEach(Object.keys(min.scriptMap), async script => {
const path = DialogKeywords.getGBAIPath(min.botId, 'gbdialog', null);
const jsonFile = Path.join('work', path, `${script}.json`);
if (Fs.existsSync(jsonFile) && script.toLowerCase() !== 'start.vbs') {
const funcJSON = JSON.parse(Fs.readFileSync(jsonFile, 'utf8'));
const funcObj = funcJSON?.function;
if (funcObj) {
// TODO: Use ajv.
funcObj.schema = eval(jsonSchemaToZod(funcObj.parameters));
functions.push(new DynamicStructuredTool(funcObj));
}
}
});
if (process.env.WIKIPEDIA_TOOL) {
const tool = new WikipediaQueryRun({
topKResults: 3,
maxDocContentLength: 4000,
maxDocContentLength: 4000
});
functions.push(tool);
}

View file

@ -48,8 +48,11 @@ import { DocxLoader } from 'langchain/document_loaders/fs/docx';
import { EPubLoader } from 'langchain/document_loaders/fs/epub';
import { CSVLoader } from 'langchain/document_loaders/fs/csv';
import path from 'path';
import puppeteer, { Page } from 'puppeteer';
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { Document } from 'langchain/document';
import getColors from 'get-image-colors';
import {
GBDialogStep,
@ -81,7 +84,6 @@ import { DialogKeywords } from '../../basic.gblib/services/DialogKeywords.js';
import { GBMinService } from '../../core.gbapp/services/GBMinService.js';
import { ChatServices } from '../../gpt.gblib/services/ChatServices.js';
/**
* Result for query on KB data.
*/
@ -294,11 +296,7 @@ export class KBService implements IGBKBService {
GBConfigService.get('DEFAULT_CONTENT_LANGUAGE')
);
query = await min.conversationalService.translate(
min,
query,
contentLocale
);
query = await min.conversationalService.translate(min, query, contentLocale);
GBLogEx.info(min, `Translated query (prompt): ${query}.`);
@ -317,10 +315,8 @@ export class KBService implements IGBKBService {
}
}
let returnedScore = 0;
const key = instance.searchKey ? instance.searchKey :
GBServer.globals.minBoot.instance.searchKey;
const host = instance.searchHost ? instance.searchHost :
GBServer.globals.minBoot.instance.searchHost;
const key = instance.searchKey ? instance.searchKey : GBServer.globals.minBoot.instance.searchKey;
const host = instance.searchHost ? instance.searchHost : GBServer.globals.minBoot.instance.searchHost;
// No direct match found, so Search is used.
@ -348,8 +344,6 @@ export class KBService implements IGBKBService {
top: 1
});
// Searches via Search (Azure Search).
let found = false;
@ -359,11 +353,15 @@ export class KBService implements IGBKBService {
if (returnedScore >= searchScore) {
const value = await this.getAnswerById(instance.instanceId, result.document.answerId);
if (value !== null) {
GBLogEx.info(min, `SEARCH WILL BE USED with score: ${returnedScore} > required (searchScore): ${searchScore}`);
GBLogEx.info(
min,
`SEARCH WILL BE USED with score: ${returnedScore} > required (searchScore): ${searchScore}`
);
return { answer: value, questionId: result.document.questionId };
} else {
GBLogEx.info(min,
GBLogEx.info(
min,
`Index problem. SEARCH WILL NOT be used as answerId ${result.document.answerId} was not found in database,
returnedScore: ${returnedScore} < required (searchScore): ${searchScore}`
);
@ -373,17 +371,13 @@ export class KBService implements IGBKBService {
}
}
}
GBLogEx.info(min,
GBLogEx.info(
min,
`SEARCH returned LOW level score, calling NLP if any,
returnedScore: ${returnedScore} < required (searchScore): ${searchScore}`
);
return await ChatServices.answerByGPT(min, user, pid,
query,
searchScore,
subjects
);
return await ChatServices.answerByGPT(min, user, pid, query, searchScore, subjects);
}
public async getSubjectItems(instanceId: number, parentId: number): Promise<GuaribasSubject[]> {
@ -626,7 +620,7 @@ export class KBService implements IGBKBService {
}
public async sendAnswer(min: GBMinInstance, channel: string, step: GBDialogStep, answer) {
answer = typeof (answer) === 'string' ? answer : answer.content;
answer = typeof answer === 'string' ? answer : answer.content;
if (answer.endsWith('.mp4')) {
await this.playVideo(min, min.conversationalService, step, answer, channel);
} else if (
@ -646,14 +640,11 @@ export class KBService implements IGBKBService {
const url = urlJoin('kb', path, 'assets', answer);
await this.playUrl(min, min.conversationalService, step, url, channel);
} else if (answer.format === '.md') {
await min.conversationalService['playMarkdown'](min, answer, channel, step,
GBMinService.userMobile(step));
await min.conversationalService['playMarkdown'](min, answer, channel, step, GBMinService.userMobile(step));
} else if (answer.endsWith('.ogg') && process.env.AUDIO_DISABLED !== 'true') {
await this.playAudio(min, answer, channel, step, min.conversationalService);
} else {
await min.conversationalService.sendText(min, step, answer);
}
}
@ -685,7 +676,6 @@ export class KBService implements IGBKBService {
const a = await GuaribasAnswer.create(answer);
question['answerId'] = a.answerId;
const q = await GuaribasQuestion.create(question);
}
public async importKbPackage(
@ -758,10 +748,6 @@ export class KBService implements IGBKBService {
};
data.answers.push(answer);
} else if (file !== null && file.name.endsWith('.toc.docx')) {
const path = DialogKeywords.getGBAIPath(instance.botId, `gbkb`);
const localName = Path.join('work', path, 'articles', file.name);
@ -866,6 +852,103 @@ export class KBService implements IGBKBService {
});
}
/**
 * Navigates `page` to `url` and, when the server answers with status 200 and
 * an HTML content type, writes the response body into the bot's `Website`
 * work folder.
 *
 * The file is named after the last segment of the URL path (`index` when the
 * path has no segment) followed by `.html`, so two URLs that share the same
 * last segment are written to the same file.
 *
 * @param min Bot instance; `min.botId` selects the work folder and `min` is passed to the logger.
 * @param url Absolute URL of the page to download.
 * @param page Puppeteer page used for the navigation; it stays on `url` afterwards.
 * @returns Full path of the saved file, or `null` when nothing was saved
 * (no main-resource response, non-200 status or non-HTML content).
 */
async saveHtmlPage(min, url: string, page: Page): Promise<string | null> {
  const response = await page.goto(url);

  // `goto` can resolve to null; `response.headers` is a method and therefore
  // always truthy, so it cannot be used as the presence check.
  if (!response || response.status() !== 200) {
    return null;
  }

  const contentType = response.headers()['content-type'];
  if (!contentType || !contentType.includes('text/html')) {
    return null;
  }

  const buffer = await response.buffer();

  // Drop a trailing slash so '/docs/' and '/docs' produce the same name.
  const { pathname } = new URL(url);
  const urlPath = pathname.endsWith('/') ? pathname.slice(0, -1) : pathname;

  // Last path segment, defaulting to 'index'; the '.html' suffix is always appended.
  const baseName = urlPath.split('/').pop() || 'index';
  const filename = `${baseName}.html`;

  const gbaiPath = DialogKeywords.getGBAIPath(min.botId, `gbot`);
  const directoryPath = Path.join(process.env.PWD, 'work', gbaiPath, 'Website');
  const filePath = Path.join(directoryPath, filename);

  GBLogEx.info(min, `[GBDeployer] Saving Website file in ${filePath}.`);

  // Create the target directory (and any missing parents) before writing.
  Fs.mkdirSync(directoryPath, { recursive: true });
  Fs.writeFileSync(filePath, buffer);

  return filePath;
}
/**
 * Saves `url` through `saveHtmlPage` and then recursively crawls the links on
 * that page whose hostname equals the hostname of the loaded page.
 *
 * A URL is skipped (yielding an empty list) when `depth` exceeds `maxDepth`,
 * when it is already in `visited`, or when it ends with one of the skipped
 * media/document extensions. Fragments (`#...`) are stripped from collected
 * links before they are followed.
 *
 * @param min Bot instance, forwarded to `saveHtmlPage` and to the logger.
 * @param url Absolute URL to process.
 * @param visited URLs already processed; this method adds `url` to it.
 * @param depth Depth of `url` relative to the crawl root (root is 0).
 * @param maxDepth Deepest level that is still processed.
 * @param page Puppeteer page shared by the whole crawl; each call navigates it.
 * @returns Paths of the files saved for `url` and its descendants, `url`'s own
 * file first. Always an array: failures are logged and yield `[]`.
 */
async crawl(min, url: string, visited: Set<string>, depth: number, maxDepth: number, page: Page): Promise<string[]> {
  // Resources with these suffixes are not fetched at all.
  const skippedExtensions = ['.jpg', '.pdf', '.png', '.mp4'];

  try {
    if (depth > maxDepth || visited.has(url) || skippedExtensions.some(ext => url.endsWith(ext))) {
      return [];
    }

    await GBLogEx.info(min, `Processing URL: ${url}.`);
    visited.add(url);

    const filename = await this.saveHtmlPage(min, url, page);
    if (!filename) {
      // If the URL doesn't represent an HTML page, skip crawling its links.
      return [];
    }

    // Use the URL the page actually ended on as the reference host.
    const currentDomain = new URL(page.url()).hostname;

    // This callback runs inside the browser page, so it must be self-contained.
    const pageLinks = await page.evaluate(domain => {
      const sameHostAnchors = Array.from(document.querySelectorAll('a')).filter(anchor => {
        try {
          return domain === new URL(anchor.href).hostname;
        } catch (err) {
          return false;
        }
      });
      return sameHostAnchors.map(anchor => anchor.href.replace(/#.*/, ''));
    }, currentDomain);

    // Keep only well-formed URLs that have not been processed yet.
    const pendingLinks = (Array.isArray(pageLinks) ? pageLinks : []).filter(link => {
      try {
        new URL(link);
        return !visited.has(link);
      } catch (error) {
        return false;
      }
    });

    // Sequential on purpose: every recursive call navigates the same `page`.
    const childFiles: string[] = [];
    for (const link of pendingLinks) {
      const saved = await this.crawl(min, link, visited, depth + 1, maxDepth, page);
      childFiles.push(...saved);
    }

    return [filename, ...childFiles];
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    await GBLogEx.info(min, `Error crawling ${url}: ${reason}`);

    // Honour the declared return type so callers can concat/spread the result.
    return [];
  }
}
/**
* Import all .docx files in reading comprehension folder.
*/
@ -875,11 +958,52 @@ export class KBService implements IGBKBService {
instance: IGBInstance,
packageId: number
): Promise<any> {
const files = await walkPromise(urlJoin(localPath, 'docs'));
let files = [];
const website = min.core.getParam<string>(min.instance, 'Website', null);
if (website) {
const browser = await puppeteer.launch({ headless: false });
const page = await browser.newPage();
const response = await page.goto(website);
await page.screenshot({ path: 'screenshot.png' });
// Extract dominant colors from the screenshot
const colors = await getColors('screenshot.png');
// Assuming you want the two most dominant colors
const mainColor1 = colors[0].hex();
const mainColor2 = colors[1].hex();
console.log('Main Color 1:', mainColor1);
console.log('Main Color 2:', mainColor2);
const maxDepth = 1; // Maximum depth of recursion
const visited = new Set<string>();
files = files.concat(await this.crawl(min, website, visited, 0, maxDepth, page));
await browser.close();
files.shift();
await CollectionUtil.asyncForEach(files, async file => {
let content = null;
const document = await this.loadAndSplitFile(file);
const flattenedDocuments = document.reduce((acc, val) => acc.concat(val), []);
const vectorStore = min['vectorStore'];
await vectorStore.addDocuments(flattenedDocuments);
await vectorStore.save(min['vectorStorePath']);
});
}
files = await walkPromise(urlJoin(localPath, 'docs'));
if (!files[0]) {
GBLogEx.info(min,
`[GBDeployer] docs folder not created yet in .gbkb. To use Reading Comprehension, create this folder at root and put a document to get read by the.`
);
GBLogEx.info(min, `[GBDeployer] docs folder not created yet in .gbkb neither a website in .gbot.`);
} else {
await CollectionUtil.asyncForEach(files, async file => {
let content = null;
@ -896,15 +1020,14 @@ export class KBService implements IGBKBService {
defaultRecursiveCharacterTextSplitter = new RecursiveCharacterTextSplitter({
chunkSize: 700,
chunkOverlap: 50,
chunkOverlap: 50
});
markdownRecursiveCharacterTextSplitter = RecursiveCharacterTextSplitter.fromLanguage('markdown', {
chunkSize: 700,
chunkOverlap: 50,
chunkOverlap: 50
});
private async loadAndSplitFile(filePath: string): Promise<Document<Record<string, unknown>>[]> {
const fileExtension = path.extname(filePath);
let loader;
@ -918,6 +1041,14 @@ export class KBService implements IGBKBService {
loader = new TextLoader(filePath);
documents = await loader.loadAndSplit(this.defaultRecursiveCharacterTextSplitter);
break;
case '.txt':
loader = new TextLoader(filePath);
documents = await loader.loadAndSplit(this.defaultRecursiveCharacterTextSplitter);
break;
case '.html':
loader = new TextLoader(filePath);
documents = await loader.loadAndSplit(this.defaultRecursiveCharacterTextSplitter);
break;
case '.md':
loader = new TextLoader(filePath);
documents = await loader.loadAndSplit(this.markdownRecursiveCharacterTextSplitter);
@ -944,7 +1075,6 @@ export class KBService implements IGBKBService {
return documents;
}
public async importKbTabularDirectory(localPath: string, min: GBMinInstance, packageId: number): Promise<any> {
const files = await walkPromise(localPath);