new(basic.gblib): GPT replacing ALLEN NLP Reading Comp.

Rodrigo Rodriguez 2024-01-29 21:04:53 -03:00
parent 4d8061db60
commit ff6adacf9b
4 changed files with 375 additions and 259 deletions


@@ -309,6 +309,21 @@ export class GBDeployer implements IGBDeployer {
     return await this.core.saveInstance(instance);
   }

+  public async loadOrCreateEmptyVectorStore(min: GBMinInstance): Promise<HNSWLib> {
+    let vectorStore: HNSWLib;
+    try {
+      vectorStore = await HNSWLib.load(min['vectorStorePath'], new OpenAIEmbeddings({ maxConcurrency: 5 }));
+    } catch {
+      vectorStore = new HNSWLib(new OpenAIEmbeddings({ maxConcurrency: 5 }), {
+        space: 'cosine',
+        numDimensions: 1536,
+      });
+    }
+    return vectorStore;
+  }
+
   /**
    * Performs the NLP publishing process on remote service.
    */
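A note on the load-or-create pattern above: langchain's HNSWLib.load rejects when nothing has been saved at the target directory yet, and the catch branch turns that failure into a fresh, empty store. A minimal standalone sketch, assuming the same langchain imports this commit uses elsewhere (the helper name is hypothetical; 1536 is the dimensionality of OpenAI's text-embedding-ada-002 vectors):

    import { HNSWLib } from 'langchain/vectorstores/hnswlib';
    import { OpenAIEmbeddings } from 'langchain/embeddings/openai';

    // Hypothetical helper mirroring loadOrCreateEmptyVectorStore above.
    async function openStore(storePath: string): Promise<HNSWLib> {
      const embeddings = new OpenAIEmbeddings({ maxConcurrency: 5 });
      try {
        // Reads the index persisted by a previous vectorStore.save(storePath).
        return await HNSWLib.load(storePath, embeddings);
      } catch {
        // Nothing saved yet: start with an empty cosine index.
        return new HNSWLib(embeddings, { space: 'cosine', numDimensions: 1536 });
      }
    }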


@@ -312,6 +312,10 @@ export class GBMinService {
     if (!Fs.existsSync(dir)) {
       mkdirp.sync(dir);
     }
+    dir = `work/${gbai}/${botId}.gbkb/docs-vectorized`;
+    if (!Fs.existsSync(dir)) {
+      mkdirp.sync(dir);
+    }
     dir = `work/${gbai}/${botId}.gbdialog`;
     if (!Fs.existsSync(dir)) {
       mkdirp.sync(dir);
@@ -746,6 +750,9 @@ export class GBMinService {
     min.sandBoxMap = {};
     min['scheduleMap'] = {};
     min['conversationWelcomed'] = {};
+    const gbkbPath = DialogKeywords.getGBAIPath(min.botId, 'gbkb');
+    min['vectorStorePath'] = Path.join('work', gbkbPath, 'docs-vectorized');
+    min['vectorStore'] = await this.deployer.loadOrCreateEmptyVectorStore(min);
     min.packages = sysPackages;
     // NLP Manager.
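Note that vectorStorePath must be assigned before loadOrCreateEmptyVectorStore reads it, hence the ordering above. For reference, a sketch of what the docs-vectorized folder created earlier ends up holding once the store is saved; the path and text here are illustrative, but the three file names are what langchain's HNSWLib writes and later reads back in load:

    import { HNSWLib } from 'langchain/vectorstores/hnswlib';
    import { OpenAIEmbeddings } from 'langchain/embeddings/openai';

    async function demo(): Promise<void> {
      const store = await HNSWLib.fromTexts(
        ['hello world'],               // illustrative content
        [{ source: 'demo' }],          // metadata per text
        new OpenAIEmbeddings({ maxConcurrency: 5 })
      );
      // Writes hnswlib.index (the HNSW graph), docstore.json (texts plus
      // metadata) and args.json (space, dimensions) into the directory.
      await store.save('work/mybot.gbai/mybot.gbkb/docs-vectorized');
    }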


@@ -31,7 +31,7 @@
 'use strict';
 import { GBMinInstance } from 'botlib';
-//import OpenAI from "openai";
+import OpenAI from "openai";
 import { ChatGPTAPIBrowser, getOpenAIAuth } from 'chatgpt'
 import { CollectionUtil } from 'pragmatismo-io-framework';
 import { DialogKeywords } from '../../basic.gblib/services/DialogKeywords.js';
@@ -69,15 +69,15 @@ export class ChatServices {
     // Calls Model.
-    // const openai = new OpenAI({
-    //   apiKey: key
-    // });
-    // const chatCompletion = await openai.chat.completions.create({
-    //   model: "gpt-3.5-turbo",
-    //   messages: [{ role: "user", content: text }],
-    //   functions: functions
-    // });
-    // return chatCompletion.choices[0].message.content;
+    const openai = new OpenAI({
+      apiKey: key
+    });
+    const chatCompletion = await openai.chat.completions.create({
+      model: "gpt-3.5-turbo",
+      messages: [{ role: "user", content: text }],
+      functions: functions
+    });
+    return chatCompletion.choices[0].message.content;
   }
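One caveat with the block re-enabled above: when `functions` is supplied, the openai v4 SDK types message.content as nullable, and it is null whenever the model opts to call a function, so returning choices[0].message.content silently drops that case. A hedged sketch of handling both outcomes (the helper and its parameters are illustrative):

    import OpenAI from 'openai';

    async function callModel(key: string, text: string, functions: any[]): Promise<string> {
      const openai = new OpenAI({ apiKey: key });
      const chatCompletion = await openai.chat.completions.create({
        model: 'gpt-3.5-turbo',
        messages: [{ role: 'user', content: text }],
        functions: functions
      });
      const message = chatCompletion.choices[0].message;
      if (message.function_call) {
        // The model chose a function; its arguments arrive as a JSON string.
        const args = JSON.parse(message.function_call.arguments);
        return `${message.function_call.name}(${JSON.stringify(args)})`;
      }
      return message.content ?? '';
    }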


@@ -35,13 +35,24 @@
 import Path from 'path';
 import Fs from 'fs';
 import urlJoin from 'url-join';
-import path from 'path';
 import asyncPromise from 'async-promises';
 import walkPromise from 'walk-promise';
 import { SearchClient } from '@azure/search-documents';
 import Excel from 'exceljs';
 import getSlug from 'speakingurl';
 import { GBServer } from '../../../src/app.js';
+import { HNSWLib } from 'langchain/vectorstores/hnswlib';
+import { JSONLoader } from 'langchain/document_loaders/fs/json';
+import { TextLoader } from 'langchain/document_loaders/fs/text';
+import { PDFLoader } from 'langchain/document_loaders/fs/pdf';
+import { DocxLoader } from 'langchain/document_loaders/fs/docx';
+import { EPubLoader } from 'langchain/document_loaders/fs/epub';
+import { CSVLoader } from 'langchain/document_loaders/fs/csv';
+import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
+import { Document } from 'langchain/document';
+import path from 'path';
+import { YoutubeTranscript } from 'youtube-transcript';
 import {
   GBDialogStep,
   GBLog,
@@ -257,7 +268,7 @@ export class KBService implements IGBKBService {
   }

   public async ask(
-    instance: IGBInstance,
+    min: GBMinInstance,
     query: string,
     searchScore: number,
     subjects: GuaribasSubject[]
@@ -272,6 +283,8 @@ export class KBService implements IGBKBService {
     query = query.replace('\\', ' ');
     query = query.replace('\r\n', ' ');

+    const instance = min.instance;
+
     // Try simple search first.
     const data = await this.getAnswerByText(instance.instanceId, query.trim());
@@ -341,12 +354,17 @@ export class KBService implements IGBKBService {
         return { answer: undefined, questionId: 0 };
       }
     } else {
       GBLog.info(
         `SEARCH called but returned LOW level score,
         returnedScore: ${returnedScore} < required (searchScore): ${searchScore}`
       );
-      return { answer: undefined, questionId: 0 };
+      return await this.answerByGPT(min,
+        query,
+        searchScore,
+        subjects
+      );
     }
   }
@@ -354,6 +372,44 @@
     }
   }

+  // Returns the top-N matching chunks flattened into a single line for the prompt.
+  private async getRelevantContext(
+    vectorStore: HNSWLib,
+    sanitizedQuestion: string,
+    numDocuments: number
+  ): Promise<string> {
+    const documents = await vectorStore.similaritySearch(sanitizedQuestion, numDocuments);
+    return documents
+      .map((doc) => doc.pageContent)
+      .join(', ')
+      .trim()
+      .replaceAll('\n', ' ');
+  }
+
+  public async answerByGPT(min: GBMinInstance,
+    query: string,
+    searchScore: number,
+    subjects: GuaribasSubject[]
+  ) {
+    const contextVectorStore = min['vectorStore'];
+    const question = query.trim().replaceAll('\n', ' ');
+    const context = await this.getRelevantContext(contextVectorStore, question, 1);
+
+    const response = await chain.call({
+      input: question,
+      context,
+      history: '',
+      immediate_history: '',
+    });
+
+    if (response) {
+      return { answer: response.response, questionId: 0 };
+    }
+
+    return { answer: undefined, questionId: 0 };
+  }

   public async getSubjectItems(instanceId: number, parentId: number): Promise<GuaribasSubject[]> {
     const where = { parentSubjectId: parentId, instanceId: instanceId };
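Two things worth flagging in the new retrieval path. First, getRelevantContext collapses the top-N chunks into one comma-joined line, and answerByGPT requests only one chunk (numDocuments = 1), so a single nearest neighbor is all the model sees. Second, `chain` is not defined anywhere in this diff and must come from code outside the commit; below is one hypothetical assembly with langchain APIs of this era, where the prompt variables mirror the chain.call() arguments and outputKey 'response' matches the response.response read above:

    import { OpenAI } from 'langchain/llms/openai';
    import { LLMChain } from 'langchain/chains';
    import { PromptTemplate } from 'langchain/prompts';

    // Hypothetical: one way the missing `chain` could be built.
    const prompt = new PromptTemplate({
      inputVariables: ['input', 'context', 'history', 'immediate_history'],
      template: [
        'Answer the question using only the context below.',
        'Context: {context}',
        'History: {history} {immediate_history}',
        'Question: {input}',
        'Answer:'
      ].join('\n')
    });
    const chain = new LLMChain({
      llm: new OpenAI({ temperature: 0 }),
      prompt,
      outputKey: 'response'  // so chain.call() resolves to { response: ... }
    });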
@@ -830,29 +886,67 @@ export class KBService implements IGBKBService {
     await CollectionUtil.asyncForEach(files, async file => {
       let content = null;
       let filePath = Path.join(file.root, file.name);
-      if (file !== null) {
-        if (file.name.endsWith('.docx')) {
-          content = await this.getTextFromFile(filePath);
-        } else if (file.name.endsWith('.pdf')) {
-          const read = await pdf(Fs.readFileSync(filePath));
-          content = read.text;
-        }
-      }
-      if (content) {
-        content = await min.conversationalService.translate(min, content, 'en');
-        await GuaribasAnswer.create(<GuaribasAnswer>{
-          instanceId: instance.instanceId,
-          content: content,
-          format: '.docx',
-          media: file.name,
-          packageId: packageId
-        });
-      }
+      const document = await this.loadAndSplitFile(filePath);
+      const flattenedDocuments = document.reduce((acc, val) => acc.concat(val), []);
+      const vectorStore = min['vectorStore'];
+      await vectorStore.addDocuments(flattenedDocuments);
+      await vectorStore.save(min['vectorStorePath']);
     });
   }

+  // Overlap must stay below chunkSize: RecursiveCharacterTextSplitter rejects
+  // chunkOverlap >= chunkSize.
+  defaultRecursiveCharacterTextSplitter = new RecursiveCharacterTextSplitter({
+    chunkSize: 1024,
+    chunkOverlap: 128,
+  });
+
+  markdownRecursiveCharacterTextSplitter = RecursiveCharacterTextSplitter.fromLanguage('markdown', {
+    chunkSize: 1024,
+    chunkOverlap: 128,
+  });
+
+  private async loadAndSplitFile(filePath: string): Promise<Document<Record<string, unknown>>[]> {
+    const fileExtension = path.extname(filePath);
+    let loader;
+    let documents: Document<Record<string, unknown>>[];
+    switch (fileExtension) {
+      case '.json':
+        loader = new JSONLoader(filePath);
+        documents = await loader.loadAndSplit(this.defaultRecursiveCharacterTextSplitter);
+        break;
+      case '.txt':
+        loader = new TextLoader(filePath);
+        documents = await loader.loadAndSplit(this.defaultRecursiveCharacterTextSplitter);
+        break;
+      case '.md':
+        loader = new TextLoader(filePath);
+        documents = await loader.loadAndSplit(this.markdownRecursiveCharacterTextSplitter);
+        break;
+      case '.pdf':
+        loader = new PDFLoader(filePath, { splitPages: false });
+        documents = await loader.loadAndSplit(this.defaultRecursiveCharacterTextSplitter);
+        break;
+      case '.docx':
+        loader = new DocxLoader(filePath);
+        documents = await loader.loadAndSplit(this.defaultRecursiveCharacterTextSplitter);
+        break;
+      case '.csv':
+        loader = new CSVLoader(filePath);
+        documents = await loader.loadAndSplit(this.defaultRecursiveCharacterTextSplitter);
+        break;
+      case '.epub':
+        loader = new EPubLoader(filePath, { splitChapters: false });
+        documents = await loader.loadAndSplit(this.defaultRecursiveCharacterTextSplitter);
+        break;
+      default:
+        throw new Error(`Unsupported file extension: ${fileExtension}`);
+    }
+    return documents;
+  }

   public async importKbTabularDirectory(localPath: string, min: GBMinInstance, packageId: number): Promise<any> {
     const files = await walkPromise(localPath);
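Taken together, the new code forms a load, split, embed, persist pipeline: loadAndSplitFile picks a loader by extension and chunks the text, importKbContent pushes the chunks into the per-bot HNSWLib index, and the saved index is what loadOrCreateEmptyVectorStore finds on the next boot. A condensed standalone sketch of that pipeline for a single PDF (paths hypothetical):

    import { HNSWLib } from 'langchain/vectorstores/hnswlib';
    import { OpenAIEmbeddings } from 'langchain/embeddings/openai';
    import { PDFLoader } from 'langchain/document_loaders/fs/pdf';
    import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';

    async function ingestPdf(filePath: string, storePath: string): Promise<void> {
      const splitter = new RecursiveCharacterTextSplitter({ chunkSize: 1024, chunkOverlap: 128 });
      // Load the whole PDF as one document, then split it into overlapping chunks.
      const docs = await new PDFLoader(filePath, { splitPages: false }).loadAndSplit(splitter);
      // Embed the chunks and persist the index where the bot expects to load it.
      const store = await HNSWLib.fromDocuments(docs, new OpenAIEmbeddings({ maxConcurrency: 5 }));
      await store.save(storePath);
    }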