botserver/packages/kb.gbapp/services/KBService.ts

1258 lines
41 KiB
TypeScript
Raw Normal View History

2018-04-21 02:59:30 -03:00
/*****************************************************************************\
2024-01-09 17:40:48 -03:00
| ® |
| |
| |
| |
| |
2018-04-21 02:59:30 -03:00
| |
| General Bots Copyright (c) pragmatismo.com.br. All rights reserved. |
2018-04-21 02:59:30 -03:00
| Licensed under the AGPL-3.0. |
2018-11-11 19:09:18 -02:00
| |
2018-04-21 02:59:30 -03:00
| According to our dual licensing model, this program can be used either |
| under the terms of the GNU Affero General Public License, version 3, |
| or under a proprietary license. |
| |
| The texts of the GNU Affero General Public License with an additional |
| permission and of our proprietary license can be found at and |
| in the LICENSE file you have received along with this program. |
| |
| This program is distributed in the hope that it will be useful, |
2018-09-11 19:40:53 -03:00
| but WITHOUT ANY WARRANTY, without even the implied warranty of |
2018-04-21 02:59:30 -03:00
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| GNU Affero General Public License for more details. |
| |
2024-01-10 14:52:01 -03:00
| "General Bots" is a registered trademark of pragmatismo.com.br. |
2018-04-21 02:59:30 -03:00
| The licensing of the program under the AGPLv3 does not imply a |
| trademark license. Therefore any rights, title and interest in |
| our trademarks remain entirely with us. |
| |
\*****************************************************************************/
2018-11-27 22:56:11 -02:00
/**
* @fileoverview Knowledge base services and logic.
*/
import Path from 'path';
import Fs from 'fs';
import { OpenAIChat } from 'langchain/llms/openai';
import { CallbackManager } from 'langchain/callbacks';
import urlJoin from 'url-join';
import asyncPromise from 'async-promises';
import walkPromise from 'walk-promise';
import { SearchClient } from '@azure/search-documents';
import Excel from 'exceljs';
import getSlug from 'speakingurl';
import { ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate } from 'langchain/prompts';
import { LLMChain } from 'langchain/chains';
import { GBServer } from '../../../src/app.js';
import { HNSWLib } from 'langchain/vectorstores/hnswlib';
import { JSONLoader } from 'langchain/document_loaders/fs/json';
import { TextLoader } from 'langchain/document_loaders/fs/text';
import { PDFLoader } from 'langchain/document_loaders/fs/pdf';
import { DocxLoader } from 'langchain/document_loaders/fs/docx';
import { EPubLoader } from 'langchain/document_loaders/fs/epub';
import { CSVLoader } from 'langchain/document_loaders/fs/csv';
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { BufferWindowMemory } from 'langchain/memory';
import { Document } from 'langchain/document';
import path from 'path';
2020-08-26 17:50:42 -03:00
import {
GBDialogStep,
GBLog,
2020-12-31 15:36:19 -03:00
GBMinInstance,
2020-08-26 17:50:42 -03:00
IGBConversationalService,
IGBCoreService,
IGBInstance,
2020-12-31 15:36:19 -03:00
IGBKBService
2020-08-26 17:50:42 -03:00
} from 'botlib';
2020-12-31 15:36:19 -03:00
import { CollectionUtil } from 'pragmatismo-io-framework';
import { Op } from 'sequelize';
import { Sequelize } from 'sequelize-typescript';
import { AzureDeployerService } from '../../azuredeployer.gbapp/services/AzureDeployerService.js';
import { GuaribasPackage } from '../../core.gbapp/models/GBModel.js';
import { GBDeployer } from '../../core.gbapp/services/GBDeployer.js';
import { CSService } from '../../customer-satisfaction.gbapp/services/CSService.js';
import { GuaribasAnswer, GuaribasQuestion, GuaribasSubject } from '../models/index.js';
import { GBConfigService } from './../../core.gbapp/services/GBConfigService.js';
import { parse } from 'node-html-parser';
import textract from 'textract';
import pdf from 'pdf-extraction';
import { GBSSR } from '../../core.gbapp/services/GBSSR.js';
import { GBLogEx } from '../../core.gbapp/services/GBLogEx.js';
import mammoth from 'mammoth';
import { GBAdminService } from '../../admin.gbapp/services/GBAdminService.js';
import { GBVMService } from '../../basic.gblib/services/GBVMService.js';
import { DialogKeywords } from '../../basic.gblib/services/DialogKeywords.js';
2023-07-09 14:51:46 -03:00
import { GBMinService } from '../../core.gbapp/services/GBMinService.js';
2018-04-21 02:59:30 -03:00
/**
* Result for quey on KB data.
*/
2018-08-28 19:16:29 -03:00
export class KBServiceSearchResults {
public answer: GuaribasAnswer;
public questionId: number;
2018-08-28 19:16:29 -03:00
}
/**
* All services related to knowledge base management.
*/
export class KBService implements IGBKBService {
public sequelize: Sequelize;
2018-09-10 16:24:32 -03:00
constructor(sequelize: Sequelize) {
this.sequelize = sequelize;
2018-09-10 16:24:32 -03:00
}
public static getFormattedSubjectItems(subjects: GuaribasSubject[]) {
if (subjects === null) {
return '';
}
const out = [];
subjects.forEach(subject => {
out.push(subject.title);
});
2018-11-27 22:56:11 -02:00
return out.join(', ');
}
public static getSubjectItemsSeparatedBySpaces(subjects: GuaribasSubject[]) {
const out = [];
2020-08-26 17:50:42 -03:00
if (subjects === undefined) {
return '';
}
subjects.forEach(subject => {
out.push(subject.internalId);
});
2018-11-27 22:56:11 -02:00
return out.join(' ');
}
public async getAnswerTextByMediaName(instanceId: number, answerMediaName: string): Promise<string> {
const answer = await GuaribasAnswer.findOne({
where: {
instanceId: instanceId,
media: answerMediaName
}
});
2020-12-31 15:36:19 -03:00
return answer != undefined ? answer.content : null;
}
public async getQuestionById(instanceId: number, questionId: number): Promise<GuaribasQuestion> {
return GuaribasQuestion.findOne({
where: {
instanceId: instanceId,
questionId: questionId
}
});
}
public async getAnswerById(instanceId: number, answerId: number): Promise<GuaribasAnswer> {
return await GuaribasAnswer.findOne({
where: {
instanceId: instanceId,
answerId: answerId
}
});
2018-04-21 02:59:30 -03:00
}
/**
* Returns a question object given a SEO friendly URL.
*/
public async getQuestionIdFromURL(core: IGBCoreService, url: string) {
// Extracts questionId from URL.
const id = url.substr(url.lastIndexOf('-') + 1);
// Extracts botId from URL.
let path = /(http[s]?:\/\/)?([^\/\s]+\/)(.*)/gi;
const botId = url.replace(path, ($0, $1, $2, $3) => {
return $3.substr($3.indexOf('/'));
});
// Finds the associated question.
const instance = await core.loadInstanceByBotId(botId);
const question = await GuaribasQuestion.findAll({
where: {
instanceId: instance.instanceId,
questionId: id
}
});
return question;
}
public static async getQuestionsNER(instanceId: number) {
const where = {
instanceId: instanceId,
content: { [Op.like]: `%(%` }
};
const questions = await GuaribasQuestion.findAll({
where: where
});
return questions;
}
public async getQuestionsSEO(instanceId: number) {
const questions = await GuaribasQuestion.findAll({
where: {
instanceId: instanceId
}
});
let output = [];
for (let i = 0; i < questions.length; i++) {
const answer = questions[i];
const text = getSlug(answer.content);
let url = `${text}-${i}`;
output.push(url);
}
return output;
}
public async getDocs(instanceId: number) {
return await GuaribasAnswer.findAll({
where: {
instanceId: instanceId,
format: '.docx'
}
});
}
public async getAnswerByText(instanceId: number, text: string, from: string = null): Promise<any> {
text = text.trim();
const service = new CSService();
let question = await service.getQuestionFromAlternateText(instanceId, text);
if (!question) {
const where = {
instanceId: instanceId,
content: { [Op.like]: `%^[\w.]+${text}^[\w.]+%` }
};
if (from) {
where['from'] = from;
}
question = await GuaribasQuestion.findOne({
where: where
});
}
if (!question) {
let where = {
instanceId: instanceId,
content: { [Op.eq]: `${text}` }
};
question = await GuaribasQuestion.findOne({
where: where
});
}
2018-09-10 16:24:32 -03:00
if (question !== null) {
const answer = await GuaribasAnswer.findOne({
2018-09-10 16:24:32 -03:00
where: {
instanceId: instanceId,
answerId: question.answerId
}
});
2018-11-27 22:56:11 -02:00
2020-08-19 13:00:21 -03:00
return { question: question, answer: answer };
2018-09-10 16:24:32 -03:00
}
2018-11-27 22:56:11 -02:00
2020-08-19 13:00:21 -03:00
return undefined;
2018-04-21 02:59:30 -03:00
}
public async addAnswer(obj: GuaribasAnswer): Promise<GuaribasAnswer> {
2019-10-16 23:22:21 -03:00
return await GuaribasAnswer.create(obj);
2018-04-21 02:59:30 -03:00
}
public async ask(
min: GBMinInstance,
2018-09-10 16:24:32 -03:00
query: string,
2018-04-21 02:59:30 -03:00
searchScore: number,
2018-08-28 19:16:29 -03:00
subjects: GuaribasSubject[]
): Promise<KBServiceSearchResults> {
2018-09-09 14:39:37 -03:00
// Builds search query.
2018-08-28 19:16:29 -03:00
query = query.toLowerCase();
query = query.replace('?', ' ');
query = query.replace('!', ' ');
query = query.replace('.', ' ');
query = query.replace('/', ' ');
query = query.replace('\\', ' ');
query = query.replace('\r\n', ' ');
2018-08-28 19:16:29 -03:00
const instance = min.instance;
// Try simple search first.
const data = await this.getAnswerByText(instance.instanceId, query.trim());
if (data) {
GBLog.info(`Simple SEARCH called.`);
return { answer: data.answer, questionId: data.question.questionId };
}
if (subjects !== null) {
const text = KBService.getSubjectItemsSeparatedBySpaces(subjects);
if (text !== null) {
query = `${query} ${text}`;
}
2018-09-09 14:39:37 -03:00
}
const key = instance.searchKey ? instance.searchKey :
GBServer.globals.minBoot.instance.searchKey;
const host = instance.searchHost ? instance.searchHost :
GBServer.globals.minBoot.instance.searchHost;
// No direct match found, so Search is used.
if (key !== null && GBConfigService.get('STORAGE_DIALECT') === 'mssql') {
interface SearchResults {
instanceId: number;
questionId: number;
answerId: number;
content: string;
subject1: string;
subject2: string;
2022-12-15 23:03:20 -03:00
subject3: string;
subject4: string;
}
const client = new SearchClient<any>('https://' + host, 'azuresql-index', {
key: key
} as any);
const results = await client.search(query.substring(0, 499), {
filter: `instanceId eq ${instance.instanceId} and skipIndex eq false`,
2022-12-15 23:03:20 -03:00
searchFields: ['content', 'subject1', 'subject2', 'subject3', 'subject4'],
select: ['instanceId', 'questionId', 'answerId'],
skip: 0,
top: 1
});
let returnedScore = 0;
// Searches via Search (Azure Search).
let found = false;
for await (const result of results.results) {
found = true;
returnedScore = result.score;
if (returnedScore >= searchScore) {
const value = await this.getAnswerById(instance.instanceId, result.document.answerId);
if (value !== null) {
GBLog.info(`SEARCH WILL BE USED with score: ${returnedScore} > required (searchScore): ${searchScore}`);
2020-12-31 15:36:19 -03:00
return { answer: value, questionId: result.document.questionId };
} else {
GBLog.info(
`Index problem. SEARCH WILL NOT be used as answerId ${result.document.answerId} was not found in database,
returnedScore: ${returnedScore} < required (searchScore): ${searchScore}`
);
return { answer: undefined, questionId: 0 };
}
}
}
2020-12-31 15:36:19 -03:00
GBLog.info(
`SEARCH returned LOW level score, calling GPT
returnedScore: ${returnedScore} < required (searchScore): ${searchScore}`
);
return await this.answerByGPT(min,
query,
searchScore,
subjects
);
}
2018-04-21 02:59:30 -03:00
}
private async getRelevantContext(
vectorStore: HNSWLib,
sanitizedQuestion: string,
numDocuments: number
): Promise<string> {
const documents = await vectorStore.similaritySearch(sanitizedQuestion, numDocuments);
return documents
.map((doc) => doc.pageContent)
.join(', ')
.trim()
.replaceAll('\n', ' ');
}
public async answerByGPT(min: GBMinInstance,
query: string,
searchScore: number,
subjects: GuaribasSubject[]
) {
const contextVectorStore = min['vectorStore'];
const question = query.trim().replaceAll('\n', ' ');
const context = await this.getRelevantContext(contextVectorStore, question, 1);
const systemPrompt = SystemMessagePromptTemplate.fromTemplate(
"You are General Bots");
const chatPrompt = ChatPromptTemplate.fromPromptMessages([
systemPrompt,
HumanMessagePromptTemplate.fromTemplate(`Answer in pt-br.
You have access to the context (RELEVANTDOCS) provided by the user.
When answering think about whether the question in RELEVANTDOCS, but never mention
to user about the source.
Dont justify your answers. Don't refer to yourself in any of the created content.
Don´t prefix RESPONSE: when answering the user.
RELEVANTDOCS: {context}
QUESTION: """{input}"""
`),
]);
const windowMemory = new BufferWindowMemory({
returnMessages: false,
memoryKey: 'immediate_history',
inputKey: 'input',
k: 2,
});
const callbackManager = CallbackManager.fromHandlers({
// This function is called when the LLM generates a new token (i.e., a prediction for the next word)
async handleLLMNewToken(token: string) {
},
});
const llm = new OpenAIChat({
streaming: true,
callbackManager,
modelName: 'gpt-3.5-turbo',
});
const chain = new LLMChain({
prompt: chatPrompt,
memory: windowMemory,
llm,
});
const response = await chain.call({
input: question,
context,
history: '',
immediate_history: '',
});
if (response) {
return { answer: response.text, questionId: 0 };
}
return { answer: undefined, questionId: 0 };
}
public async getSubjectItems(instanceId: number, parentId: number): Promise<GuaribasSubject[]> {
const where = { parentSubjectId: parentId, instanceId: instanceId };
2018-11-27 22:56:11 -02:00
return GuaribasSubject.findAll({
where: where
});
2018-04-21 02:59:30 -03:00
}
public async getFaqBySubjectArray(instanceId: number, from: string, subjects: any): Promise<GuaribasQuestion[]> {
if (subjects) {
const where = {
from: from,
// tslint:disable-next-line: no-null-keyword
subject1: null,
// tslint:disable-next-line: no-null-keyword
subject2: null,
// tslint:disable-next-line: no-null-keyword
subject3: null,
// tslint:disable-next-line: no-null-keyword
subject4: null,
2021-02-28 12:26:44 -03:00
// tslint:disable-next-line: no-null-keyword
instanceId: instanceId
};
if (subjects[0] && subjects[0].internalId) {
where.subject1 = subjects[0].internalId;
}
2018-04-21 02:59:30 -03:00
if (subjects[1] && subjects[1].internalId) {
where.subject2 = subjects[1].internalId;
}
2018-04-21 02:59:30 -03:00
if (subjects[2] && subjects[2].internalId) {
where.subject3 = subjects[2].internalId;
}
2018-09-09 14:39:37 -03:00
if (subjects[3] && subjects[3].internalId) {
where.subject4 = subjects[3].internalId;
}
2018-11-27 22:56:11 -02:00
return await GuaribasQuestion.findAll({
where: where
});
} else {
return await GuaribasQuestion.findAll({
2021-02-28 13:06:05 -03:00
where: { from: from, instanceId: instanceId }
});
}
2018-04-21 02:59:30 -03:00
}
public static async getGroupReplies(instanceId: number): Promise<GuaribasQuestion[]> {
return await GuaribasQuestion.findAll({
where: { from: 'group', instanceId: instanceId }
});
}
public async importKbTabularFile(
2018-09-09 14:39:37 -03:00
filePath: string,
min: GBMinInstance,
2018-09-09 14:39:37 -03:00
packageId: number
): Promise<GuaribasQuestion[]> {
GBLog.info(`Now reading file ${filePath}...`);
2020-12-31 15:36:19 -03:00
const workbook = new Excel.Workbook();
const data = await workbook.xlsx.readFile(filePath);
let lastQuestionId: number;
let lastAnswer: GuaribasAnswer;
2020-08-26 17:50:42 -03:00
// Finds a valid worksheet because Excel returns empty slots
// when loading worksheets collection.
2020-08-26 17:50:42 -03:00
let worksheet: any;
for (let t = 0; t < data.worksheets.length; t++) {
worksheet = data.worksheets[t];
if (worksheet) {
2020-08-26 17:50:42 -03:00
break;
}
}
2020-12-31 15:36:19 -03:00
const rows = worksheet._rows;
const answers = [];
const questions = [];
GBLog.info(`Processing ${rows.length} rows from tabular file ${filePath}...`);
await asyncPromise.eachSeries(rows, async line => {
// Skips the first line.
2020-08-26 17:50:42 -03:00
if (
line != undefined &&
line._cells[0] !== undefined &&
line._cells[1] !== undefined &&
line._cells[2] !== undefined &&
line._cells[3] !== undefined &&
2020-08-26 17:50:42 -03:00
line._cells[4] !== undefined
) {
// Extracts values from columns in the current line.
const subjectsText = line._cells[0].text;
const from = line._cells[1].text;
const to = line._cells[2].text;
const question = line._cells[3].text.trim();
let answer = line._cells[4].text.trim();
if (
!(subjectsText === 'subjects' && from === 'from') &&
answer !== null &&
question !== null &&
answer !== '' &&
question !== ''
) {
let format = '.txt';
2018-09-09 18:11:41 -03:00
// Extracts answer from external media if any.
let media = null;
2020-08-26 17:50:42 -03:00
if (typeof answer !== 'string') {
GBLog.info(`[GBImporter] Answer is NULL related to Question '${question}'.`);
2020-08-26 17:50:42 -03:00
answer =
'Existe um problema na base de conhecimento. Fui treinado para entender sua pergunta, avise a quem me criou que a resposta não foi informada para esta pergunta.';
} else if (answer.indexOf('.md') > -1 || answer.indexOf('.docx') > -1) {
const mediaFilename = urlJoin(path.dirname(filePath), '..', 'articles', answer);
if (Fs.existsSync(mediaFilename)) {
// Tries to load .docx file from Articles folder.
if (answer.indexOf('.docx') > -1) {
answer = await this.getTextFromFile(filePath);
} else {
// Loads normally markdown file.
answer = Fs.readFileSync(mediaFilename, 'utf8');
}
format = '.md';
media = path.basename(mediaFilename);
} else {
if (answer.indexOf('.md') > -1) {
GBLog.info(`[GBImporter] File not found: ${mediaFilename}.`);
answer = '';
}
}
}
2018-09-10 12:09:48 -03:00
// Processes subjects hierarchy splitting by dots.
const subjectArray = subjectsText.split('.');
let subject1: string;
let subject2: string;
let subject3: string;
let subject4: string;
let indexer = 0;
subjectArray.forEach(element => {
if (indexer === 0) {
subject1 = subjectArray[indexer].substring(0, 63);
} else if (indexer === 1) {
subject2 = subjectArray[indexer].substring(0, 63);
} else if (indexer === 2) {
subject3 = subjectArray[indexer].substring(0, 63);
} else if (indexer === 3) {
subject4 = subjectArray[indexer].substring(0, 63);
}
indexer++;
});
// Skips blank answers.
if (answer && answer.trim() === '') {
return false;
}
// In case of code cell, compiles it and associate with the answer.
answer = GBVMService.normalizeQuotes(answer);
const isBasic = answer.toLowerCase().startsWith('/basic');
if (/TALK\s*\".*\"/gi.test(answer) || isBasic) {
const code = isBasic ? answer.substr(6) : answer;
const path = DialogKeywords.getGBAIPath(min.botId, `gbdialog`);
const scriptName = `tmp${GBAdminService.getRndReadableIdentifier()}.docx`;
const localName = Path.join('work', path, `${scriptName}`);
Fs.writeFileSync(localName, code, { encoding: null });
answer = scriptName;
const vm = new GBVMService();
await vm.loadDialog(Path.basename(localName), Path.dirname(localName), min);
}
// Now with all the data ready, creates entities in the store.
const answer1 = {
instanceId: min.instance.instanceId,
content: answer,
format: format,
media: media,
packageId: packageId,
prevId: lastQuestionId !== null ? lastQuestionId : 0
};
answers.push(answer1);
const question1 = {
from: from,
to: to,
subject1: subject1,
subject2: subject2,
subject3: subject3,
subject4: subject4,
content: question.replace(/["]+/g, ''),
instanceId: min.instance.instanceId,
skipIndex: question.charAt(0) === '"',
packageId: packageId
};
questions.push(question1);
2022-12-26 13:37:11 -03:00
// https://github.com/GeneralBots/BotServer/issues/312
// if (lastAnswer !== undefined && lastQuestionId !== 0) {
// await lastAnswer.update({ nextId: lastQuestionId });
// }
// lastAnswer = answer1;
// lastQuestionId = question1.questionId;
return true;
} else {
// Skips the header.
2020-08-19 13:00:21 -03:00
return undefined;
}
}
});
const answersCreated = await GuaribasAnswer.bulkCreate(answers);
let i = 0;
await CollectionUtil.asyncForEach(questions, async question => {
question.answerId = answersCreated[i++].answerId;
});
2020-12-31 15:36:19 -03:00
return await GuaribasQuestion.bulkCreate(questions);
2018-04-21 02:59:30 -03:00
}
public async sendAnswer(min: GBMinInstance, channel: string, step: GBDialogStep, answer) {
answer = typeof (answer) === 'string' ? answer : answer.content;
if (answer.endsWith('.mp4')) {
await this.playVideo(min, min.conversationalService, step, answer, channel);
} else if (
answer.endsWith('.ppt') ||
answer.endsWith('.pptx') ||
answer.endsWith('.doc') ||
answer.endsWith('.docx') ||
answer.endsWith('.xls') ||
answer.endsWith('.xlsx')
) {
const path = DialogKeywords.getGBAIPath(min.botId, `gbkb`);
const doc = urlJoin(GBServer.globals.publicAddress, 'kb', path, 'assets', answer);
const url = `http://view.officeapps.live.com/op/view.aspx?src=${doc}`;
await this.playUrl(min, min.conversationalService, step, url, channel);
} else if (answer.endsWith('.pdf')) {
const path = DialogKeywords.getGBAIPath(min.botId, `gbkb`);
const url = urlJoin('kb', path, 'assets', answer);
await this.playUrl(min, min.conversationalService, step, url, channel);
2020-08-26 17:50:42 -03:00
} else if (answer.format === '.md') {
await min.conversationalService['playMarkdown'](min, answer, channel, step,
2023-07-09 14:51:46 -03:00
GBMinService.userMobile(step));
} else if (answer.endsWith('.ogg') && process.env.AUDIO_DISABLED !== 'true') {
await this.playAudio(min, answer, channel, step, min.conversationalService);
} else {
await min.conversationalService.sendText(min, step, answer);
await min.conversationalService.sendEvent(min, step, 'stop', undefined);
}
}
public async addQA(min, questionText, answerText) {
const pkg = await GuaribasPackage.findOne({
where: { instanceId: min.instance.instanceId }
});
const question = {
from: 'autodialog',
to: '',
subject1: '',
subject2: '',
subject3: '',
subject4: '',
content: questionText.replace(/["]+/g, ''),
instanceId: min.instance.instanceId,
skipIndex: false,
packageId: pkg.packageId
};
const answer = {
instanceId: min.instance.instanceId,
content: answerText,
format: '.txt',
media: null,
packageId: pkg.packageId,
prevId: 0
};
const a = await GuaribasAnswer.create(answer);
question['answerId'] = a.answerId;
const q = await GuaribasQuestion.create(question);
}
public async importKbPackage(
min: GBMinInstance,
localPath: string,
packageStorage: GuaribasPackage,
instance: IGBInstance
): Promise<any> {
// Imports subjects tree into database and return it.
2018-09-09 14:39:37 -03:00
const subjectFile = urlJoin(localPath, 'subjects.json');
const menuFile = urlJoin(localPath, 'menu.xlsx');
// Imports menu.xlsx if any.
if (Fs.existsSync(subjectFile) || Fs.existsSync(menuFile)) {
await this.importSubjectFile(packageStorage.packageId, subjectFile, menuFile, instance);
}
2018-09-09 14:39:37 -03:00
// Import tabular files in the tabular directory.
await this.importKbTabularDirectory(localPath, min, packageStorage.packageId);
2018-04-21 02:59:30 -03:00
// Import remaining .md files in articles directory.
await this.importRemainingArticles(localPath, instance, packageStorage.packageId);
// Import docs files in .docx directory.
return await this.importDocs(min, localPath, instance, packageStorage.packageId);
}
2018-09-09 18:11:41 -03:00
/**
* Import all .md files in articles folder that has not been referenced by tabular files.
*/
public async importRemainingArticles(localPath: string, instance: IGBInstance, packageId: number): Promise<any> {
const files = await walkPromise(urlJoin(localPath, 'articles'));
const data = { questions: [], answers: [] };
await CollectionUtil.asyncForEach(files, async file => {
if (file !== null && file.name.endsWith('.md')) {
let content = await this.getAnswerTextByMediaName(instance.instanceId, file.name);
if (content === null) {
const fullFilename = urlJoin(file.root, file.name);
content = Fs.readFileSync(fullFilename, 'utf-8');
2022-01-03 13:11:21 -03:00
await GuaribasAnswer.create(<GuaribasAnswer>{
instanceId: instance.instanceId,
content: content,
2020-08-26 17:50:42 -03:00
format: '.md',
media: file.name,
packageId: packageId,
2022-12-26 13:37:11 -03:00
prevId: 0 // https://github.com/GeneralBots/BotServer/issues/312
});
}
} else if (file !== null && file.name.endsWith('.docx')) {
const path = DialogKeywords.getGBAIPath(instance.botId, `gbkb`);
const localName = Path.join('work', path, 'articles', file.name);
const buffer = Fs.readFileSync(localName, { encoding: null });
var options = {
buffer: buffer,
convertImage: async image => {
const localName = Path.join(
'work',
DialogKeywords.getGBAIPath(instance.botId),
'cache',
`img-docx${GBAdminService.getRndReadableIdentifier()}.png`
);
const url = urlJoin(
GBServer.globals.publicAddress,
DialogKeywords.getGBAIPath(instance.botId).replace(/\.[^/.]+$/, ''),
'cache',
Path.basename(localName)
);
const buffer = await image.read();
Fs.writeFileSync(localName, buffer, { encoding: null });
return { src: url };
}
};
let state = 0;
let previousState = state;
const next = (root, el, data) => {
// If it is root, change to the first item.
if (el.parentNode == null) {
el = el.firstChild;
}
let value = el.innerHTML;
const isHeader = el => el.rawTagName.startsWith('h') && el.rawTagName.length === 2;
// Handle questions from H* elements.
if (state === 0) {
const question = {
from: 'document',
to: '',
subject1: '',
subject2: '',
subject3: '',
subject4: '',
content: value.replace(/["]+/g, ''),
instanceId: instance.instanceId,
skipIndex: 0,
packageId: packageId
};
data.questions.push(question);
previousState = state;
state = 1;
// Everything else is content for that Header.
} else if (state === 1) {
// If next element is null, the tree has been passed, so
// finish the append of other elements between the last Header
// and the end of the document.
if (!el.nextSibling || isHeader(el.nextSibling)) {
const answer = {
instanceId: instance.instanceId,
content: value,
format: '.html',
media: file.name,
packageId: packageId,
prevId: 0
};
data.answers.push(answer);
state = 0;
// Otherwise, just append content to insert later.
} else {
value += value;
}
}
// Goes to the next node, as it is all same level nodes.
if (el.nextSibling) {
next(root, el.nextSibling, data);
}
};
const html = await mammoth.convertToHtml(options);
const root = parse(html.value);
next(root, root, data);
}
// Persist to storage.
const answersCreated = await GuaribasAnswer.bulkCreate(data.answers);
let i = 0;
await CollectionUtil.asyncForEach(data.questions, async question => {
question.answerId = answersCreated[i++].answerId;
});
return await GuaribasQuestion.bulkCreate(data.questions);
});
}
/**
* Import all .docx files in reading comprehension folder.
*/
public async importDocs(
min: GBMinInstance,
localPath: string,
instance: IGBInstance,
packageId: number
): Promise<any> {
const files = await walkPromise(urlJoin(localPath, 'docs'));
if (!files[0]) {
GBLog.info(
`[GBDeployer] docs folder not created yet in .gbkb. To use Reading Comprehension, create this folder at root and put a document to get read by the.`
);
} else {
await CollectionUtil.asyncForEach(files, async file => {
let content = null;
let filePath = Path.join(file.root, file.name);
const document = await this.loadAndSplitFile(filePath);
const flattenedDocuments = document.reduce((acc, val) => acc.concat(val), []);
const vectorStore = min['vectorStore'];
await vectorStore.addDocuments(flattenedDocuments);
await vectorStore.save(min['vectorStorePath']);
});
}
}
defaultRecursiveCharacterTextSplitter = new RecursiveCharacterTextSplitter({
chunkSize: 700,
chunkOverlap: 50,
});
markdownRecursiveCharacterTextSplitter = RecursiveCharacterTextSplitter.fromLanguage('markdown', {
chunkSize: 700,
chunkOverlap: 50,
});
private async loadAndSplitFile(filePath: string): Promise<Document<Record<string, unknown>>[]> {
const fileExtension = path.extname(filePath);
let loader;
let documents: Document<Record<string, unknown>>[];
switch (fileExtension) {
case '.json':
loader = new JSONLoader(filePath);
documents = await loader.loadAndSplit(this.defaultRecursiveCharacterTextSplitter);
break;
case '.txt':
loader = new TextLoader(filePath);
documents = await loader.loadAndSplit(this.defaultRecursiveCharacterTextSplitter);
break;
case '.md':
loader = new TextLoader(filePath);
documents = await loader.loadAndSplit(this.markdownRecursiveCharacterTextSplitter);
break;
case '.pdf':
loader = new PDFLoader(filePath, { splitPages: false });
documents = await loader.loadAndSplit(this.defaultRecursiveCharacterTextSplitter);
break;
case '.docx':
loader = new DocxLoader(filePath);
documents = await loader.loadAndSplit(this.defaultRecursiveCharacterTextSplitter);
break;
case '.csv':
loader = new CSVLoader(filePath);
documents = await loader.loadAndSplit(this.defaultRecursiveCharacterTextSplitter);
break;
case '.epub':
loader = new EPubLoader(filePath, { splitChapters: false });
documents = await loader.loadAndSplit(this.defaultRecursiveCharacterTextSplitter);
break;
default:
throw new Error(`Unsupported file extension: ${fileExtension}`);
}
return documents;
}
public async importKbTabularDirectory(localPath: string, min: GBMinInstance, packageId: number): Promise < any > {
const files = await walkPromise(localPath);
await CollectionUtil.asyncForEach(files, async file => {
if (file !== null && file.name.endsWith('.xlsx')) {
return await this.importKbTabularFile(urlJoin(file.root, file.name), min, packageId);
}
});
}
public async importSubjectFile(
packageId: number,
filename: string,
menuFile: string,
instance: IGBInstance
): Promise < any > {
let subjectsLoaded;
if(menuFile) {
// Loads menu.xlsx and finds worksheet.
const workbook = new Excel.Workbook();
const data = await workbook.xlsx.readFile(menuFile);
let worksheet: any;
for (let t = 0; t < data.worksheets.length; t++) {
worksheet = data.worksheets[t];
if (worksheet) {
break;
}
}
const MAX_LEVEL = 4; // Max column level to reach menu items in plan.
// Iterates over all items.
let rows = worksheet._rows;
rows.length = 24;
let lastLevel = 0;
let subjects = { children: [] };
let childrenNode = subjects.children;
let activeObj = null;
let activeChildrenGivenLevel = [childrenNode];
await asyncPromise.eachSeries(rows, async row => {
if (!row) return;
let menu;
// Detect menu level by skipping blank cells on left.
let level;
for (level = 0; level < MAX_LEVEL; level++) {
const cell = row._cells[level];
if (cell && cell.text) {
menu = cell.text;
break;
}
}
// Tree hierarchy calculation.
if (level > lastLevel) {
childrenNode = activeObj.children;
} else if (level < lastLevel) {
childrenNode = activeChildrenGivenLevel[level];
}
/// Keeps the record of last subroots for each level, to
// changel levels greater than one (return to main menu),
// can exists between leaf nodes and roots.
activeChildrenGivenLevel[level] = childrenNode;
// Insert the object into JSON.
const description = row._cells[level + 1] ? row._cells[level + 1].text : null;
activeObj = {
title: menu,
description: description,
id: menu,
children: []
};
activeChildrenGivenLevel[level].push(activeObj);
lastLevel = level;
});
subjectsLoaded = subjects;
} else {
subjectsLoaded = JSON.parse(Fs.readFileSync(filename, 'utf8'));
}
const doIt = async (subjects: GuaribasSubject[], parentSubjectId: number) => {
return asyncPromise.eachSeries(subjects, async item => {
const value = await GuaribasSubject.create(<GuaribasSubject>{
internalId: item.id,
parentSubjectId: parentSubjectId,
instanceId: instance.instanceId,
from: item.from,
to: item.to,
title: item.title,
description: item.description,
packageId: packageId
});
2018-09-10 16:24:32 -03:00
if (item.children) {
return doIt(item.children, value.subjectId);
} else {
return item;
}
});
};
return doIt(subjectsLoaded.children, undefined);
}
public async undeployKbFromStorage(instance: IGBInstance, deployer: GBDeployer, packageId: number) {
await GuaribasQuestion.destroy({
where: { instanceId: instance.instanceId, packageId: packageId }
});
await GuaribasAnswer.destroy({
where: { instanceId: instance.instanceId, packageId: packageId }
});
await GuaribasSubject.destroy({
where: { instanceId: instance.instanceId, packageId: packageId }
});
await this.undeployPackageFromStorage(instance, packageId);
}
2018-04-21 02:59:30 -03:00
public static async RefreshNER(min: GBMinInstance) {
const questions = await KBService.getQuestionsNER(min.instance.instanceId);
const contentLocale = min.core.getParam<string>(
min.instance,
'Default Content Language',
GBConfigService.get('DEFAULT_CONTENT_LANGUAGE')
);
await CollectionUtil.asyncForEach(questions, async question => {
const text = question.content;
const categoryReg = /.*\((.*)\).*/gi.exec(text);
const nameReg = /(\w+)\(.*\).*/gi.exec(text);
if (categoryReg) {
let category = categoryReg[1];
if (category === 'number') {
min['nerEngine'].addRegexEntity('number', 'pt', '/d+/gi');
}
if (nameReg) {
let name = nameReg[1];
min['nerEngine'].addNamedEntityText(category, name, [contentLocale], [name]);
2022-06-12 18:43:19 -03:00
}
}
});
}
2018-04-21 02:59:30 -03:00
/**
* Deploys a knowledge base to the storage using the .gbkb format.
*
* @param localPath Path to the .gbkb folder.
*/
public async deployKb(core: IGBCoreService, deployer: GBDeployer, localPath: string, min: GBMinInstance) {
const packageName = Path.basename(localPath);
const instance = await core.loadInstanceByBotId(min.botId);
GBLog.info(`[GBDeployer] Importing: ${localPath}`);
const p = await deployer.deployPackageToStorage(instance.instanceId, packageName);
await this.importKbPackage(min, localPath, p, instance);
GBDeployer.mountGBKBAssets(packageName, min.botId, localPath);
const service = await AzureDeployerService.createInstance(deployer);
const searchIndex = instance.searchIndex ? instance.searchIndex : GBServer.globals.minBoot.instance.searchIndex;
await deployer.rebuildIndex(instance, service.getKBSearchSchema(searchIndex));
min['groupCache'] = await KBService.getGroupReplies(instance.instanceId);
await KBService.RefreshNER(min);
GBLog.info(`[GBDeployer] Start Bot Server Side Rendering... ${localPath}`);
const html = await GBSSR.getHTML(min);
let path = DialogKeywords.getGBAIPath(min.botId, `gbui`);
path = Path.join(process.env.PWD, 'work', path, 'index.html');
GBLogEx.info(min, `[GBDeployer] Saving SSR HTML in ${path}.`);
Fs.writeFileSync(path, html, 'utf8');
GBLog.info(`[GBDeployer] Finished import of ${localPath}`);
}
2020-12-31 15:36:19 -03:00
private async playAudio(
min: GBMinInstance,
answer: GuaribasAnswer,
channel: string,
step: GBDialogStep,
conversationalService: IGBConversationalService
) {
conversationalService.sendAudio(min, step, answer.content);
}
2020-12-31 15:36:19 -03:00
private async playUrl(
min,
conversationalService: IGBConversationalService,
step: GBDialogStep,
url: string,
channel: string
) {
if (channel === 'whatsapp') {
await min.conversationalService.sendFile(min, step, null, url, '');
} else {
await conversationalService.sendEvent(min, step, 'play', {
playerType: 'url',
data: url
});
}
}
private async playVideo(
min,
conversationalService: IGBConversationalService,
step: GBDialogStep,
answer: GuaribasAnswer,
channel: string
) {
if (channel === 'whatsapp') {
await min.conversationalService.sendFile(min, step, null, answer.content, '');
} else {
const path = DialogKeywords.getGBAIPath(min.botId, `gbkb`);
await conversationalService.sendEvent(min, step, 'play', {
playerType: 'video',
data: urlJoin(path, 'videos', answer.content)
});
2020-12-31 15:36:19 -03:00
}
}
2020-12-31 15:36:19 -03:00
private async undeployPackageFromStorage(instance: any, packageId: number) {
await GuaribasPackage.destroy({
where: { instanceId: instance.instanceId, packageId: packageId }
});
}
public async readComprehension(instanceId: number, doc: string, question: string) {
const url =
`http://${process.env.GBMODELS_SERVER}/reading-comprehension` +
new URLSearchParams({ question: question, key: process.env.GBMODELS_KEY });
const form = new FormData();
form.append('content', doc);
const options = {
body: form
};
GBLog.info(`[General Bots Models]: ReadComprehension for ${question}.`);
return await fetch(url, options);
}
private async getTextFromFile(filename: string) {
return new Promise<string>(async (resolve, reject) => {
textract.fromFileWithPath(filename, { preserveLineBreaks: true }, (error, text) => {
if (error) {
reject(error);
} else {
resolve(text);
}
});
});
}
2018-04-21 02:59:30 -03:00
}