botserver/packages/kb.gbapp/services/KBService.ts

1189 lines
38 KiB
TypeScript
Raw Normal View History

2018-04-21 02:59:30 -03:00
/*****************************************************************************\
2024-01-09 17:40:48 -03:00
| ® |
| |
| |
| |
| |
2018-04-21 02:59:30 -03:00
| |
| General Bots Copyright (c) pragmatismo.com.br. All rights reserved. |
2018-04-21 02:59:30 -03:00
| Licensed under the AGPL-3.0. |
2018-11-11 19:09:18 -02:00
| |
2018-04-21 02:59:30 -03:00
| According to our dual licensing model, this program can be used either |
| under the terms of the GNU Affero General Public License, version 3, |
| or under a proprietary license. |
| |
| The texts of the GNU Affero General Public License with an additional |
| permission and of our proprietary license can be found at and |
| in the LICENSE file you have received along with this program. |
| |
| This program is distributed in the hope that it will be useful, |
2018-09-11 19:40:53 -03:00
| but WITHOUT ANY WARRANTY, without even the implied warranty of |
2018-04-21 02:59:30 -03:00
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| GNU Affero General Public License for more details. |
| |
2024-01-10 14:52:01 -03:00
| "General Bots" is a registered trademark of pragmatismo.com.br. |
2018-04-21 02:59:30 -03:00
| The licensing of the program under the AGPLv3 does not imply a |
| trademark license. Therefore any rights, title and interest in |
| our trademarks remain entirely with us. |
| |
\*****************************************************************************/
2018-11-27 22:56:11 -02:00
/**
* @fileoverview Knowledge base services and logic.
*/
import Path from 'path';
import Fs from 'fs';
import urlJoin from 'url-join';
import asyncPromise from 'async-promises';
import walkPromise from 'walk-promise';
import { SearchClient } from '@azure/search-documents';
import Excel from 'exceljs';
import getSlug from 'speakingurl';
import { GBServer } from '../../../src/app.js';
import { JSONLoader } from 'langchain/document_loaders/fs/json';
import { TextLoader } from 'langchain/document_loaders/fs/text';
import { PDFLoader } from 'langchain/document_loaders/fs/pdf';
import { DocxLoader } from 'langchain/document_loaders/fs/docx';
import { EPubLoader } from 'langchain/document_loaders/fs/epub';
import { CSVLoader } from 'langchain/document_loaders/fs/csv';
import path from 'path';
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { Document } from 'langchain/document';
2020-08-26 17:50:42 -03:00
import {
GBDialogStep,
GBLog,
2020-12-31 15:36:19 -03:00
GBMinInstance,
2020-08-26 17:50:42 -03:00
IGBConversationalService,
IGBCoreService,
IGBInstance,
2020-12-31 15:36:19 -03:00
IGBKBService
2020-08-26 17:50:42 -03:00
} from 'botlib';
2020-12-31 15:36:19 -03:00
import { CollectionUtil } from 'pragmatismo-io-framework';
import { Op } from 'sequelize';
import { Sequelize } from 'sequelize-typescript';
import { AzureDeployerService } from '../../azuredeployer.gbapp/services/AzureDeployerService.js';
import { GuaribasPackage } from '../../core.gbapp/models/GBModel.js';
import { GBDeployer } from '../../core.gbapp/services/GBDeployer.js';
import { CSService } from '../../customer-satisfaction.gbapp/services/CSService.js';
import { GuaribasAnswer, GuaribasQuestion, GuaribasSubject } from '../models/index.js';
import { GBConfigService } from './../../core.gbapp/services/GBConfigService.js';
import { parse } from 'node-html-parser';
import textract from 'textract';
import pdf from 'pdf-extraction';
import { GBSSR } from '../../core.gbapp/services/GBSSR.js';
import { GBLogEx } from '../../core.gbapp/services/GBLogEx.js';
import mammoth from 'mammoth';
import { GBAdminService } from '../../admin.gbapp/services/GBAdminService.js';
import { GBVMService } from '../../basic.gblib/services/GBVMService.js';
import { DialogKeywords } from '../../basic.gblib/services/DialogKeywords.js';
2023-07-09 14:51:46 -03:00
import { GBMinService } from '../../core.gbapp/services/GBMinService.js';
import { ChatServices } from '../../gpt.gblib/services/ChatServices.js';
2018-04-21 02:59:30 -03:00
/**
* Result for query on KB data.
*/
2018-08-28 19:16:29 -03:00
export class KBServiceSearchResults {
  /** Answer found for the query: either plain text or the stored answer entity. */
  public answer: string | GuaribasAnswer;

  /** Identifier of the question that matched the query. */
  public questionId: number;
}
/**
* All services related to knowledge base management.
*/
export class KBService implements IGBKBService {
/**
 * Creates the service and keeps the given ORM connection on the public
 * `sequelize` member.
 *
 * @param sequelize Sequelize instance stored on the service.
 */
constructor(public sequelize: Sequelize) {}
/**
 * Builds a comma separated list with the titles of the given subjects.
 *
 * @param subjects Subjects to format; `null` or `undefined` yields an empty string.
 * @returns The subject titles joined by `', '`.
 */
public static getFormattedSubjectItems(subjects: GuaribasSubject[]) {
  // `== null` deliberately matches both null and undefined, so a missing
  // subject list never reaches the iteration below.
  if (subjects == null) {
    return '';
  }

  return subjects.map(subject => subject.title).join(', ');
}
/**
 * Builds a space separated list with the internal ids of the given subjects.
 *
 * @param subjects Subjects to format; `null` or `undefined` yields an empty string.
 * @returns The subject internal ids joined by a single space.
 */
public static getSubjectItemsSeparatedBySpaces(subjects: GuaribasSubject[]) {
  // `== null` deliberately matches both null and undefined, so a missing
  // subject list never reaches the iteration below.
  if (subjects == null) {
    return '';
  }

  return subjects.map(subject => subject.internalId).join(' ');
}
/**
 * Loads the content of the answer stored for a given media file name.
 *
 * @param instanceId Bot instance that owns the answer.
 * @param answerMediaName Value of the answer's `media` column.
 * @returns The answer content, or `null` when no answer matches.
 */
public async getAnswerTextByMediaName(instanceId: number, answerMediaName: string): Promise<string> {
  const found = await GuaribasAnswer.findOne({
    where: { instanceId: instanceId, media: answerMediaName }
  });

  if (found == undefined) {
    return null;
  }

  return found.content;
}
/**
 * Finds a single question by its id within a bot instance.
 *
 * @param instanceId Bot instance that owns the question.
 * @param questionId Id of the question.
 * @returns The result of the `findOne` query.
 */
public async getQuestionById(instanceId: number, questionId: number): Promise<GuaribasQuestion> {
  const filter = { instanceId: instanceId, questionId: questionId };

  return GuaribasQuestion.findOne({ where: filter });
}
/**
 * Finds a single answer by its id within a bot instance.
 *
 * @param instanceId Bot instance that owns the answer.
 * @param answerId Id of the answer.
 * @returns The result of the `findOne` query.
 */
public async getAnswerById(instanceId: number, answerId: number): Promise<GuaribasAnswer> {
  const filter = { instanceId: instanceId, answerId: answerId };
  const found = await GuaribasAnswer.findOne({ where: filter });

  return found;
}
/**
 * Finds the questions referenced by a SEO friendly URL.
 *
 * The text after the last '-' of the URL is used as the question id, and a
 * fragment of the URL path is used as the bot id to load the instance.
 *
 * NOTE(review): the bot id fragment is the URL path from its first '/'
 * onwards (slash included) — confirm this matches the URL layout callers use.
 *
 * @param core Core service used to load the bot instance.
 * @param url SEO friendly URL.
 * @returns The result of the `findAll` query (an array of questions).
 */
public async getQuestionIdFromURL(core: IGBCoreService, url: string) {
  // Everything after the last dash is the question id.
  const questionId = url.slice(url.lastIndexOf('-') + 1);

  // Strips scheme and host, then keeps the path from its first slash onwards.
  const urlPattern = /(http[s]?:\/\/)?([^\/\s]+\/)(.*)/gi;
  const botId = url.replace(urlPattern, (_match, _scheme, _host, rest) => rest.slice(rest.indexOf('/')));

  // Finds the associated question.
  const instance = await core.loadInstanceByBotId(botId);

  return GuaribasQuestion.findAll({
    where: { instanceId: instance.instanceId, questionId: questionId }
  });
}
/**
 * Lists the questions of an instance whose content contains an opening
 * parenthesis; `RefreshNER` parses these as entity declarations.
 *
 * @param instanceId Bot instance to query.
 * @returns The matching questions.
 */
public static async getQuestionsNER(instanceId: number) {
  return await GuaribasQuestion.findAll({
    where: {
      instanceId: instanceId,
      content: { [Op.like]: `%(%` }
    }
  });
}
/**
 * Builds one SEO friendly slug per question of the instance.
 *
 * Each slug is the slugified question content followed by '-' and the
 * position of the question in the query result.
 *
 * @param instanceId Bot instance to query.
 * @returns The list of slugs, in query order.
 */
public async getQuestionsSEO(instanceId: number) {
  const questions = await GuaribasQuestion.findAll({
    where: { instanceId: instanceId }
  });

  return questions.map((item, position) => `${getSlug(item.content)}-${position}`);
}
/**
 * Lists the answers of an instance stored with the '.docx' format.
 *
 * @param instanceId Bot instance to query.
 * @returns The matching answers.
 */
public async getDocs(instanceId: number) {
  const filter = { instanceId: instanceId, format: '.docx' };
  const docs = await GuaribasAnswer.findAll({ where: filter });

  return docs;
}
/**
 * Looks up a question by its text and loads the answer linked to it.
 *
 * Three lookups are tried in order, stopping at the first hit: the alternate
 * texts kept by the customer satisfaction service, a LIKE query on the
 * question content (optionally restricted by `from`), and an exact match on
 * the question content.
 *
 * @param instanceId Bot instance to query.
 * @param text Text typed by the user; it is trimmed before use.
 * @param from Optional origin filter, applied to the LIKE lookup only.
 * @returns `{ question, answer }` when a question is found, otherwise `undefined`.
 */
public async getAnswerByText(instanceId: number, text: string, from: string = null): Promise<any> {
  text = text.trim();

  // 1st attempt: alternate texts.
  let question = await new CSService().getQuestionFromAlternateText(instanceId, text);

  // 2nd attempt: LIKE lookup on the content, optionally filtered by origin.
  if (!question) {
    const likeFilter = {
      instanceId: instanceId,
      content: { [Op.like]: `%^[\w.]+${text}^[\w.]+%` }
    };
    question = await GuaribasQuestion.findOne({
      where: from ? { ...likeFilter, from: from } : likeFilter
    });
  }

  // 3rd attempt: exact match on the content.
  if (!question) {
    question = await GuaribasQuestion.findOne({
      where: { instanceId: instanceId, content: { [Op.eq]: `${text}` } }
    });
  }

  if (question === null) {
    return undefined;
  }

  const answer = await GuaribasAnswer.findOne({
    where: { instanceId: instanceId, answerId: question.answerId }
  });

  return { question: question, answer: answer };
}
/**
 * Persists a new answer.
 *
 * @param obj Answer data to store.
 * @returns The created answer entity.
 */
public async addAnswer(obj: GuaribasAnswer): Promise<GuaribasAnswer> {
  const created = await GuaribasAnswer.create(obj);

  return created;
}
/**
 * Answers a user query using, in order: a direct text match in the database,
 * Azure Search (only with a search key and the 'mssql' storage dialect), and
 * finally the GPT based chat service.
 *
 * @param min Bot instance runtime.
 * @param user User forwarded to the chat service.
 * @param step Dialog step (kept for interface compatibility; not used here).
 * @param pid Process id forwarded to the chat service.
 * @param query Raw text typed by the user.
 * @param searchScore Minimum score a search hit needs to be accepted.
 * @param subjects Subjects currently selected by the user, if any.
 * @returns The answer and the id of the matched question.
 */
public async ask(
  min: GBMinInstance,
  user,
  step,
  pid,
  query: string,
  searchScore: number,
  subjects: GuaribasSubject[]
): Promise<KBServiceSearchResults> {
  // Builds search query. A global regular expression is required because
  // String.replace with a string pattern only replaces the FIRST occurrence,
  // which left punctuation in queries such as "what? why?".
  query = query.toLowerCase().replace(/\r\n|[?!./\\]/g, ' ');

  const instance = min.instance;
  const contentLocale = min.core.getParam<string>(
    min.instance,
    'Default Content Language',
    GBConfigService.get('DEFAULT_CONTENT_LANGUAGE')
  );
  query = await min.conversationalService.translate(min, query, contentLocale);
  GBLog.info(`Translated query (prompt): ${query}.`);

  // Try simple search first.
  const data = await this.getAnswerByText(instance.instanceId, query.trim());
  if (data) {
    GBLog.info(`Simple SEARCH called.`);

    return { answer: data.answer, questionId: data.question.questionId };
  }

  // Appends the ids of the selected subjects, if any, to narrow the search.
  if (subjects != null) {
    const text = KBService.getSubjectItemsSeparatedBySpaces(subjects);
    if (text) {
      query = `${query} ${text}`;
    }
  }

  let returnedScore = 0;

  // Instance level settings win over the boot instance settings.
  const key = instance.searchKey ? instance.searchKey : GBServer.globals.minBoot.instance.searchKey;
  const host = instance.searchHost ? instance.searchHost : GBServer.globals.minBoot.instance.searchHost;

  // No direct match found, so Search is used. A missing key (null, undefined
  // or empty) skips the search instead of building a client that cannot
  // authenticate.
  if (key && GBConfigService.get('STORAGE_DIALECT') === 'mssql') {
    const client = new SearchClient<any>('https://' + host, 'azuresql-index', {
      key: key
    } as any);

    const results = await client.search(query.substring(0, 499), {
      filter: `instanceId eq ${instance.instanceId} and skipIndex eq false`,
      searchFields: ['content', 'subject1', 'subject2', 'subject3', 'subject4'],
      select: ['instanceId', 'questionId', 'answerId'],
      skip: 0,
      top: 1
    });

    // Searches via Search (Azure Search).
    for await (const result of results.results) {
      returnedScore = result.score;
      if (returnedScore < searchScore) {
        continue;
      }

      const value = await this.getAnswerById(instance.instanceId, result.document.answerId);
      if (value !== null) {
        GBLog.info(`SEARCH WILL BE USED with score: ${returnedScore} > required (searchScore): ${searchScore}`);

        return { answer: value, questionId: result.document.questionId };
      }

      // The index points to an answer that no longer exists in the database.
      GBLog.info(
        `Index problem. SEARCH WILL NOT be used as answerId ${result.document.answerId} was not found in database,
returnedScore: ${returnedScore} < required (searchScore): ${searchScore}`
      );

      return { answer: undefined, questionId: 0 };
    }
  }

  GBLog.info(
    `SEARCH returned LOW level score, calling NLP if any,
returnedScore: ${returnedScore} < required (searchScore): ${searchScore}`
  );

  return await ChatServices.answerByGPT(min, user, pid, query, searchScore, subjects);
}
/**
 * Lists the subjects that are direct children of a given parent subject.
 *
 * @param instanceId Bot instance to query.
 * @param parentId Id of the parent subject.
 * @returns The child subjects.
 */
public async getSubjectItems(instanceId: number, parentId: number): Promise<GuaribasSubject[]> {
  return GuaribasSubject.findAll({
    where: { parentSubjectId: parentId, instanceId: instanceId }
  });
}
/**
 * Lists the questions of an origin, optionally restricted to a subject path.
 *
 * When `subjects` is given, each of the four subject columns must match the
 * internal id of the subject at the same position, or be null when that
 * position is missing or has no internal id.
 *
 * @param instanceId Bot instance to query.
 * @param from Origin of the questions.
 * @param subjects Optional list of subjects (up to four levels are used).
 * @returns The matching questions.
 */
public async getFaqBySubjectArray(instanceId: number, from: string, subjects: any): Promise<GuaribasQuestion[]> {
  // Without subjects only origin and instance are filtered.
  if (!subjects) {
    return await GuaribasQuestion.findAll({
      where: { from: from, instanceId: instanceId }
    });
  }

  // Internal id of the subject at a given level, or null when not available.
  // tslint:disable-next-line: no-null-keyword
  const idAt = (level: number) => (subjects[level] && subjects[level].internalId ? subjects[level].internalId : null);

  const filter = {
    from: from,
    subject1: idAt(0),
    subject2: idAt(1),
    subject3: idAt(2),
    subject4: idAt(3),
    instanceId: instanceId
  };

  return await GuaribasQuestion.findAll({ where: filter });
}
/**
 * Lists the questions of an instance whose origin is 'group'.
 *
 * @param instanceId Bot instance to query.
 * @returns The matching questions.
 */
public static async getGroupReplies(instanceId: number): Promise<GuaribasQuestion[]> {
  const filter = { from: 'group', instanceId: instanceId };
  const replies = await GuaribasQuestion.findAll({ where: filter });

  return replies;
}
/**
 * Imports the questions and answers of a tabular (.xlsx) knowledge base file.
 *
 * The first five cells of each row are read as: subjects (dot separated
 * hierarchy), from, to, question and answer. The answer cell may hold the
 * text itself, the name of a .md/.docx file in the sibling 'articles'
 * folder, or BASIC code (prefixed by '/basic' or containing TALK "...")
 * which is saved as a dialog and loaded.
 *
 * @param filePath Path to the .xlsx file.
 * @param min Bot instance runtime.
 * @param packageId Package that owns the imported entities.
 * @returns The questions created in the store.
 */
public async importKbTabularFile(
  filePath: string,
  min: GBMinInstance,
  packageId: number
): Promise<GuaribasQuestion[]> {
  GBLog.info(`Now reading file ${filePath}...`);
  const workbook = new Excel.Workbook();
  const data = await workbook.xlsx.readFile(filePath);

  // Finds a valid worksheet because Excel returns empty slots
  // when loading worksheets collection.
  const worksheet: any = data.worksheets.find(sheet => sheet);
  if (!worksheet) {
    GBLog.info(`[GBImporter] No worksheet found in ${filePath}, nothing to import.`);

    return [];
  }

  const rows = worksheet._rows;
  const answers = [];
  const questions = [];
  GBLog.info(`Processing ${rows.length} rows from tabular file ${filePath}...`);

  await asyncPromise.eachSeries(rows, async line => {
    // Rows missing any of the five expected cells are ignored.
    if (line == undefined || [0, 1, 2, 3, 4].some(index => line._cells[index] === undefined)) {
      return undefined;
    }

    // Extracts values from columns in the current line.
    const subjectsText = line._cells[0].text;
    const from = line._cells[1].text;
    const to = line._cells[2].text;
    const question = line._cells[3].text.trim();
    let answer = line._cells[4].text.trim();

    // Skips the header and rows without question or answer.
    const isHeader = subjectsText === 'subjects' && from === 'from';
    if (isHeader || answer === '' || question === '') {
      return undefined;
    }

    let format = '.txt';

    // Extracts answer from external media if any.
    let media = null;
    if (answer.indexOf('.md') > -1 || answer.indexOf('.docx') > -1) {
      const mediaFilename = urlJoin(path.dirname(filePath), '..', 'articles', answer);
      if (Fs.existsSync(mediaFilename)) {
        if (answer.indexOf('.docx') > -1) {
          // Reads the text of the referenced article; the spreadsheet itself
          // used to be passed here by mistake.
          answer = await this.getTextFromFile(mediaFilename);
        } else {
          // Loads normally markdown file.
          answer = Fs.readFileSync(mediaFilename, 'utf8');
        }
        format = '.md';
        media = path.basename(mediaFilename);
      } else if (answer.indexOf('.md') > -1) {
        GBLog.info(`[GBImporter] File not found: ${mediaFilename}.`);
        answer = '';
      }
    }

    // Processes subjects hierarchy splitting by dots; each level is capped
    // at 63 characters and missing levels stay undefined.
    const [subject1, subject2, subject3, subject4]: string[] = subjectsText
      .split('.')
      .map(level => level.substring(0, 63));

    // Skips blank answers, including the empty string produced above when
    // a referenced markdown file is missing.
    if (!answer || answer.trim() === '') {
      return false;
    }

    // In case of code cell, compiles it and associate with the answer.
    answer = GBVMService.normalizeQuotes(answer);
    const isBasic = answer.toLowerCase().startsWith('/basic');
    if (/TALK\s*\".*\"/gi.test(answer) || isBasic) {
      const code = isBasic ? answer.slice(6) : answer;
      const gbaiPath = DialogKeywords.getGBAIPath(min.botId, `gbdialog`);
      const scriptName = `tmp${GBAdminService.getRndReadableIdentifier()}.docx`;
      const localName = Path.join('work', gbaiPath, `${scriptName}`);
      Fs.writeFileSync(localName, code, { encoding: null });
      answer = scriptName;

      const vm = new GBVMService();
      await vm.loadDialog(Path.basename(localName), Path.dirname(localName), min);
    }

    // Now with all the data ready, creates entities in the store.
    // Answer chaining (prevId/nextId) is disabled, so prevId is always 0:
    // https://github.com/GeneralBots/BotServer/issues/312
    answers.push({
      instanceId: min.instance.instanceId,
      content: answer,
      format: format,
      media: media,
      packageId: packageId,
      prevId: 0
    });
    questions.push({
      from: from,
      to: to,
      subject1: subject1,
      subject2: subject2,
      subject3: subject3,
      subject4: subject4,
      content: question.replace(/["]+/g, ''),
      instanceId: min.instance.instanceId,
      // A question starting with a double quote is kept out of the search index.
      skipIndex: question.charAt(0) === '"',
      packageId: packageId
    });

    return true;
  });

  // Answers are created first so each question can point to its answer;
  // both arrays were filled in the same order.
  const answersCreated = await GuaribasAnswer.bulkCreate(answers);
  questions.forEach((question, index) => {
    question.answerId = answersCreated[index].answerId;
  });

  return await GuaribasQuestion.bulkCreate(questions);
}
/**
 * Sends an answer to the user, choosing the player by the kind of content:
 * video (.mp4), office documents, PDF, markdown, audio (.ogg) or plain text.
 *
 * @param min Bot instance runtime.
 * @param channel Channel the user is talking through.
 * @param step Current dialog step.
 * @param answer Either the answer text or an answer entity with `content`
 * and `format`.
 */
public async sendAnswer(min: GBMinInstance, channel: string, step: GBDialogStep, answer) {
  // Keeps text, format and entity apart. Overwriting `answer` with its text
  // made `answer.format` always undefined (markdown was never played) and
  // handed a plain string to the players that read `answer.content`.
  const isText = typeof answer === 'string';
  const text: string = isText ? answer : answer.content;
  const format = isText ? undefined : answer.format;
  const entity: GuaribasAnswer = isText ? <GuaribasAnswer>{ content: text } : answer;

  const isOfficeDocument = ['.ppt', '.pptx', '.doc', '.docx', '.xls', '.xlsx'].some(extension =>
    text.endsWith(extension)
  );

  if (text.endsWith('.mp4')) {
    await this.playVideo(min, min.conversationalService, step, entity, channel);
  } else if (isOfficeDocument) {
    const path = DialogKeywords.getGBAIPath(min.botId, `gbkb`);
    const doc = urlJoin(GBServer.globals.publicAddress, 'kb', path, 'assets', text);
    const url = `http://view.officeapps.live.com/op/view.aspx?src=${doc}`;
    await this.playUrl(min, min.conversationalService, step, url, channel);
  } else if (text.endsWith('.pdf')) {
    const path = DialogKeywords.getGBAIPath(min.botId, `gbkb`);
    const url = urlJoin('kb', path, 'assets', text);
    await this.playUrl(min, min.conversationalService, step, url, channel);
  } else if (format === '.md') {
    // NOTE(review): playMarkdown is assumed to take the markdown text as its
    // second argument — confirm against the conversational service.
    await min.conversationalService['playMarkdown'](min, text, channel, step, GBMinService.userMobile(step));
  } else if (text.endsWith('.ogg') && process.env.AUDIO_DISABLED !== 'true') {
    await this.playAudio(min, entity, channel, step, min.conversationalService);
  } else {
    await min.conversationalService.sendText(min, step, text);
    await min.conversationalService.sendEvent(min, step, 'stop', undefined);
  }
}
/**
 * Stores a question/answer pair created by a dialog, attached to a package
 * found for the instance.
 *
 * @param min Bot instance runtime.
 * @param questionText Text of the question; double quotes are removed.
 * @param answerText Text of the answer, stored with the '.txt' format.
 */
public async addQA(min, questionText, answerText) {
  const instanceId = min.instance.instanceId;
  const pkg = await GuaribasPackage.findOne({
    where: { instanceId: instanceId }
  });

  const question = {
    from: 'autodialog',
    to: '',
    subject1: '',
    subject2: '',
    subject3: '',
    subject4: '',
    content: questionText.replace(/["]+/g, ''),
    instanceId: instanceId,
    skipIndex: false,
    packageId: pkg.packageId
  };

  // The answer is created first so the question can reference its id.
  const createdAnswer = await GuaribasAnswer.create({
    instanceId: instanceId,
    content: answerText,
    format: '.txt',
    media: null,
    packageId: pkg.packageId,
    prevId: 0
  });
  question['answerId'] = createdAnswer.answerId;

  await GuaribasQuestion.create(question);
}
/**
 * Imports a whole knowledge base package: subjects, tabular files, the
 * remaining articles and the documents in the docs folder, in this order.
 *
 * @param min Bot instance runtime.
 * @param localPath Path to the package folder.
 * @param packageStorage Package record that owns the imported entities.
 * @param instance Bot instance.
 * @returns The result of the docs import.
 */
public async importKbPackage(
  min: GBMinInstance,
  localPath: string,
  packageStorage: GuaribasPackage,
  instance: IGBInstance
): Promise<any> {
  const packageId = packageStorage.packageId;

  // Imports subjects tree into database when either source file exists.
  const subjectFile = urlJoin(localPath, 'subjects.json');
  const menuFile = urlJoin(localPath, 'menu.xlsx');
  const hasSubjects = Fs.existsSync(subjectFile) || Fs.existsSync(menuFile);
  if (hasSubjects) {
    await this.importSubjectFile(packageId, subjectFile, menuFile, instance);
  }

  // Import tabular files in the tabular directory.
  await this.importKbTabularDirectory(localPath, min, packageId);

  // Import remaining .md files in articles directory.
  await this.importRemainingArticles(localPath, instance, packageId);

  // Import docs files in .docx directory.
  const imported = await this.importDocs(min, localPath, instance, packageId);

  return imported;
}
2018-09-09 18:11:41 -03:00
/**
 * Imports the files of the articles folder that were not referenced by
 * tabular files.
 *
 * - .md files with no stored answer for their name are stored as answers.
 * - .docx files are converted to HTML and split into question/answer pairs:
 *   a heading (h1-h6) starts a question and the elements up to the next
 *   heading form its answer.
 *
 * @param localPath Path to the package folder.
 * @param instance Bot instance.
 * @param packageId Package that owns the imported entities.
 */
public async importRemainingArticles(localPath: string, instance: IGBInstance, packageId: number): Promise<any> {
  const files = await walkPromise(urlJoin(localPath, 'articles'));

  await CollectionUtil.asyncForEach(files, async file => {
    if (file === null) {
      return;
    }

    if (file.name.endsWith('.md')) {
      const stored = await this.getAnswerTextByMediaName(instance.instanceId, file.name);
      if (stored === null) {
        const fullFilename = urlJoin(file.root, file.name);
        await GuaribasAnswer.create(<GuaribasAnswer>{
          instanceId: instance.instanceId,
          content: Fs.readFileSync(fullFilename, 'utf-8'),
          format: '.md',
          media: file.name,
          packageId: packageId,
          prevId: 0 // https://github.com/GeneralBots/BotServer/issues/312
        });
      }

      return;
    }

    if (!file.name.endsWith('.docx')) {
      return;
    }

    const gbaiPath = DialogKeywords.getGBAIPath(instance.botId, `gbkb`);
    const localName = Path.join('work', gbaiPath, 'articles', file.name);
    const buffer = Fs.readFileSync(localName, { encoding: null });

    // NOTE(review): convertImage is sent inside the input object, as before;
    // confirm mammoth honours it there rather than in its options argument.
    const options = {
      buffer: buffer,
      convertImage: async image => {
        // Saves each embedded image in the bot cache and links it by URL.
        const imageName = Path.join(
          'work',
          DialogKeywords.getGBAIPath(instance.botId),
          'cache',
          `img-docx${GBAdminService.getRndReadableIdentifier()}.png`
        );
        const url = urlJoin(
          GBServer.globals.publicAddress,
          DialogKeywords.getGBAIPath(instance.botId).replace(/\.[^/.]+$/, ''),
          'cache',
          Path.basename(imageName)
        );
        const imageBuffer = await image.read();
        Fs.writeFileSync(imageName, imageBuffer, { encoding: null });

        return { src: url };
      }
    };

    const html = await mammoth.convertToHtml(options);
    const root = parse(html.value);

    // Pairs are collected per file. The accumulator used to be shared by all
    // files and persisted again for each one, duplicating earlier entries.
    const questions = [];
    const answers = [];

    // Only h1..h6 are headings; a plain "starts with h, two characters"
    // check also matched <hr>.
    const isHeader = node => /^h[1-6]$/i.test(node.rawTagName || '');

    let pendingQuestion: string = null;
    let pendingContent = '';

    // Stores the pending pair. A question with no content is dropped so
    // questions and answers always stay aligned.
    const flush = () => {
      if (pendingQuestion !== null && pendingContent !== '') {
        questions.push({
          from: 'document',
          to: '',
          subject1: '',
          subject2: '',
          subject3: '',
          subject4: '',
          content: pendingQuestion.replace(/["]+/g, ''),
          instanceId: instance.instanceId,
          skipIndex: false,
          packageId: packageId
        });
        answers.push({
          instanceId: instance.instanceId,
          content: pendingContent,
          format: '.html',
          media: file.name,
          packageId: packageId,
          prevId: 0
        });
      }
      pendingQuestion = null;
      pendingContent = '';
    };

    // Walks the top level nodes iteratively; they are all same level nodes.
    for (let el: any = root.firstChild; el; el = el.nextSibling) {
      if (typeof el.innerHTML !== 'string') {
        continue;
      }

      if (pendingQuestion === null || isHeader(el)) {
        // A heading (or the very first element) opens a new question.
        flush();
        pendingQuestion = el.innerHTML;
      } else {
        // Everything else is content for the open question. The content of
        // ALL elements is kept, not just the last one before the next heading.
        pendingContent += el.innerHTML;
      }
    }
    flush();

    // Persist to storage.
    if (answers.length > 0) {
      const answersCreated = await GuaribasAnswer.bulkCreate(answers);
      questions.forEach((question, index) => {
        question.answerId = answersCreated[index].answerId;
      });
      await GuaribasQuestion.bulkCreate(questions);
    }
  });
}
/**
 * Loads every file found in the 'docs' folder of the package, splits it into
 * chunks and adds the chunks to the vector store kept on `min`, saving the
 * store after each file.
 *
 * @param min Bot instance runtime; `vectorStore` and `vectorStorePath` are read from it.
 * @param localPath Path to the package folder.
 * @param instance Bot instance (not used by this method).
 * @param packageId Package id (not used by this method).
 */
public async importDocs(
  min: GBMinInstance,
  localPath: string,
  instance: IGBInstance,
  packageId: number
): Promise<any> {
  const docsFolder = urlJoin(localPath, 'docs');
  const files = await walkPromise(docsFolder);

  // Nothing to import when the folder has no files.
  if (!files[0]) {
    GBLog.info(
      `[GBDeployer] docs folder not created yet in .gbkb. To use Reading Comprehension, create this folder at root and put a document to get read by the.`
    );

    return;
  }

  await CollectionUtil.asyncForEach(files, async file => {
    const chunks = await this.loadAndSplitFile(Path.join(file.root, file.name));

    // Flattens one level, in case the loader returned nested arrays.
    const flattened = chunks.reduce((all, chunk) => all.concat(chunk), []);

    const store = min['vectorStore'];
    await store.addDocuments(flattened);
    await store.save(min['vectorStorePath']);
  });
}
// Splitter used by loadAndSplitFile for every supported format except
// markdown: chunks of 700 with an overlap of 50.
defaultRecursiveCharacterTextSplitter = new RecursiveCharacterTextSplitter({
chunkSize: 700,
chunkOverlap: 50,
});
// Splitter used by loadAndSplitFile for '.md' files; same chunk settings,
// built with the 'markdown' language preset.
markdownRecursiveCharacterTextSplitter = RecursiveCharacterTextSplitter.fromLanguage('markdown', {
chunkSize: 700,
chunkOverlap: 50,
});
/**
 * Loads a document with the loader that matches its extension and splits it
 * into chunks.
 *
 * Supported extensions (matched case-insensitively): .json, .txt, .md, .pdf,
 * .docx, .csv and .epub. Markdown uses the markdown splitter, everything
 * else the default one.
 *
 * @param filePath Path to the file to load.
 * @returns The chunks produced by the splitter.
 * @throws Error when the extension is not supported.
 */
private async loadAndSplitFile(filePath: string): Promise<Document<Record<string, unknown>>[]> {
  const fileExtension = path.extname(filePath);
  let loader;
  let splitter = this.defaultRecursiveCharacterTextSplitter;

  // Lower-cased so files such as 'Manual.PDF' are accepted as well.
  switch (fileExtension.toLowerCase()) {
    case '.json':
      loader = new JSONLoader(filePath);
      break;
    case '.txt':
      loader = new TextLoader(filePath);
      break;
    case '.md':
      loader = new TextLoader(filePath);
      splitter = this.markdownRecursiveCharacterTextSplitter;
      break;
    case '.pdf':
      loader = new PDFLoader(filePath, { splitPages: false });
      break;
    case '.docx':
      loader = new DocxLoader(filePath);
      break;
    case '.csv':
      loader = new CSVLoader(filePath);
      break;
    case '.epub':
      loader = new EPubLoader(filePath, { splitChapters: false });
      break;
    default:
      throw new Error(`Unsupported file extension: ${fileExtension}`);
  }

  const documents: Document<Record<string, unknown>>[] = await loader.loadAndSplit(splitter);

  return documents;
}
/**
 * Imports every .xlsx file found under the given folder as a tabular
 * knowledge base file.
 *
 * @param localPath Folder to walk.
 * @param min Bot instance runtime.
 * @param packageId Package that owns the imported entities.
 */
public async importKbTabularDirectory(localPath: string, min: GBMinInstance, packageId: number): Promise<any> {
  const files = await walkPromise(localPath);

  await CollectionUtil.asyncForEach(files, async file => {
    const isSpreadsheet = file !== null && file.name.endsWith('.xlsx');
    if (!isSpreadsheet) {
      return undefined;
    }

    const spreadsheetPath = urlJoin(file.root, file.name);

    return await this.importKbTabularFile(spreadsheetPath, min, packageId);
  });
}
/**
 * Imports the subjects tree of a package into the store.
 *
 * The tree comes from menu.xlsx when that file exists, otherwise from the
 * subjects JSON file. In the spreadsheet, the column of the first filled
 * cell (within the first four columns) gives the level of the item, and the
 * cell to its right is the description.
 *
 * @param packageId Package that owns the subjects.
 * @param filename Path to the subjects JSON file.
 * @param menuFile Path to the menu.xlsx file.
 * @param instance Bot instance.
 * @returns The result of the recursive creation of the subjects.
 */
public async importSubjectFile(
  packageId: number,
  filename: string,
  menuFile: string,
  instance: IGBInstance
): Promise<any> {
  let subjectsLoaded;

  // The path is always given by the caller, so the file has to be checked:
  // otherwise the JSON file was never used and a missing menu.xlsx failed.
  if (menuFile && Fs.existsSync(menuFile)) {
    // Loads menu.xlsx and finds worksheet.
    const workbook = new Excel.Workbook();
    const data = await workbook.xlsx.readFile(menuFile);
    const worksheet: any = data.worksheets.find(sheet => sheet);

    const MAX_LEVEL = 4; // Max column level to reach menu items in plan.
    // NOTE(review): only the first 24 rows are read, as before — confirm the
    // limit is intended. A copy is taken so the worksheet is not mutated.
    const MAX_ROWS = 24;
    const rows = worksheet ? worksheet._rows.slice(0, MAX_ROWS) : [];

    const subjects = { children: [] };
    let lastLevel = 0;
    let childrenNode = subjects.children;
    let activeObj = null;
    const activeChildrenGivenLevel = [childrenNode];

    // Iterates over all items.
    for (const row of rows) {
      if (!row) {
        continue;
      }

      // Detect menu level by skipping blank cells on left.
      let menu;
      let level;
      for (level = 0; level < MAX_LEVEL; level++) {
        const cell = row._cells[level];
        if (cell && cell.text) {
          menu = cell.text;
          break;
        }
      }

      // Rows with no text in the menu columns are not menu items.
      if (menu === undefined) {
        continue;
      }

      // Tree hierarchy calculation.
      if (level > lastLevel) {
        // Goes deeper: the previous item becomes the parent, when there is one.
        if (activeObj !== null) {
          childrenNode = activeObj.children;
        }
      } else if (level < lastLevel) {
        // Goes back: reuses the children list recorded for that level, or
        // the root when that level was never visited.
        childrenNode = activeChildrenGivenLevel[level] || subjects.children;
      }

      // Keeps the record of the last sub-root of each level, so the walk can
      // go back more than one level at once (e.g. return to the main menu).
      activeChildrenGivenLevel[level] = childrenNode;

      // Insert the object into JSON.
      const descriptionCell = row._cells[level + 1];
      activeObj = {
        title: menu,
        description: descriptionCell ? descriptionCell.text : null,
        id: menu,
        children: []
      };
      childrenNode.push(activeObj);
      lastLevel = level;
    }

    subjectsLoaded = subjects;
  } else {
    subjectsLoaded = JSON.parse(Fs.readFileSync(filename, 'utf8'));
  }

  // Creates each subject and then, recursively, its children.
  const doIt = async (subjects: GuaribasSubject[], parentSubjectId: number) => {
    return asyncPromise.eachSeries(subjects, async item => {
      const value = await GuaribasSubject.create(<GuaribasSubject>{
        internalId: item.id,
        parentSubjectId: parentSubjectId,
        instanceId: instance.instanceId,
        from: item.from,
        to: item.to,
        title: item.title,
        description: item.description,
        packageId: packageId
      });

      if (item.children) {
        return doIt(item.children, value.subjectId);
      } else {
        return item;
      }
    });
  };

  // A tree without a children list has nothing to import.
  return doIt(subjectsLoaded.children || [], undefined);
}
/**
 * Removes the questions, answers and subjects of a package from the store,
 * in this order, and then the package record itself.
 *
 * @param instance Bot instance that owns the package.
 * @param deployer Deployer (not used by this method).
 * @param packageId Package to remove.
 */
public async undeployKbFromStorage(instance: IGBInstance, deployer: GBDeployer, packageId: number) {
  // A fresh filter object is built for each query.
  const packageScope = () => ({ instanceId: instance.instanceId, packageId: packageId });

  await GuaribasQuestion.destroy({ where: packageScope() });
  await GuaribasAnswer.destroy({ where: packageScope() });
  await GuaribasSubject.destroy({ where: packageScope() });

  await this.undeployPackageFromStorage(instance, packageId);
}
2018-04-21 02:59:30 -03:00
/**
 * Registers in the bot NER engine the entities declared in question texts.
 *
 * Questions whose content contains `name(category)` are parsed: the text
 * between parentheses is the category and the word right before the opening
 * parenthesis is the entity name. The 'number' category also registers a
 * regular expression entity matching digits.
 *
 * @param min Bot instance runtime; the engine is read from `min['nerEngine']`.
 */
public static async RefreshNER(min: GBMinInstance) {
  const questions = await KBService.getQuestionsNER(min.instance.instanceId);
  const contentLocale = min.core.getParam<string>(
    min.instance,
    'Default Content Language',
    GBConfigService.get('DEFAULT_CONTENT_LANGUAGE')
  );

  await CollectionUtil.asyncForEach(questions, async question => {
    const text = question.content;
    const categoryReg = /.*\((.*)\).*/gi.exec(text);
    const nameReg = /(\w+)\(.*\).*/gi.exec(text);

    if (!categoryReg) {
      return;
    }

    const category = categoryReg[1];
    if (category === 'number') {
      // A real digit pattern is registered; the previous '/d+/gi' string had
      // no backslash, so it described runs of the letter "d".
      // NOTE(review): the language stays hard-coded as 'pt' — confirm
      // whether contentLocale should be used instead.
      min['nerEngine'].addRegexEntity('number', 'pt', /\d+/gi);
    }

    if (nameReg) {
      const name = nameReg[1];
      min['nerEngine'].addNamedEntityText(category, name, [contentLocale], [name]);
    }
  });
}
2018-04-21 02:59:30 -03:00
/**
 * Deploys a knowledge base to the storage using the .gbkb format.
 *
 * Imports the package, mounts its assets, rebuilds the search index,
 * refreshes the group replies cache and the NER entities, and finally saves
 * the server side rendered HTML of the bot.
 *
 * @param core Core service used to load the bot instance.
 * @param deployer Deployer used to register the package and rebuild the index.
 * @param localPath Path to the .gbkb folder.
 * @param min Bot instance runtime.
 */
public async deployKb(core: IGBCoreService, deployer: GBDeployer, localPath: string, min: GBMinInstance) {
  const packageName = Path.basename(localPath);
  const instance = await core.loadInstanceByBotId(min.botId);
  GBLog.info(`[GBDeployer] Importing: ${localPath}`);

  const p = await deployer.deployPackageToStorage(instance.instanceId, packageName);
  await this.importKbPackage(min, localPath, p, instance);
  GBDeployer.mountGBKBAssets(packageName, min.botId, localPath);

  // Instance level index name wins over the boot instance one.
  const service = await AzureDeployerService.createInstance(deployer);
  const searchIndex = instance.searchIndex ? instance.searchIndex : GBServer.globals.minBoot.instance.searchIndex;
  await deployer.rebuildIndex(instance, service.getKBSearchSchema(searchIndex));

  min['groupCache'] = await KBService.getGroupReplies(instance.instanceId);
  await KBService.RefreshNER(min);

  GBLog.info(`[GBDeployer] Start Bot Server Side Rendering... ${localPath}`);
  const html = await GBSSR.getHTML(min);

  // PWD is not defined on every platform (e.g. Windows shells), and
  // Path.join throws on undefined, so the current working directory is the
  // fallback.
  const baseDir = process.env.PWD || process.cwd();
  const htmlPath = Path.join(baseDir, 'work', DialogKeywords.getGBAIPath(min.botId, `gbui`), 'index.html');
  GBLogEx.info(min, `[GBDeployer] Saving SSR HTML in ${htmlPath}.`);
  Fs.writeFileSync(htmlPath, html, 'utf8');

  GBLog.info(`[GBDeployer] Finished import of ${localPath}`);
}
2020-12-31 15:36:19 -03:00
/**
 * Sends the audio referenced by an answer to the user.
 *
 * @param min Bot instance runtime.
 * @param answer Answer whose content is passed to the audio sender.
 * @param channel Channel the user is talking through (not used here).
 * @param step Current dialog step.
 * @param conversationalService Service used to send the audio.
 */
private async playAudio(
  min: GBMinInstance,
  answer: GuaribasAnswer,
  channel: string,
  step: GBDialogStep,
  conversationalService: IGBConversationalService
) {
  // Awaited so a failure reaches the caller instead of becoming an
  // unhandled rejection.
  await conversationalService.sendAudio(min, step, answer.content);
}
2020-12-31 15:36:19 -03:00
/**
 * Shows a URL to the user: as a file on WhatsApp, or as a 'play' event with
 * the 'url' player type on other channels.
 *
 * @param min Bot instance runtime.
 * @param conversationalService Service used to send the event.
 * @param step Current dialog step.
 * @param url URL to show.
 * @param channel Channel the user is talking through.
 */
private async playUrl(
  min,
  conversationalService: IGBConversationalService,
  step: GBDialogStep,
  url: string,
  channel: string
) {
  if (channel === 'whatsapp') {
    await min.conversationalService.sendFile(min, step, null, url, '');

    return;
  }

  const payload = { playerType: 'url', data: url };
  await conversationalService.sendEvent(min, step, 'play', payload);
}
/**
 * Plays the video referenced by an answer: sent as a file on WhatsApp, or as
 * a 'play' event with the 'video' player type on other channels.
 *
 * @param min Bot instance runtime.
 * @param conversationalService Service used to send the event.
 * @param step Current dialog step.
 * @param answer Answer whose content is the video file name.
 * @param channel Channel the user is talking through.
 */
private async playVideo(
  min,
  conversationalService: IGBConversationalService,
  step: GBDialogStep,
  answer: GuaribasAnswer,
  channel: string
) {
  if (channel === 'whatsapp') {
    await min.conversationalService.sendFile(min, step, null, answer.content, '');

    return;
  }

  // The video is addressed inside the 'videos' folder of the bot .gbkb path.
  const gbkbPath = DialogKeywords.getGBAIPath(min.botId, `gbkb`);
  const videoUrl = urlJoin(gbkbPath, 'videos', answer.content);
  await conversationalService.sendEvent(min, step, 'play', {
    playerType: 'video',
    data: videoUrl
  });
}
2020-12-31 15:36:19 -03:00
/**
 * Removes a package record from the store.
 *
 * @param instance Bot instance that owns the package.
 * @param packageId Package to remove.
 */
private async undeployPackageFromStorage(instance: any, packageId: number) {
  const filter = { instanceId: instance.instanceId, packageId: packageId };

  await GuaribasPackage.destroy({ where: filter });
}
/**
 * Asks the General Bots models server to answer a question from a document.
 *
 * The question and the key go in the query string and the document goes in
 * the request body as the 'content' form field.
 *
 * @param instanceId Bot instance (not used by this method).
 * @param doc Text of the document to read.
 * @param question Question to answer from the document.
 * @returns The response of the request.
 */
public async readComprehension(instanceId: number, doc: string, question: string) {
  // The '?' separator is required: without it the parameters were glued to
  // the path, producing '/reading-comprehensionquestion=...'.
  const params = new URLSearchParams({ question: question, key: process.env.GBMODELS_KEY });
  const url = `http://${process.env.GBMODELS_SERVER}/reading-comprehension?${params.toString()}`;

  const form = new FormData();
  form.append('content', doc);

  // A request with a body cannot use the default GET method, so POST is set.
  // NOTE(review): POST is assumed to be what the models server expects — confirm.
  const options = {
    method: 'POST',
    body: form
  };

  GBLog.info(`[General Bots Models]: ReadComprehension for ${question}.`);

  return await fetch(url, options);
}
/**
 * Extracts the text of a file with textract, preserving line breaks.
 *
 * @param filename Path to the file to read.
 * @returns The extracted text; the promise is rejected with the textract error.
 */
private async getTextFromFile(filename: string) {
  // The executor is not async: an exception thrown synchronously by textract
  // now rejects the promise instead of leaving it pending forever.
  return new Promise<string>((resolve, reject) => {
    textract.fromFileWithPath(filename, { preserveLineBreaks: true }, (error, text) => {
      if (error) {
        reject(error);
      } else {
        resolve(text);
      }
    });
  });
}
2018-04-21 02:59:30 -03:00
}