2018-04-21 02:59:30 -03:00
/ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * \
2024-01-09 17:40:48 -03:00
| █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ ® |
| █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ |
| █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ |
| █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ |
| █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ █ |
2018-04-21 02:59:30 -03:00
| |
2024-08-17 20:30:00 -03:00
| General Bots Copyright ( c ) pragmatismo . cloud . All rights reserved . |
2018-04-21 02:59:30 -03:00
| Licensed under the AGPL - 3.0 . |
2018-11-11 19:09:18 -02:00
| |
2018-04-21 02:59:30 -03:00
| According to our dual licensing model , this program can be used either |
| under the terms of the GNU Affero General Public License , version 3 , |
| or under a proprietary license . |
| |
| The texts of the GNU Affero General Public License with an additional |
| permission and of our proprietary license can be found at and |
| in the LICENSE file you have received along with this program . |
| |
| This program is distributed in the hope that it will be useful , |
2018-09-11 19:40:53 -03:00
| but WITHOUT ANY WARRANTY , without even the implied warranty of |
2018-04-21 02:59:30 -03:00
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the |
| GNU Affero General Public License for more details . |
| |
2024-08-17 20:30:00 -03:00
| "General Bots" is a registered trademark of pragmatismo . cloud . |
2018-04-21 02:59:30 -03:00
| The licensing of the program under the AGPLv3 does not imply a |
| trademark license . Therefore any rights , title and interest in |
| our trademarks remain entirely with us . |
| |
\ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * /
2018-11-27 22:56:11 -02:00
/ * *
* @fileoverview Knowledge base services and logic .
* /
2022-11-19 19:50:19 -03:00
import { SearchClient } from '@azure/search-documents' ;
2024-09-08 16:48:26 -03:00
import asyncPromise from 'async-promises' ;
2022-11-18 22:39:14 -03:00
import Excel from 'exceljs' ;
2024-09-08 16:48:26 -03:00
import fs from 'fs/promises' ;
import html2md from 'html-to-md' ;
2024-01-29 21:04:53 -03:00
import { JSONLoader } from 'langchain/document_loaders/fs/json' ;
import { TextLoader } from 'langchain/document_loaders/fs/text' ;
2024-09-08 16:48:26 -03:00
import path from 'path' ;
import getSlug from 'speakingurl' ;
import urlJoin from 'url-join' ;
import walkPromise from 'walk-promise' ;
import { GBServer } from '../../../src/app.js' ;
import { CSVLoader } from '@langchain/community/document_loaders/fs/csv' ;
2024-05-25 19:11:01 -03:00
import { DocxLoader } from '@langchain/community/document_loaders/fs/docx' ;
import { EPubLoader } from '@langchain/community/document_loaders/fs/epub' ;
2024-09-08 16:48:26 -03:00
import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf' ;
2024-11-08 06:49:30 -03:00
import { rimraf } from 'rimraf' ;
2024-09-06 15:30:03 -03:00
2024-05-17 19:19:58 -03:00
import getColors from 'get-image-colors' ;
2024-09-08 16:48:26 -03:00
import { Document } from 'langchain/document' ;
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter' ;
import puppeteer , { Page } from 'puppeteer' ;
2024-11-26 09:35:15 -03:00
import { Jimp } from 'jimp' ;
2020-08-26 17:50:42 -03:00
import {
GBDialogStep ,
GBLog ,
2020-12-31 15:36:19 -03:00
GBMinInstance ,
2020-08-26 17:50:42 -03:00
IGBConversationalService ,
IGBCoreService ,
IGBInstance ,
2020-12-31 15:36:19 -03:00
IGBKBService
2020-08-26 17:50:42 -03:00
} from 'botlib' ;
2024-09-08 16:48:26 -03:00
import mammoth from 'mammoth' ;
import { parse } from 'node-html-parser' ;
import pdf from 'pdf-extraction' ;
2020-12-31 15:36:19 -03:00
import { CollectionUtil } from 'pragmatismo-io-framework' ;
2019-12-31 16:12:06 -03:00
import { Op } from 'sequelize' ;
2018-11-12 12:20:44 -02:00
import { Sequelize } from 'sequelize-typescript' ;
2024-09-08 16:48:26 -03:00
import textract from 'textract' ;
import { GBUtil } from '../../../src/util.js' ;
import { GBAdminService } from '../../admin.gbapp/services/GBAdminService.js' ;
2022-11-18 22:39:14 -03:00
import { AzureDeployerService } from '../../azuredeployer.gbapp/services/AzureDeployerService.js' ;
2024-09-08 16:48:26 -03:00
import { DialogKeywords } from '../../basic.gblib/services/DialogKeywords.js' ;
import { GBVMService } from '../../basic.gblib/services/GBVMService.js' ;
2022-11-18 22:39:14 -03:00
import { GuaribasPackage } from '../../core.gbapp/models/GBModel.js' ;
import { GBDeployer } from '../../core.gbapp/services/GBDeployer.js' ;
2023-02-20 10:29:04 -03:00
import { GBLogEx } from '../../core.gbapp/services/GBLogEx.js' ;
2023-07-09 14:51:46 -03:00
import { GBMinService } from '../../core.gbapp/services/GBMinService.js' ;
2024-09-08 16:48:26 -03:00
import { GBSSR } from '../../core.gbapp/services/GBSSR.js' ;
import { CSService } from '../../customer-satisfaction.gbapp/services/CSService.js' ;
2024-08-23 17:23:22 -03:00
import { ChatServices } from '../../llm.gblib/services/ChatServices.js' ;
2024-09-08 16:48:26 -03:00
import { GuaribasAnswer , GuaribasQuestion , GuaribasSubject } from '../models/index.js' ;
import { GBConfigService } from './../../core.gbapp/services/GBConfigService.js' ;
2024-02-05 12:36:20 -03:00
2019-03-08 19:13:00 -03:00
/ * *
* Result of a query on KB data.
* /
2018-08-28 19:16:29 -03:00
export class KBServiceSearchResults {
  // Matched answer: either the stored answer record or plain answer text.
  public answer: GuaribasAnswer | string;

  // Id of the question associated with the answer.
  public questionId: number;
}
2019-03-08 19:13:00 -03:00
/ * *
* All services related to knowledge base management .
* /
2020-03-30 14:03:12 -03:00
export class KBService implements IGBKBService {
2018-11-12 12:20:44 -02:00
public sequelize : Sequelize ;
2018-09-10 16:24:32 -03:00
2022-11-30 09:40:09 -03:00
/**
 * Creates the service and keeps the given Sequelize connection in the
 * public `sequelize` property.
 *
 * @param sequelize Sequelize connection stored on the service.
 */
constructor(sequelize: Sequelize) {
  this.sequelize = sequelize;
}
2022-11-30 09:40:09 -03:00
/**
 * Builds a display string with the titles of the given subjects.
 *
 * @param subjects Subjects to format; may be null or undefined.
 * @returns The titles joined by ', ', or '' when no subject list is given.
 */
public static getFormattedSubjectItems(subjects: GuaribasSubject[]) {
  // `== null` matches both null and undefined, so a missing list never throws.
  if (subjects == null) {
    return '';
  }

  return subjects.map(subject => subject.title).join(', ');
}
2022-11-30 09:40:09 -03:00
/**
 * Builds a string with the internal ids of the given subjects.
 *
 * @param subjects Subjects to format; may be null or undefined.
 * @returns The internal ids joined by a single space, or '' when no subject
 * list is given.
 */
public static getSubjectItemsSeparatedBySpaces(subjects: GuaribasSubject[]) {
  // `== null` matches both null and undefined, so a missing list never throws.
  if (subjects == null) {
    return '';
  }

  return subjects.map(subject => subject.internalId).join(' ');
}
2022-11-30 09:40:09 -03:00
/**
 * Returns the content of the answer stored for a given media name.
 *
 * @param instanceId Instance the answer belongs to.
 * @param answerMediaName Value matched against the answer's `media` field.
 * @returns The answer content, or null when no such answer exists.
 */
public async getAnswerTextByMediaName(instanceId: number, answerMediaName: string): Promise<string> {
  const found = await GuaribasAnswer.findOne({
    where: { instanceId, media: answerMediaName }
  });
  if (found == null) {
    return null;
  }

  return found.content;
}
2022-11-30 09:40:09 -03:00
/**
 * Finds a single question by its id within an instance.
 *
 * @param instanceId Instance the question belongs to.
 * @param questionId Id of the question.
 * @returns The result of the `findOne` lookup.
 */
public async getQuestionById(instanceId: number, questionId: number): Promise<GuaribasQuestion> {
  const criteria = { instanceId, questionId };

  return GuaribasQuestion.findOne({ where: criteria });
}
2022-11-30 09:40:09 -03:00
/**
 * Finds a single answer by its id within an instance.
 *
 * @param instanceId Instance the answer belongs to.
 * @param answerId Id of the answer.
 * @returns The result of the `findOne` lookup.
 */
public async getAnswerById(instanceId: number, answerId: number): Promise<GuaribasAnswer> {
  const stored = await GuaribasAnswer.findOne({ where: { instanceId, answerId } });

  return stored;
}
2021-01-20 18:23:42 -03:00
/**
 * Finds the questions addressed by a SEO friendly URL.
 *
 * The question id is the text that follows the last '-' of the URL. The bot
 * id is taken from the part of the URL that follows the host.
 *
 * @param core Core service used to load the bot instance.
 * @param url SEO friendly URL.
 * @returns The list returned by `findAll` for that instance and question id.
 */
public async getQuestionIdFromURL(core: IGBCoreService, url: string) {
  // Extracts questionId from URL.
  const lastDash = url.lastIndexOf('-');
  const id = url.slice(lastDash + 1);

  // Extracts botId from URL.
  const urlPattern = /(http[s]?:\/\/)?([^\/\s]+\/)(.*)/gi;
  const botId = url.replace(urlPattern, (_match, _scheme, _host, rest) => {
    const firstSlash = rest.indexOf('/');

    return rest.slice(firstSlash);
  });

  // Finds the associated question.
  const { instanceId } = await core.loadInstanceByBotId(botId);

  return GuaribasQuestion.findAll({ where: { instanceId, questionId: id } });
}
2022-11-30 09:40:09 -03:00
/**
 * Lists the questions of an instance whose content contains a '(' character.
 * NOTE(review): presumably '(' marks a NER placeholder in the question text;
 * confirm against the callers.
 *
 * @param instanceId Instance whose questions are searched.
 * @returns The matching questions.
 */
public static async getQuestionsNER(instanceId: number) {
  return GuaribasQuestion.findAll({
    where: {
      instanceId,
      content: { [Op.like]: `%(%` }
    }
  });
}
2021-01-20 18:23:42 -03:00
2022-11-30 09:40:09 -03:00
/**
 * Builds one SEO friendly slug per question of an instance.
 *
 * Each entry is the slug of the question content followed by '-' and the
 * position of the question in the query result.
 * NOTE(review): the suffix is the list position, not the questionId that
 * getQuestionIdFromURL looks up; confirm this is intended.
 *
 * @param instanceId Instance whose questions are listed.
 * @returns The generated slugs.
 */
public async getQuestionsSEO(instanceId: number) {
  const questions = await GuaribasQuestion.findAll({ where: { instanceId } });

  return questions.map((question, position) => `${getSlug(question.content)}-${position}`);
}
2022-11-30 09:40:09 -03:00
/**
 * Lists the answers of an instance stored with the '.docx' format.
 *
 * @param instanceId Instance whose answers are listed.
 * @returns The matching answers.
 */
public async getDocs(instanceId: number) {
  const criteria = { instanceId, format: '.docx' };
  const docs = await GuaribasAnswer.findAll({ where: criteria });

  return docs;
}
2022-11-30 09:40:09 -03:00
/**
 * Finds a question, and its answer, directly from a piece of text.
 *
 * Lookups are tried in order until one yields a question:
 * 1. the alternate texts known to CSService;
 * 2. a LIKE match on the question content (optionally restricted by `from`);
 * 3. an exact match on the question content.
 *
 * @param instanceId Instance to search in.
 * @param text Text to look for; it is trimmed before use.
 * @param from Optional value the question's `from` field must match in the
 * LIKE lookup.
 * @returns `{ question, answer }` when a question is found, otherwise undefined.
 */
public async getAnswerByText(instanceId: number, text: string, from: string = null): Promise<any> {
  text = text.trim();

  const csService = new CSService();
  let question = await csService.getQuestionFromAlternateText(instanceId, text);

  if (!question) {
    const likeFilter = {
      instanceId,
      content: { [Op.like]: `%^[\w.]+${text}^[\w.]+%` }
    };
    if (from) {
      likeFilter['from'] = from;
    }
    question = await GuaribasQuestion.findOne({ where: likeFilter });
  }

  if (!question) {
    const exactFilter = {
      instanceId,
      content: { [Op.eq]: `${text}` }
    };
    question = await GuaribasQuestion.findOne({ where: exactFilter });
  }

  if (question === null) {
    return undefined;
  }

  const answer = await GuaribasAnswer.findOne({
    where: { instanceId, answerId: question.answerId }
  });

  return { question, answer };
}
2022-11-30 09:40:09 -03:00
/**
 * Persists a new answer.
 *
 * @param obj Answer data to store.
 * @returns The created answer record.
 */
public async addAnswer(obj: GuaribasAnswer): Promise<GuaribasAnswer> {
  const created = await GuaribasAnswer.create(obj);

  return created;
}
2022-11-30 09:40:09 -03:00
/**
 * Answers a user query using, in order: a direct text lookup, the Azure
 * Search index and finally the LLM.
 *
 * @param min Bot instance the query is addressed to.
 * @param user User passed through to the LLM call.
 * @param step Dialog step; the LLM call reads `step.context.activity['pid']`.
 * @param pid Not read by this method (the pid is taken from `step`).
 * @param query Text typed by the user.
 * @param searchScore Minimum score a search hit needs in order to be used.
 * @param subjects Subjects whose internal ids are appended to the query; may
 * be null or undefined.
 * @returns The answer found, together with the id of its question.
 */
public async ask(
  min: GBMinInstance,
  user,
  step,
  pid,
  query: string,
  searchScore: number,
  subjects: GuaribasSubject[]
): Promise<KBServiceSearchResults> {
  // Builds search query. A global pattern is needed here: String.replace with
  // a string pattern only replaces the first occurrence, which left most
  // punctuation in place.
  query = query.toLowerCase().replace(/\r\n|[?!./\\]/g, ' ');

  const instance = min.instance;
  const contentLocale = min.core.getParam<string>(
    min.instance,
    'Default Content Language',
    GBConfigService.get('DEFAULT_CONTENT_LANGUAGE')
  );
  query = await min.conversationalService.translate(min, query, contentLocale);
  GBLogEx.info(min, `Translated query (prompt): ${query}.`);

  // Try simple search first.
  const data = await this.getAnswerByText(instance.instanceId, query.trim());
  if (data) {
    GBLogEx.info(min, `Simple SEARCH called.`);

    return { answer: data.answer, questionId: data.question.questionId };
  }

  // Narrows the search with the subjects currently selected, if any.
  if (subjects != null) {
    const text = KBService.getSubjectItemsSeparatedBySpaces(subjects);
    if (text) {
      query = `${query} ${text}`;
    }
  }

  let returnedScore = 0;

  // Falls back to the boot instance settings when the bot has none of its own.
  const key = instance.searchKey ? instance.searchKey : GBServer.globals.minBoot.instance.searchKey;
  const host = instance.searchHost ? instance.searchHost : GBServer.globals.minBoot.instance.searchHost;

  // No direct match found, so Search is used. The key must actually be set:
  // an undefined key used to pass the former `!== null` check.
  if (key && GBConfigService.get('STORAGE_DIALECT') === 'mssql') {
    const client = new SearchClient<any>('https://' + host, 'azuresql-index', {
      key: key
    } as any);

    const results = await client.search(query.substring(0, 499), {
      filter: `instanceId eq ${instance.instanceId} and skipIndex eq false`,
      searchFields: ['content', 'subject1', 'subject2', 'subject3', 'subject4'],
      select: ['instanceId', 'questionId', 'answerId'],
      skip: 0,
      top: 1
    });

    // Searches via Search (Azure Search).
    for await (const result of results.results) {
      returnedScore = result.score;
      if (returnedScore < searchScore) {
        continue;
      }

      const value = await this.getAnswerById(instance.instanceId, result.document.answerId);
      if (value !== null) {
        GBLogEx.info(
          min,
          `SEARCH WILL BE USED with score: ${returnedScore} > required (searchScore): ${searchScore}`
        );

        return { answer: value, questionId: result.document.questionId };
      }

      // The index points to an answer that no longer exists in the database.
      GBLogEx.info(
        min,
        `Index problem. SEARCH WILL NOT be used as answerId ${result.document.answerId} was not found in database, returnedScore: ${returnedScore} >= required (searchScore): ${searchScore}`
      );

      return { answer: undefined, questionId: 0 };
    }
  }

  GBLogEx.info(
    min,
    `SEARCH returned LOW level score, calling NLP if any,
    returnedScore: ${returnedScore} < required (searchScore): ${searchScore}`
  );

  return await ChatServices.answerByLLM(step.context.activity['pid'], min, user, query);
}
2022-11-30 09:40:09 -03:00
/**
 * Lists the subjects of an instance that have the given parent subject.
 *
 * @param instanceId Instance the subjects belong to.
 * @param parentId Value matched against `parentSubjectId`.
 * @returns The matching subjects.
 */
public async getSubjectItems(instanceId: number, parentId: number): Promise<GuaribasSubject[]> {
  return GuaribasSubject.findAll({
    where: { parentSubjectId: parentId, instanceId }
  });
}
2022-11-30 09:40:09 -03:00
/**
 * Lists the questions of an instance filtered by origin and subject path.
 *
 * When `subjects` is given, each of the four subject levels must match the
 * internal id of the subject at that position, or be null when there is no
 * subject (or no internal id) at that position. When `subjects` is not given,
 * only `from` and the instance are used as filters.
 *
 * @param instanceId Instance the questions belong to.
 * @param from Value matched against the question's `from` field.
 * @param subjects Optional list of subjects, outermost level first.
 * @returns The matching questions.
 */
public async getFaqBySubjectArray(instanceId: number, from: string, subjects: any): Promise<GuaribasQuestion[]> {
  if (!subjects) {
    return GuaribasQuestion.findAll({ where: { from, instanceId } });
  }

  // Internal id of the subject at a given level, or null when unavailable.
  const internalIdAt = (level: number) => subjects[level]?.internalId || null;

  return GuaribasQuestion.findAll({
    where: {
      from,
      subject1: internalIdAt(0),
      subject2: internalIdAt(1),
      subject3: internalIdAt(2),
      subject4: internalIdAt(3),
      instanceId
    }
  });
}
2022-11-30 09:40:09 -03:00
/**
 * Lists the questions of an instance whose `from` field is 'group'.
 *
 * @param instanceId Instance the questions belong to.
 * @returns The matching questions.
 */
public static async getGroupReplies(instanceId: number): Promise<GuaribasQuestion[]> {
  const criteria = { from: 'group', instanceId };

  return GuaribasQuestion.findAll({ where: criteria });
}
2022-11-30 09:40:09 -03:00
/**
 * Imports questions and answers from a tabular file (.xlsx or .csv).
 *
 * Each data row is expected to carry five columns: subjects (dot separated,
 * up to four levels), from, to, question and answer. An answer naming a .md
 * or .docx file is replaced by the content of that file in the sibling
 * 'articles' folder. An answer holding BASIC code is saved as a script,
 * loaded as a dialog and replaced by the script name.
 *
 * @param filePath Path of the tabular file.
 * @param min Bot instance that receives the imported data.
 * @param packageId Package the created records are associated with.
 * @returns The questions created in the store.
 * @throws Error when the file is neither .xlsx nor .csv.
 */
public async importKbTabularFile(
  filePath: string,
  min: GBMinInstance,
  packageId: number
): Promise<GuaribasQuestion[]> {
  GBLogEx.info(min, `Now reading file ${path.basename(filePath)}...`);

  const workbook = new Excel.Workbook();
  let data;
  if (filePath.endsWith('.xlsx')) {
    data = await workbook.xlsx.readFile(filePath);
  } else if (filePath.endsWith('.csv')) {
    data = await workbook.csv.readFile(filePath);
  } else {
    // Fails with a clear message instead of a TypeError on undefined data.
    throw new Error(`Unsupported tabular file type: ${path.basename(filePath)}.`);
  }

  // Finds a valid worksheet because Excel returns empty slots
  // when loading worksheets collection. When the loaded object already
  // exposes rows it is used directly.
  const worksheet = data._rows ? data : data.worksheets.find(candidate => candidate);
  const rows = worksheet?._rows ?? [];

  const answers: any[] = [];
  const questions: any[] = [];

  GBLogEx.info(min, `Processing ${rows.length} rows from ${path.basename(filePath)}...`);

  await asyncPromise.eachSeries(rows, async line => {
    // Ignores empty slots and rows lacking any of the five columns.
    if (line == null) {
      return undefined;
    }
    const cells = line._cells;
    if ([0, 1, 2, 3, 4].some(column => cells[column] === undefined)) {
      return undefined;
    }

    // Extracts values from columns in the current line.
    const subjectsText = cells[0].text;
    const from = cells[1].text;
    const to = cells[2].text;
    const question = cells[3].text.trim();
    let answer = cells[4].text.trim();

    // Skips the header and rows without a question or an answer.
    const isHeader = subjectsText === 'subjects' && from === 'from';
    if (isHeader || answer === '' || question === '') {
      return undefined;
    }

    let format = '.txt';
    let media = null;

    // Extracts answer from external media if any.
    const isDocx = answer.includes('.docx');
    const isMarkdown = answer.includes('.md');
    if (isMarkdown || isDocx) {
      const mediaFilename = urlJoin(path.dirname(filePath), '..', 'articles', answer);
      if (await GBUtil.exists(mediaFilename)) {
        // The text must come from the media file itself, not from the
        // tabular file being imported.
        answer = isDocx ? await this.getTextFromFile(mediaFilename) : await fs.readFile(mediaFilename, 'utf8');
        format = '.md';
        media = path.basename(mediaFilename);
      } else if (isMarkdown) {
        GBLogEx.info(min, `[GBImporter] File not found: ${mediaFilename}.`);
        answer = '';
      }
    }

    // Processes subjects hierarchy splitting by dots; each level is limited
    // to 63 characters and levels beyond the fourth are ignored.
    const [subject1, subject2, subject3, subject4] = subjectsText
      .split('.')
      .map(level => level.substring(0, 63));

    // Skips blank answers, including the one emptied above when the media
    // file is missing.
    if (!answer || answer.trim() === '') {
      return false;
    }

    // In case of code cell, compiles it and associate with the answer.
    answer = GBVMService.normalizeQuotes(answer);
    const isBasic = answer.toLowerCase().startsWith('/basic');
    if (/TALK\s*\".*\"/gi.test(answer) || isBasic) {
      const code = isBasic ? answer.slice(6) : answer;
      const packagePath = GBUtil.getGBAIPath(min.botId, `gbdialog`);
      const scriptName = `tmp${GBAdminService.getRndReadableIdentifier()}.docx`;
      const localName = path.join('work', packagePath, `${scriptName}`);

      // Awaited so the script is completely written before it is loaded.
      await fs.writeFile(localName, code, { encoding: null });
      answer = scriptName;

      const vm = new GBVMService();
      await vm.loadDialog(path.basename(localName), path.dirname(localName), min);
    }

    // Now with all the data ready, creates entities in the store.
    // Chaining answers through prevId/nextId is disabled, see
    // https://github.com/GeneralBots/BotServer/issues/312, so prevId is 0.
    answers.push({
      instanceId: min.instance.instanceId,
      content: answer,
      format: format,
      media: media,
      packageId: packageId,
      prevId: 0
    });

    questions.push({
      from: from,
      to: to,
      subject1: subject1,
      subject2: subject2,
      subject3: subject3,
      subject4: subject4,
      content: question.replace(/["]+/g, ''),
      instanceId: min.instance.instanceId,
      // A question starting with a double quote is flagged with skipIndex.
      skipIndex: question.charAt(0) === '"',
      packageId: packageId
    });

    return true;
  });

  // Answers are created first so that each question can reference the id of
  // the answer created at the same position.
  const answersCreated = await GuaribasAnswer.bulkCreate(answers);
  questions.forEach((question, index) => {
    question.answerId = answersCreated[index].answerId;
  });

  return await GuaribasQuestion.bulkCreate(questions);
}
2024-01-30 19:21:04 -03:00
/**
 * Sends an answer to the user, choosing how to present it from its content.
 *
 * Videos (.mp4), Office documents, PDFs, markdown answers, audio (.ogg) and
 * image markdown each have their own presentation; anything else is sent as
 * plain text.
 *
 * @param min Bot instance sending the answer.
 * @param channel Channel the answer is sent through.
 * @param step Current dialog step.
 * @param answer Either the answer text or an answer record with `content`
 * and `format`.
 */
public async sendAnswer(min: GBMinInstance, channel: string, step: GBDialogStep, answer) {
  // The format must be read before the answer is reduced to its text,
  // otherwise the markdown branch below can never be taken.
  const format = typeof answer === 'string' ? undefined : answer.format;
  answer = typeof answer === 'string' ? answer : answer.content;

  const officeExtensions = ['.ppt', '.pptx', '.doc', '.docx', '.xls', '.xlsx'];

  if (answer.endsWith('.mp4')) {
    await this.playVideo(min, min.conversationalService, step, answer, channel);
  } else if (officeExtensions.some(extension => answer.endsWith(extension))) {
    const packagePath = GBUtil.getGBAIPath(min.botId, `gbkb`);
    const doc = urlJoin(GBServer.globals.publicAddress, 'kb', packagePath, 'assets', answer);
    const url = `http://view.officeapps.live.com/op/view.aspx?src=${doc}`;
    await this.playUrl(min, min.conversationalService, step, url, channel);
  } else if (answer.endsWith('.pdf')) {
    const packagePath = GBUtil.getGBAIPath(min.botId, `gbkb`);
    const url = urlJoin('kb', packagePath, 'assets', answer);
    await this.playUrl(min, min.conversationalService, step, url, channel);
  } else if (format === '.md') {
    await min.conversationalService['playMarkdown'](min, answer, channel, step, GBMinService.userMobile(step));
  } else if (answer.endsWith('.ogg') && process.env.AUDIO_DISABLED !== 'true') {
    await this.playAudio(min, answer, channel, step, min.conversationalService);
  } else if (answer.startsWith('![')) {
    // Checks for text besides the image markdown: an answer that is only an
    // image is shown as an image, anything else is sent as text.
    const textWithoutImages = answer.replace(/!\[[^\]]*\](?:\([^)]*\)|\[[^\]]*\])/g, '').trim();
    const urlMatch = answer.match(/!?\[.*?\]\((.*?)\)/);

    // Without an inline (url) part there is nothing to show, so the answer
    // is sent as text instead of calling showImage with a null URL.
    if (textWithoutImages || !urlMatch) {
      await min.conversationalService.sendText(min, step, answer);
    } else {
      await this.showImage(min, min.conversationalService, step, urlMatch[1], channel);
    }
  } else {
    await min.conversationalService.sendText(min, step, answer);
  }
}
2019-08-24 12:22:52 -03:00
2023-04-09 19:20:15 -03:00
/**
 * Adds a question and its answer to the package found for the instance.
 *
 * The question is stored with `from` set to 'autodialog' and with empty
 * subjects; double quotes are removed from its text.
 *
 * @param min Bot instance that receives the new records.
 * @param questionText Text of the question.
 * @param answerText Text of the answer.
 * @throws Error when the instance has no package.
 */
public async addQA(min, questionText, answerText) {
  const instanceId = min.instance.instanceId;

  const pkg = await GuaribasPackage.findOne({ where: { instanceId } });
  if (!pkg) {
    // Fails with a clear message instead of a TypeError on pkg.packageId.
    throw new Error(`No package found for instance ${instanceId}; question and answer were not added.`);
  }

  const answer = {
    instanceId: instanceId,
    content: answerText,
    format: '.txt',
    media: null,
    packageId: pkg.packageId,
    prevId: 0
  };
  const createdAnswer = await GuaribasAnswer.create(answer);

  const question = {
    from: 'autodialog',
    to: '',
    subject1: '',
    subject2: '',
    subject3: '',
    subject4: '',
    content: questionText.replace(/["]+/g, ''),
    instanceId: instanceId,
    skipIndex: false,
    packageId: pkg.packageId,
    answerId: createdAnswer.answerId
  };
  await GuaribasQuestion.create(question);
}
2022-11-30 09:40:09 -03:00
/**
 * Imports a whole knowledge base package from a local folder.
 *
 * Steps, in order: subjects (from subjects.json and/or menu.xlsx, when at
 * least one of them exists), tabular files, remaining articles and docs.
 *
 * @param min Bot instance that receives the imported data.
 * @param localPath Folder holding the package files.
 * @param packageStorage Package record; its `packageId` is given to the
 * imported data.
 * @param instance Instance passed to the subject, article and doc imports.
 * @returns The result of the docs import.
 */
public async importKbPackage(
  min: GBMinInstance,
  localPath: string,
  packageStorage: GuaribasPackage,
  instance: IGBInstance
): Promise<any> {
  const { packageId } = packageStorage;

  // Imports subjects tree and menu.xlsx into database, if any of them exists.
  const subjectFile = urlJoin(localPath, 'subjects.json');
  const menuFile = urlJoin(localPath, 'menu.xlsx');

  let hasSubjectSource = await GBUtil.exists(subjectFile);
  if (!hasSubjectSource) {
    hasSubjectSource = await GBUtil.exists(menuFile);
  }
  if (hasSubjectSource) {
    await this.importSubjectFile(packageId, subjectFile, menuFile, instance);
  }

  // Import tabular files in the tabular directory.
  await this.importKbTabularDirectory(localPath, min, packageId);

  // Import remaining .md files in articles directory.
  await this.importRemainingArticles(min, localPath, instance, packageId);

  // Import docs files in .docx directory.
  const docsResult = await this.importDocs(min, localPath, instance, packageId);

  return docsResult;
}
2018-09-09 18:11:41 -03:00
2020-04-02 19:03:57 -03:00
/**
 * Imports the files of the `articles` folder of a .gbkb package into storage.
 *
 * - `.md` files for which `getAnswerTextByMediaName` finds no stored answer
 *   are read from disk and stored as '.md' answers.
 * - `.toc.docx` files are converted to HTML and split into question/answer
 *   pairs: each heading becomes a question and everything up to the next
 *   heading becomes its '.html' answer.
 * - Any other `.docx` file becomes a single answer holding the first loaded
 *   document's text; names ending in `zap.docx` are additionally sent to
 *   `min.whatsAppDirectLine.createOrUpdateTemplate`.
 *
 * Rows collected from .docx files are written once, after all files have been
 * read, so that no row is inserted more than once.
 *
 * @param min Bot instance wrapper (used for the WhatsApp template update).
 * @param localPath Path of the package folder; its `articles` subfolder is walked.
 * @param instance Bot instance record providing `instanceId` and `botId`.
 * @param packageId Package identifier stored on every created row.
 */
public async importRemainingArticles(
  min: GBMinInstance,
  localPath: string,
  instance: IGBInstance,
  packageId: number
): Promise<any> {
  const files = await walkPromise(urlJoin(localPath, 'articles'));

  // Pairs extracted from .toc.docx files. Both arrays are filled together, so
  // questions[i] always belongs to answers[i].
  const data = { questions: [], answers: [] };

  // Answers from plain .docx files; they have no question attached and are kept
  // apart so they cannot shift the question/answer pairing above.
  const documentAnswers = [];

  await CollectionUtil.asyncForEach(files, async file => {
    if (file === null) {
      return;
    }

    if (file.name.endsWith('.md')) {
      const stored = await this.getAnswerTextByMediaName(instance.instanceId, file.name);

      if (stored === null) {
        const fullFilename = urlJoin(file.root, file.name);
        const content = await fs.readFile(fullFilename, 'utf-8');

        await GuaribasAnswer.create(<GuaribasAnswer>{
          instanceId: instance.instanceId,
          content: content,
          format: '.md',
          media: file.name,
          packageId: packageId,
          prevId: 0 // https://github.com/GeneralBots/BotServer/issues/312
        });
      }
    } else if (file.name.endsWith('.toc.docx')) {
      // Must be tested before the generic '.docx' suffix, which also matches
      // these files and would otherwise make this branch unreachable.
      const packagePath = GBUtil.getGBAIPath(instance.botId, `gbkb`);
      const localName = path.join('work', packagePath, 'articles', file.name);
      const buffer = await fs.readFile(localName, { encoding: null });

      // NOTE(review): `convertImage` travels in the same object as `buffer`;
      // confirm that mammoth reads the image converter from there.
      const options = {
        buffer: buffer,
        convertImage: async image => {
          const imagePath = path.join(
            'work',
            GBUtil.getGBAIPath(instance.botId),
            'cache',
            `img-docx${GBAdminService.getRndReadableIdentifier()}.png`
          );
          const url = urlJoin(
            GBServer.globals.publicAddress,
            GBUtil.getGBAIPath(instance.botId).replace(/\.[^/.]+$/, ''),
            'cache',
            path.basename(imagePath)
          );
          const imageBuffer = await image.read();
          await fs.writeFile(imagePath, imageBuffer, { encoding: null });
          return { src: url };
        }
      };

      const html = await mammoth.convertToHtml(options);
      const root = parse(html.value);

      // Two-character tag names starting with 'h' are treated as headings.
      const isHeader = el =>
        typeof el.rawTagName === 'string' && el.rawTagName.startsWith('h') && el.rawTagName.length === 2;

      let question = null;
      let answerHtml = '';

      // Stores the pair collected so far; headings without any content are dropped.
      const flush = () => {
        if (question !== null && answerHtml !== '') {
          data.questions.push(question);
          data.answers.push({
            instanceId: instance.instanceId,
            content: answerHtml,
            format: '.html',
            media: file.name,
            packageId: packageId,
            prevId: 0
          });
        }
        question = null;
        answerHtml = '';
      };

      // All nodes of interest are direct children of the parsed root.
      for (let el = root.firstChild; el; el = el.nextSibling) {
        if (isHeader(el)) {
          flush();
          question = {
            from: 'document',
            to: '',
            subject1: '',
            subject2: '',
            subject3: '',
            subject4: '',
            content: el.innerHTML.replace(/["]+/g, ''),
            instanceId: instance.instanceId,
            skipIndex: false,
            packageId: packageId
          };
        } else if (question !== null) {
          // Everything between two headings is the answer of the first one.
          answerHtml += el.toString();
        }
      }
      flush();
    } else if (file.name.endsWith('.docx')) {
      const packagePath = GBUtil.getGBAIPath(instance.botId, `gbkb`);
      const localName = path.join('work', packagePath, 'articles', file.name);
      const loader = new DocxLoader(localName);
      const doc = await loader.load();
      const content = doc[0].pageContent;

      if (file.name.endsWith('zap.docx')) {
        await min.whatsAppDirectLine.createOrUpdateTemplate(min, file.name, content);
      }

      documentAnswers.push({
        instanceId: instance.instanceId,
        content: content,
        format: '.md',
        media: file.name,
        packageId: packageId,
        prevId: 0
      });
    }
  });

  // Persist to storage, once for the whole folder.
  if (documentAnswers.length > 0) {
    await GuaribasAnswer.bulkCreate(documentAnswers);
  }
  if (data.answers.length > 0) {
    const answersCreated = await GuaribasAnswer.bulkCreate(data.answers);
    data.questions.forEach((question, index) => {
      question.answerId = answersCreated[index].answerId;
    });
    await GuaribasQuestion.bulkCreate(data.questions);
  }
}
2021-01-20 18:23:42 -03:00
2024-05-23 14:11:33 -03:00
/**
 * Saves the page at `url` and then crawls, depth first, every same-host link
 * found on the page that is currently loaded in `page`.
 *
 * A URL is skipped when it was already visited, when it ends with '.jpg',
 * '.png' or '.mp4', or when `depth` exceeds `maxDepth` (URLs ending with 'pdf'
 * are exempt from the depth limit).
 *
 * @param min Bot instance wrapper, used for logging and to locate the cache folder.
 * @param url URL to save and crawl.
 * @param visited URLs already handled; updated in place.
 * @param depth Depth of `url` relative to the starting page.
 * @param maxDepth Maximum depth to follow.
 * @param page Puppeteer page shared by the whole crawl.
 * @param websiteIgnoreUrls Semicolon-separated fragments; links whose address
 *        contains one of them are not followed. Falsy disables the filter.
 * @returns Paths of the files saved for this URL and its descendants; an empty
 *          array when the URL is skipped, cannot be saved or an error occurs.
 */
async crawl(
  min,
  url: string,
  visited: Set<string>,
  depth: number,
  maxDepth: number,
  page: Page,
  websiteIgnoreUrls
): Promise<string[]> {
  try {
    if (
      (depth > maxDepth && !url.endsWith('pdf')) ||
      visited.has(url) ||
      url.endsWith('.jpg') ||
      url.endsWith('.png') ||
      url.endsWith('.mp4')
    ) {
      return [];
    }

    await GBLogEx.info(min, `Crawling: ${url}.`);
    visited.add(url);

    const packagePath = GBUtil.getGBAIPath(min.botId, `gbot`);
    const directoryPath = path.join(process.env.PWD, 'work', packagePath, 'Website');
    const filename = await KBService.savePage(min, url, page, directoryPath);

    if (!filename) {
      // Nothing was saved for this URL, so its links are not followed.
      return [];
    }

    const currentDomain = new URL(page.url()).hostname;

    // Runs inside the browser: everything it needs is passed as an argument.
    const found = await page.evaluate(
      ({ currentDomain, websiteIgnoreUrls }) => {
        // Empty entries (e.g. from a trailing ';') are dropped because every
        // address "contains" the empty string.
        const ignoredUrls =
          typeof websiteIgnoreUrls === 'string'
            ? websiteIgnoreUrls
                .split(';')
                .map(item => item.trim())
                .filter(item => item.length > 0)
            : [];

        return Array.from(document.querySelectorAll('a'))
          .filter(anchor => {
            try {
              const isIgnored = ignoredUrls.some(ignoredUrl => anchor.href.includes(ignoredUrl));
              return !isIgnored && currentDomain === new URL(anchor.href).hostname;
            } catch (error) {
              // Anchors whose href is not a valid URL are not followed.
              return false;
            }
          })
          .map(anchor => anchor.href.replace(/#.*/, ''));
      },
      { currentDomain, websiteIgnoreUrls }
    );

    const links = Array.isArray(found) ? found : [];

    const filteredLinks = links.filter(link => {
      try {
        new URL(link); // Check if the link is a valid URL
        return !visited.has(link);
      } catch (error) {
        // Ignore invalid URLs
        return false;
      }
    });

    const childLinks = [];
    for (const link of filteredLinks) {
      const saved = await this.crawl(min, link, visited, depth + 1, maxDepth, page, websiteIgnoreUrls);
      if (saved) {
        childLinks.push(...saved);
      }
    }

    return [filename, ...childLinks]; // Include the filename of the cached file
  } catch (error) {
    await GBLogEx.info(min, error);
    return []; // Best effort: a failing page contributes no files.
  }
}
2024-08-04 17:16:04 -03:00
/**
 * Looks for a logo on the given page and returns the value of its `src`
 * attribute without the query string.
 *
 * Elements are searched with selectors of the form `tag[attribute*="logo"]`,
 * in the order of the candidate list below; the first element with a non-empty
 * `src` wins.
 *
 * @param min Bot instance wrapper, used only for logging.
 * @param page Puppeteer page to inspect.
 * @returns The logo address, or null when none is found or the search fails.
 */
async getLogoByPage(min, page) {
  // Tag/attribute combinations to probe; add more entries as needed.
  const candidates = [
    { tag: 'img', attributes: ['src', 'alt', 'class'] },
    { tag: 'svg', attributes: ['class', 'aria-label'] }
  ];

  try {
    for (const { tag, attributes } of candidates) {
      for (const attribute of attributes) {
        const handles = await page.$$(`${tag}[${attribute}*="logo"]`);
        for (const handle of handles) {
          const source = await page.evaluate(node => node.getAttribute('src'), handle);
          if (source) {
            return source.split('?')[0];
          }
        }
      }
    }
  } catch (error) {
    await GBLogEx.info(min, error);
  }

  return null;
}
2024-05-21 20:27:24 -03:00
/**
 * Opens a new page on the given browser and navigates it to `url`.
 *
 * When `browser` is missing or reports that it is not connected, a new
 * non-headless browser is launched and used instead.
 *
 * @param browser Puppeteer browser to reuse, if any.
 * @param url Address to open.
 * @returns The page after navigation.
 * @throws Rethrows any launch or navigation error after logging it to the console.
 */
async getFreshPage(browser, url) {
  try {
    const mustLaunch = !browser || browser.isConnected() === false;
    // Change headless to true if you don't want to see the browser window
    const activeBrowser = mustLaunch ? await puppeteer.launch({ headless: false }) : browser;

    const freshPage = await activeBrowser.newPage();
    await freshPage.goto(url);
    return freshPage;
  } catch (error) {
    console.error('An error occurred while getting fresh page:', error);
    throw error;
  }
}
2021-07-16 08:12:58 -03:00
/**
 * Adds documents to the bot's vector store from two sources: the website
 * configured in the 'Website' parameter (crawled with Puppeteer) and the files
 * of the `docs` folder of the .gbkb package. The store is saved when at least
 * one file was processed.
 *
 * When a website is configured this method also:
 * - stores a logo found on the page, scaled to fit 48x48, in the 'Logo' setting;
 * - stores two colors extracted from a page screenshot in 'Color1' and 'Color2'.
 *
 * @param min Bot instance wrapper holding `core`, `vectorStore` and `vectorStorePath`.
 * @param localPath Path of the package folder; its `docs` subfolder is walked.
 * @param instance Bot instance record (not read by this method).
 * @param packageId Package identifier (not read by this method).
 */
public async importDocs(
  min: GBMinInstance,
  localPath: string,
  instance: IGBInstance,
  packageId: number
): Promise<any> {
  let files = [];
  let website = min.core.getParam<string>(min.instance, 'Website', null);
  const maxDepth = min.core.getParam<number>(min.instance, 'Website Depth', 1);
  const websiteIgnoreUrls = min.core.getParam<[]>(min.instance, 'Website Ignore URLs', null);
  GBLogEx.info(min, `Website: ${website}, Max Depth: ${maxDepth}, Ignore URLs: ${websiteIgnoreUrls}`);

  let shouldSave = false;

  if (website) {
    // Removes last slash if any. Strings are immutable, so the result has to
    // be assigned back.
    if (website.endsWith('/')) {
      website = website.substring(0, website.length - 1);
    }

    let packagePath = GBUtil.getGBAIPath(min.botId, `gbot`);
    const directoryPath = path.join(process.env.PWD, 'work', packagePath, 'Website');

    // Wait until the previous cache is gone; the crawler writes into this
    // same folder and must not race with the removal.
    await fs.rm(directoryPath, { recursive: true, force: true });

    const browser = await puppeteer.launch({ headless: false });
    try {
      const page = await this.getFreshPage(browser, website);

      let logo = await this.getLogoByPage(min, page);
      if (logo) {
        packagePath = GBUtil.getGBAIPath(min.botId);

        // Resolves relative, protocol-relative and absolute (http or https)
        // addresses against the page that referenced the logo.
        logo = new URL(logo, page.url()).href;

        const logoBinary = await page.goto(logo);
        const buffer = await logoBinary.buffer();
        const logoFilename = path.basename(logo);

        const image = await Jimp.read(buffer);
        await image.scaleToFit({ w: 48, h: 48 });
        packagePath = path.join(process.env.PWD, 'work', packagePath);
        const logoPath = path.join(packagePath, 'cache', logoFilename);
        await (image as any).write(logoPath);
        await min.core['setConfig'](min, 'Logo', logoFilename);
      }

      // Extract dominant colors from the screenshot
      await page.screenshot({ path: 'screenshot.png' });
      const colors = await getColors('screenshot.png');
      await min.core['setConfig'](min, 'Color1', colors[0].hex());
      await min.core['setConfig'](min, 'Color2', colors[1].hex());

      // Disables images and stylesheets in crawling.
      await page.setRequestInterception(true);
      page.on('request', req => {
        if (req.resourceType() === 'image' || req.resourceType() === 'stylesheet') {
          req.abort();
        } else {
          req.continue();
        }
      });

      page.on('dialog', async dialog => {
        await dialog.dismiss();
      });

      await page.setCacheEnabled(false);

      const visited = new Set<string>();
      files = files.concat(await this.crawl(min, website, visited, 0, maxDepth, page, websiteIgnoreUrls));
    } finally {
      // Always release the browser process, even when the crawl fails.
      await browser.close();
    }

    // NOTE(review): this drops the first crawled file (the starting page);
    // confirm that is intended.
    files.shift();

    GBLogEx.info(min, `Vectorizing ${files.length} file(s)...`);

    if (await GBUtil.exists(min['vectorStorePath'])) {
      GBLogEx.info(min, `Cleaning vector store: ${min['vectorStorePath']}...`);
      const gbkbPath = GBUtil.getGBAIPath(min.botId, 'gbkb');
      min['vectorStorePath'] = path.join('work', gbkbPath, 'docs-vectorized');
      min['vectorStore'] = await min.deployService['loadOrCreateEmptyVectorStore'](min);
    }

    await CollectionUtil.asyncForEach(files, async file => {
      shouldSave = true;

      try {
        const document = await this.loadAndSplitFile(file);
        const flattenedDocuments = document.reduce((acc, val) => acc.concat(val), []);
        await min['vectorStore'].addDocuments(flattenedDocuments);
      } catch (error) {
        // Best effort: one unreadable page must not abort the whole import.
        GBLogEx.info(min, `Ignore processing of ${file}. ${GBUtil.toYAML(error)}`);
      }
    });
  }

  files = await walkPromise(urlJoin(localPath, 'docs'));

  if (files[0]) {
    shouldSave = true;
    GBLogEx.info(min, `Add embeddings from .gbkb: ${files.length} files being processed...`);
    await CollectionUtil.asyncForEach(files, async file => {
      const filePath = path.join(file.root, file.name);

      const document = await this.loadAndSplitFile(filePath);
      const flattenedDocuments = document.reduce((acc, val) => acc.concat(val), []);
      await min['vectorStore'].addDocuments(flattenedDocuments);
    });
  }

  if (shouldSave && min['vectorStore']) {
    await min['vectorStore'].save(min['vectorStorePath']);
  }
}
2024-11-08 06:49:30 -03:00
2024-05-17 19:19:58 -03:00
// Splitter passed to the loaders of every format handled by loadAndSplitFile
// except '.md'. Built with chunkSize 700 and chunkOverlap 50.
defaultRecursiveCharacterTextSplitter = new RecursiveCharacterTextSplitter({
  chunkSize: 700,
  chunkOverlap: 50
});
// Splitter created through fromLanguage('markdown') with the same chunk
// settings; loadAndSplitFile uses it for '.md' files.
markdownRecursiveCharacterTextSplitter = RecursiveCharacterTextSplitter.fromLanguage('markdown', {
  chunkSize: 700,
  chunkOverlap: 50
});
/**
 * Loads a file with the loader matching its extension and splits it into
 * chunks.
 *
 * Supported extensions (matched case-insensitively): .json, .txt, .html, .md,
 * .pdf, .docx, .csv and .epub. Markdown files use the markdown splitter, all
 * other formats use the default one.
 *
 * @param filePath Path of the file to load.
 * @returns The chunks produced by the loader's `loadAndSplit`.
 * @throws Error when the extension is not supported.
 */
private async loadAndSplitFile(filePath: string): Promise<Document<Record<string, unknown>>[]> {
  const fileExtension = path.extname(filePath);
  let loader;
  let splitter = this.defaultRecursiveCharacterTextSplitter;

  switch (fileExtension.toLowerCase()) {
    case '.json':
      loader = new JSONLoader(filePath);
      break;
    case '.txt':
    case '.html':
      loader = new TextLoader(filePath);
      break;
    case '.md':
      loader = new TextLoader(filePath);
      splitter = this.markdownRecursiveCharacterTextSplitter;
      break;
    case '.pdf':
      loader = new PDFLoader(filePath, { splitPages: false });
      break;
    case '.docx':
      loader = new DocxLoader(filePath);
      break;
    case '.csv':
      loader = new CSVLoader(filePath);
      break;
    case '.epub':
      loader = new EPubLoader(filePath, { splitChapters: false });
      break;
    default:
      throw new Error(`Unsupported file extension: ${fileExtension}`);
  }

  const documents: Document<Record<string, unknown>>[] = await loader.loadAndSplit(splitter);
  return documents;
}
2024-01-29 21:04:53 -03:00
2024-05-17 19:19:58 -03:00
/**
 * Walks `localPath` and imports every '.xlsx' or '.csv' file found through
 * `importKbTabularFile`.
 *
 * @param localPath Folder to walk.
 * @param min Bot instance wrapper, forwarded to the file importer.
 * @param packageId Package identifier, forwarded to the file importer.
 */
public async importKbTabularDirectory(localPath: string, min: GBMinInstance, packageId: number): Promise<any> {
  const isTabular = (name: string) => name.endsWith('.xlsx') || name.endsWith('.csv');
  const entries = await walkPromise(localPath);

  await CollectionUtil.asyncForEach(entries, async entry => {
    // Guard clause: only tabular files are of interest here.
    if (entry === null || !isTabular(entry.name)) {
      return;
    }
    const tabularFile = urlJoin(entry.root, entry.name);
    return await this.importKbTabularFile(tabularFile, min, packageId);
  });
}
2018-09-09 20:09:07 -03:00
2023-02-23 11:51:59 -03:00
/**
 * Imports the subjects tree of a package into storage.
 *
 * When `menuFile` exists, the tree is read from the first worksheet of that
 * workbook: the column of the first non-empty cell (among the first four)
 * gives the nesting level of a row, the cell text gives the title and id, and
 * the cell to its right gives the description. Only the first 24 rows are
 * read. Otherwise the tree is parsed from the JSON file `filename`.
 *
 * @param packageId Package identifier stored on every subject.
 * @param filename Path of the subjects JSON file (fallback source).
 * @param menuFile Path of the menu workbook (preferred source).
 * @param instance Bot instance record providing `instanceId`.
 * @returns The result of creating all subjects in series.
 * @throws Error when the workbook contains no worksheet.
 */
public async importSubjectFile(
  packageId: number,
  filename: string,
  menuFile: string,
  instance: IGBInstance
): Promise<any> {
  let subjectsLoaded;

  if (await GBUtil.exists(menuFile)) {
    // Loads menu.xlsx and finds worksheet.
    const workbook = new Excel.Workbook();
    const data = await workbook.xlsx.readFile(menuFile);
    const worksheet: any = data.worksheets.find(sheet => sheet);
    if (!worksheet) {
      throw new Error(`No worksheet found in ${menuFile}.`);
    }

    const MAX_LEVEL = 4; // Max column level to reach menu items in plan.
    const MAX_ROWS = 24; // Only the first rows of the sheet are read.

    // Works on a copy so that the worksheet's own row list is not resized.
    const rows = worksheet._rows.slice(0, MAX_ROWS);

    let lastLevel = 0;
    const subjects = { children: [] };
    let childrenNode = subjects.children;
    let activeObj = null;

    const activeChildrenGivenLevel = [childrenNode];

    // Iterates over all items.
    await asyncPromise.eachSeries(rows, async row => {
      if (!row) {
        return;
      }

      // Detect menu level by skipping blank cells on left.
      let menu;
      let level;
      for (level = 0; level < MAX_LEVEL; level++) {
        const cell = row._cells[level];
        if (cell && cell.text) {
          menu = cell.text;
          break;
        }
      }

      // A row without text in the first columns is not a menu item; without
      // this check it would be stored as a subject with no title.
      if (!menu) {
        return;
      }

      // Tree hierarchy calculation. An indented first row has no parent yet
      // and therefore stays in the root list.
      if (level > lastLevel && activeObj) {
        childrenNode = activeObj.children;
      } else if (level < lastLevel) {
        childrenNode = activeChildrenGivenLevel[level];
      }

      // Remembers the list in use at each level, so that a later row can
      // return to a shallower level (possibly several levels up at once).
      activeChildrenGivenLevel[level] = childrenNode;

      // Insert the object into JSON.
      const description = row._cells[level + 1] ? row._cells[level + 1].text : null;
      activeObj = {
        title: menu,
        description: description,
        id: menu,
        children: []
      };
      activeChildrenGivenLevel[level].push(activeObj);

      lastLevel = level;
    });

    subjectsLoaded = subjects;
  } else {
    subjectsLoaded = JSON.parse(await fs.readFile(filename, 'utf8'));
  }

  // Creates the subjects of one level in series, then recurses into children.
  const doIt = async (subjects: GuaribasSubject[], parentSubjectId: number) => {
    return asyncPromise.eachSeries(subjects, async item => {
      const value = await GuaribasSubject.create(<GuaribasSubject>{
        internalId: item.id,
        parentSubjectId: parentSubjectId,
        instanceId: instance.instanceId,
        from: item.from,
        to: item.to,
        title: item.title,
        description: item.description,
        packageId: packageId
      });

      if (item.children) {
        return doIt(item.children, value.subjectId);
      } else {
        return item;
      }
    });
  };

  return doIt(subjectsLoaded.children, undefined);
}
2019-08-24 12:22:52 -03:00
2022-11-30 09:40:09 -03:00
/**
 * Deletes the questions, answers and subjects of a package, in that order,
 * and then the package itself.
 *
 * @param instance Bot instance record providing `instanceId`.
 * @param deployer Not read by this method.
 * @param packageId Identifier of the package to remove.
 */
public async undeployKbFromStorage(instance: IGBInstance, deployer: GBDeployer, packageId: number) {
  // A fresh options object is built for each call.
  const scope = () => ({
    where: { instanceId: instance.instanceId, packageId: packageId }
  });

  await GuaribasQuestion.destroy(scope());
  await GuaribasAnswer.destroy(scope());
  await GuaribasSubject.destroy(scope());

  await this.undeployPackageFromStorage(instance, packageId);
}
2018-04-21 02:59:30 -03:00
2022-11-30 09:40:09 -03:00
/**
 * Registers entities in `min['nerEngine']` from the questions returned by
 * `getQuestionsNER`.
 *
 * For a question text such as `name(category)`, the text inside the
 * parentheses is the entity category and the word right before them is
 * registered as a named entity text for the bot's content language. The
 * category 'number' additionally registers a digits regex entity.
 *
 * @param min Bot instance wrapper holding `instance`, `core` and `nerEngine`.
 */
public static async RefreshNER(min: GBMinInstance) {
  const questions = await KBService.getQuestionsNER(min.instance.instanceId);
  const contentLocale = min.core.getParam<string>(
    min.instance,
    'Default Content Language',
    GBConfigService.get('DEFAULT_CONTENT_LANGUAGE')
  );

  await CollectionUtil.asyncForEach(questions, async question => {
    const text = question.content;

    const categoryReg = /.*\((.*)\).*/gi.exec(text);
    if (!categoryReg) {
      return;
    }
    const category = categoryReg[1];

    if (category === 'number') {
      // A real RegExp with an escaped `\d`: the former string '/d+/gi' lacked
      // the backslash and therefore described the letter "d", not digits.
      min['nerEngine'].addRegexEntity('number', 'pt', /\d+/gi);
    }

    const nameReg = /(\w+)\(.*\).*/gi.exec(text);
    if (nameReg) {
      const name = nameReg[1];
      min['nerEngine'].addNamedEntityText(category, name, [contentLocale], [name]);
    }
  });
}
2022-06-06 18:03:02 -03:00
2018-04-21 02:59:30 -03:00
/**
 * Deploys a knowledge base to the storage using the .gbkb format.
 *
 * Registers the package, imports its content, mounts its assets, rebuilds the
 * search index when 'STORAGE_NAME' is configured, refreshes the group replies
 * cache and the NER entities and, when the 'SSR' parameter is set, writes the
 * server-side rendered HTML of the bot to disk.
 *
 * @param core Core service used to load the bot instance.
 * @param deployer Deployer used for package registration and index rebuild.
 * @param localPath Path to the .gbkb folder.
 * @param min Bot instance wrapper.
 */
public async deployKb(core: IGBCoreService, deployer: GBDeployer, localPath: string, min: GBMinInstance) {
  const packageName = path.basename(localPath);
  const instance = await core.loadInstanceByBotId(min.botId);
  GBLogEx.info(min, `Publishing: ${packageName}...`);

  const storedPackage = await deployer.deployPackageToStorage(instance.instanceId, packageName);
  await this.importKbPackage(min, localPath, storedPackage, instance);
  GBDeployer.mountGBKBAssets(packageName, min.botId, localPath);

  if (GBConfigService.get('STORAGE_NAME')) {
    const service = await AzureDeployerService.createInstance(deployer);
    // Falls back to the boot instance's index when this bot has none.
    const searchIndex = instance.searchIndex || GBServer.globals.minBoot.instance.searchIndex;
    await deployer.rebuildIndex(instance, service.getKBSearchSchema(searchIndex));
  }

  min['groupCache'] = await KBService.getGroupReplies(instance.instanceId);
  await KBService.RefreshNER(min);

  const ssrEnabled = min.core.getParam<boolean>(min.instance, 'SSR', false);
  if (ssrEnabled) {
    GBLogEx.info(min, `Start Bot Server Side Rendering... ${localPath}`);
    const markup = await GBSSR.getHTML(min);
    const uiPackage = GBUtil.getGBAIPath(min.botId, `gbui`);
    const indexPath = path.join(process.env.PWD, 'work', uiPackage, 'index.html');
    GBLogEx.info(min, `Saving SSR HTML in ${indexPath}.`);
    await fs.writeFile(indexPath, markup, 'utf8');
  }

  GBLogEx.info(min, `Done publishing of: ${packageName}.`);
}
2020-12-31 15:36:19 -03:00
2022-11-30 09:40:09 -03:00
/**
 * Sends the content of an answer to the user as audio.
 *
 * @param min Bot instance wrapper.
 * @param answer Answer whose `content` is passed to `sendAudio`.
 * @param channel Not read by this method.
 * @param step Current dialog step.
 * @param conversationalService Service used to send the audio.
 */
private async playAudio(
  min: GBMinInstance,
  answer: GuaribasAnswer,
  channel: string,
  step: GBDialogStep,
  conversationalService: IGBConversationalService
) {
  // Awaited so that a failure surfaces to the caller instead of becoming an
  // unhandled rejection.
  await conversationalService.sendAudio(min, step, answer.content);
}
2024-09-19 17:46:43 -03:00
/**
 * Shows an image to the user: as a file on the 'whatsapp' channel, as a
 * 'play' event with player type 'image' on any other channel.
 *
 * @param min Bot instance wrapper.
 * @param conversationalService Service used to send the 'play' event.
 * @param step Current dialog step.
 * @param url Address of the image.
 * @param channel Channel name of the conversation.
 */
public async showImage(
  min,
  conversationalService: IGBConversationalService,
  step: GBDialogStep,
  url: string,
  channel: string
) {
  const isWhatsApp = channel === 'whatsapp';
  if (isWhatsApp) {
    await min.conversationalService.sendFile(min, step, null, url, '');
    return;
  }

  const payload = { playerType: 'image', data: url };
  await conversationalService.sendEvent(min, step, 'play', payload);
}
2024-09-19 09:17:30 -03:00
/**
 * Presents a URL to the user: as a file on the 'whatsapp' channel, as a
 * 'play' event with player type 'url' on any other channel.
 *
 * @param min Bot instance wrapper.
 * @param conversationalService Service used to send the 'play' event.
 * @param step Current dialog step.
 * @param url Address to present.
 * @param channel Channel name of the conversation.
 */
public async playUrl(
  min,
  conversationalService: IGBConversationalService,
  step: GBDialogStep,
  url: string,
  channel: string
) {
  const isWhatsApp = channel === 'whatsapp';
  if (isWhatsApp) {
    await min.conversationalService.sendFile(min, step, null, url, '');
    return;
  }

  const payload = { playerType: 'url', data: url };
  await conversationalService.sendEvent(min, step, 'play', payload);
}
2022-11-30 09:40:09 -03:00
/**
 * Plays the video referenced by an answer: sent as a file on the 'whatsapp'
 * channel, otherwise as a 'play' event whose data is the answer content
 * joined under the `videos` folder of the bot's gbkb package path.
 *
 * @param min Bot instance wrapper.
 * @param conversationalService Service used to send the 'play' event.
 * @param step Current dialog step.
 * @param answer Answer whose `content` identifies the video.
 * @param channel Channel name of the conversation.
 */
private async playVideo(
  min,
  conversationalService: IGBConversationalService,
  step: GBDialogStep,
  answer: GuaribasAnswer,
  channel: string
) {
  const isWhatsApp = channel === 'whatsapp';
  if (isWhatsApp) {
    await min.conversationalService.sendFile(min, step, null, answer.content, '');
    return;
  }

  const kbPackagePath = GBUtil.getGBAIPath(min.botId, `gbkb`);
  const videoUrl = urlJoin(kbPackagePath, 'videos', answer.content);
  await conversationalService.sendEvent(min, step, 'play', {
    playerType: 'video',
    data: videoUrl
  });
}
2022-11-30 09:40:09 -03:00
/**
 * Deletes the package row matching the given instance and package ids.
 *
 * @param instance Object providing `instanceId`.
 * @param packageId Identifier of the package to delete.
 */
private async undeployPackageFromStorage(instance: any, packageId: number) {
  const criteria = { instanceId: instance.instanceId, packageId: packageId };
  await GuaribasPackage.destroy({ where: criteria });
}
2021-07-16 08:12:58 -03:00
2024-05-17 19:19:58 -03:00
/**
 * Extracts the text of a file with textract, preserving line breaks.
 *
 * @param filename Path of the file to read.
 * @returns The extracted text.
 * @throws Rejects with the error reported by textract.
 */
private async getTextFromFile(filename: string) {
  // The executor is deliberately synchronous: an async executor would turn a
  // synchronous throw into an unhandled rejection instead of rejecting this
  // promise.
  return new Promise<string>((resolve, reject) => {
    textract.fromFileWithPath(filename, { preserveLineBreaks: true }, (error, text) => {
      if (error) {
        reject(error);
      } else {
        resolve(text);
      }
    });
  });
}
2024-09-08 16:48:26 -03:00
/**
 * Saves the resource at `url` into `directoryPath`.
 *
 * URLs whose path ends with .pdf, .docx, .csv, .epub, .xml, .json or .txt are
 * downloaded as they are. Any other URL is opened in `page`, its HTML is
 * converted to Markdown and written to a file named after the last path
 * segment ('index' when the path is empty) with the '.html' extension.
 *
 * @param min Bot instance wrapper, used only for logging.
 * @param url Address of the resource.
 * @param page Puppeteer page used to render non-downloadable URLs.
 * @param directoryPath Target folder; created when missing.
 * @returns The path of the saved file, or null when saving fails.
 */
public static async savePage(
  min: GBMinInstance,
  url: string,
  page: Page,
  directoryPath: string
): Promise<string | null> {
  try {
    // Check if the directory exists, create it if not.
    const directoryExists = await GBUtil.exists(directoryPath);
    if (!directoryExists) {
      await fs.mkdir(directoryPath, { recursive: true });
    }

    // Only the path part is inspected, so that a query string or fragment
    // neither hides the extension nor ends up in the file name.
    const parsedUrl = new URL(url);
    const pathname = parsedUrl.pathname;
    const downloadableExtensions = ['.pdf', '.docx', '.csv', '.epub', '.xml', '.json', '.txt'];

    // Check if the URL is for a downloadable file (e.g., .pdf).
    if (downloadableExtensions.some(extension => pathname.endsWith(extension))) {
      const response = await fetch(url);
      if (!response.ok) {
        throw new Error('Failed to download the file');
      }

      const buffer = await response.arrayBuffer();
      const fileName = path.basename(pathname);
      const filePath = path.join(directoryPath, fileName);
      await fs.writeFile(filePath, new Uint8Array(buffer));
      return filePath;
    }

    await page.goto(url, {
      waitUntil: 'networkidle2',
      timeout: 60000 // Timeout after 1 minute (60,000 ms)
    });

    // Get the last part of the URL path or default to 'index' if empty
    const pathParts = pathname.split('/').filter(Boolean);
    const lastPath = pathParts.length > 0 ? pathParts[pathParts.length - 1] : 'index';
    const flatLastPath = lastPath.replace(/\W+/g, '-'); // Flatten the last part of the path
    const fileName = `${flatLastPath}.html`;
    const filePath = path.join(directoryPath, fileName);

    // NOTE(review): the content written is Markdown although the file keeps
    // the '.html' extension; confirm which splitter downstream should use.
    const htmlContent = await page.content();
    const markdownContent = html2md(htmlContent);
    await fs.writeFile(filePath, markdownContent);
    return filePath;
  } catch (error) {
    GBLogEx.info(min, `Cannot save: ${url}. ${GBUtil.toYAML(error)}`);
    return null;
  }
}
2018-04-21 02:59:30 -03:00
}