new(all): General Bots Reading Comprehension for .pdf.

This commit is contained in:
Rodrigo Rodriguez 2021-07-26 10:19:56 -03:00
parent 1d337cf24a
commit ba796c86a7
4 changed files with 268 additions and 42 deletions

194
package-lock.json generated
View file

@ -1,6 +1,6 @@
{
"name": "botserver",
"version": "2.0.126",
"version": "2.0.127",
"lockfileVersion": 1,
"requires": true,
"dependencies": {
@ -1491,6 +1491,46 @@
"integrity": "sha512-DwS94K+M0vtG+cymxH0rslJr09qpdjyOLdCjmpKcG/nNiZQfMA1ybAaFEmwk9UaVlUG9STENFeQwyrLevJB+7g==",
"dev": true
},
"@google-cloud/common": {
"version": "3.7.0",
"resolved": "https://registry.npmjs.org/@google-cloud/common/-/common-3.7.0.tgz",
"integrity": "sha512-oFgpKLjH9JTOAyQd3kB36iSuH8wNSpDKb1TywlB6zcsG0xmJFxLutmfPhz03KUxRMNQOZ1K1Gc9BYvJifVnGVA==",
"requires": {
"@google-cloud/projectify": "^2.0.0",
"@google-cloud/promisify": "^2.0.0",
"arrify": "^2.0.1",
"duplexify": "^4.1.1",
"ent": "^2.2.0",
"extend": "^3.0.2",
"google-auth-library": "^7.0.2",
"retry-request": "^4.2.2",
"teeny-request": "^7.0.0"
},
"dependencies": {
"arrify": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/arrify/-/arrify-2.0.1.tgz",
"integrity": "sha512-3duEwti880xqi4eAMN8AyR4a0ByT90zoYdLlevfrvU43vb0YZwZVfxOgxWrLXXXpyugL0hNZc9G6BiB5B3nUug=="
},
"debug": {
"version": "4.3.2",
"resolved": "https://registry.npmjs.org/debug/-/debug-4.3.2.tgz",
"integrity": "sha512-mOp8wKcvj7XxC78zLgw/ZA+6TSgkoE2C/ienthhRD298T7UNwAg9diBpLRxC0mOezLl4B0xV7M0cCO6P/O0Xhw==",
"requires": {
"ms": "2.1.2"
}
},
"retry-request": {
"version": "4.2.2",
"resolved": "https://registry.npmjs.org/retry-request/-/retry-request-4.2.2.tgz",
"integrity": "sha512-xA93uxUD/rogV7BV59agW/JHPGXeREMWiZc9jhcwY4YdZ7QOtC7qbomYg0n4wyk2lJhggjvKvhNX8wln/Aldhg==",
"requires": {
"debug": "^4.1.1",
"extend": "^3.0.2"
}
}
}
},
"@google-cloud/paginator": {
"version": "3.0.5",
"resolved": "https://registry.npmjs.org/@google-cloud/paginator/-/paginator-3.0.5.tgz",
@ -1556,6 +1596,94 @@
}
}
},
"@google-cloud/translate": {
"version": "6.2.6",
"resolved": "https://registry.npmjs.org/@google-cloud/translate/-/translate-6.2.6.tgz",
"integrity": "sha512-DzXly5s9RtkVCkPk/AxjZV2HQ4b4eN2Dvg+8x8d8Yk/tnKpU1IOCX6lWUGIONMNHMKbN6ITydXj+quo92tvZOg==",
"requires": {
"@google-cloud/common": "^3.0.0",
"@google-cloud/promisify": "^2.0.0",
"arrify": "^2.0.0",
"extend": "^3.0.2",
"google-gax": "^2.17.1",
"is-html": "^2.0.0",
"protobufjs": "^6.8.8"
},
"dependencies": {
"arrify": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/arrify/-/arrify-2.0.1.tgz",
"integrity": "sha512-3duEwti880xqi4eAMN8AyR4a0ByT90zoYdLlevfrvU43vb0YZwZVfxOgxWrLXXXpyugL0hNZc9G6BiB5B3nUug=="
},
"google-auth-library": {
"version": "7.3.0",
"resolved": "https://registry.npmjs.org/google-auth-library/-/google-auth-library-7.3.0.tgz",
"integrity": "sha512-MPeeMlnsYnoiiVFMwX3hgaS684aiXrSqKoDP+xL4Ejg4Z0qLvIeg4XsaChemyFI8ZUO7ApwDAzNtgmhWSDNh5w==",
"requires": {
"arrify": "^2.0.0",
"base64-js": "^1.3.0",
"ecdsa-sig-formatter": "^1.0.11",
"fast-text-encoding": "^1.0.0",
"gaxios": "^4.0.0",
"gcp-metadata": "^4.2.0",
"gtoken": "^5.0.4",
"jws": "^4.0.0",
"lru-cache": "^6.0.0"
}
},
"google-gax": {
"version": "2.19.0",
"resolved": "https://registry.npmjs.org/google-gax/-/google-gax-2.19.0.tgz",
"integrity": "sha512-2a6WY+p6YMVMmwXmkRqiLreXx67xHDZhkmflcL8aDUkl1csx9ywxEI01veoDXy6T1l0JJD6zLbl5TIbWimmXrw==",
"requires": {
"@grpc/grpc-js": "~1.3.0",
"@grpc/proto-loader": "^0.6.1",
"@types/long": "^4.0.0",
"abort-controller": "^3.0.0",
"duplexify": "^4.0.0",
"fast-text-encoding": "^1.0.3",
"google-auth-library": "^7.3.0",
"is-stream-ended": "^0.1.4",
"node-fetch": "^2.6.1",
"object-hash": "^2.1.1",
"protobufjs": "^6.10.2",
"retry-request": "^4.0.0"
}
},
"jwa": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/jwa/-/jwa-2.0.0.tgz",
"integrity": "sha512-jrZ2Qx916EA+fq9cEAeCROWPTfCwi1IVHqT2tapuqLEVVDKFDENFw1oL+MwrTvH6msKxsd1YTDVw6uKEcsrLEA==",
"requires": {
"buffer-equal-constant-time": "1.0.1",
"ecdsa-sig-formatter": "1.0.11",
"safe-buffer": "^5.0.1"
}
},
"jws": {
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/jws/-/jws-4.0.0.tgz",
"integrity": "sha512-KDncfTmOZoOMTFG4mBlG0qUIOlc03fmzH+ru6RgYVZhPkyiy/92Owlt/8UEN+a4TXR1FQetfIpJE8ApdvdVxTg==",
"requires": {
"jwa": "^2.0.0",
"safe-buffer": "^5.0.1"
}
},
"lru-cache": {
"version": "6.0.0",
"resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-6.0.0.tgz",
"integrity": "sha512-Jo6dJ04CmSjuznwJSS3pUeWmd/H0ffTlkXXgwZi+eq1UCmqQwCh+eLsYOYCwY991i2Fah4h1BEMCx4qThGbsiA==",
"requires": {
"yallist": "^4.0.0"
}
},
"yallist": {
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/yallist/-/yallist-4.0.0.tgz",
"integrity": "sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A=="
}
}
},
"@grpc/grpc-js": {
"version": "1.3.2",
"resolved": "https://registry.npmjs.org/@grpc/grpc-js/-/grpc-js-1.3.2.tgz",
@ -8199,6 +8327,11 @@
"once": "^1.4.0"
}
},
"ent": {
"version": "2.2.0",
"resolved": "https://registry.npmjs.org/ent/-/ent-2.2.0.tgz",
"integrity": "sha1-6WQhkyWiHQX0RGai9obtbOX13R0="
},
"entities": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/entities/-/entities-1.1.2.tgz",
@ -10146,6 +10279,11 @@
"integrity": "sha512-aI5tKwNTBzOZApHIynaAwecLBv8TlZTEy/P4Sj2SzzAhBrGuI8yGZ0UIXVPQzOHGS+to2mjb04iy6VWt/8+d8A==",
"dev": true
},
"html-tags": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/html-tags/-/html-tags-3.1.0.tgz",
"integrity": "sha512-1qYz89hW3lFDEazhjW0yVAV87lw8lVkrJocr72XmBkMKsoSVJCQx3W8BXsC7hO2qAt8BoVjYjtAcZ9perqGnNg=="
},
"htmlparser2": {
"version": "3.10.1",
"resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-3.10.1.tgz",
@ -10889,6 +11027,14 @@
"integrity": "sha512-gyPJuv83bHMpocVYoqof5VDiZveEoGoFL8m3BXNb2VW8Xs+rz9kqO8LOQ5DH6EsuvilT1ApazU0pyl+ytbPtlw==",
"dev": true
},
"is-html": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/is-html/-/is-html-2.0.0.tgz",
"integrity": "sha512-S+OpgB5i7wzIue/YSE5hg0e5ZYfG3hhpNh9KGl6ayJ38p7ED6wxQLd1TV91xHpcTvw90KMJ9EwN3F/iNflHBVg==",
"requires": {
"html-tags": "^3.0.0"
}
},
"is-installed-globally": {
"version": "0.1.0",
"resolved": "https://registry.npmjs.org/is-installed-globally/-/is-installed-globally-0.1.0.tgz",
@ -13219,6 +13365,11 @@
"lodash.toarray": "^4.4.0"
}
},
"node-ensure": {
"version": "0.0.0",
"resolved": "https://registry.npmjs.org/node-ensure/-/node-ensure-0.0.0.tgz",
"integrity": "sha1-7K52QVDemYYexcgQ/V0Jaxg5Mqc="
},
"node-fetch": {
"version": "2.6.1",
"resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.6.1.tgz",
@ -16306,6 +16457,15 @@
"pinkie-promise": "^2.0.0"
}
},
"pdf-extraction": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/pdf-extraction/-/pdf-extraction-1.0.2.tgz",
"integrity": "sha512-wVA4HvsvaNYVAH6wp0Tt5+AUHV3XIGM2KQMlOyblsn0YDSUKtTwCJq87F7vIbBnnKsc3noSpL/Bx/sfB1ZqpLA==",
"requires": {
"debug": "^3.1.0",
"node-ensure": "^0.0.0"
}
},
"pdf-text-extract": {
"version": "1.3.1",
"resolved": "https://registry.npmjs.org/pdf-text-extract/-/pdf-text-extract-1.3.1.tgz",
@ -19210,6 +19370,14 @@
"readable-stream": "^2.0.2"
}
},
"stream-events": {
"version": "1.0.5",
"resolved": "https://registry.npmjs.org/stream-events/-/stream-events-1.0.5.tgz",
"integrity": "sha512-E1GUzBSgvct8Jsb3v2X15pjzN1tYebtbLaMg+eBOUOAxgbLoSbT2NS91ckc5lJD1KfLjId+jXJRgo0qnV5Nerg==",
"requires": {
"stubs": "^3.0.0"
}
},
"stream-shift": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/stream-shift/-/stream-shift-1.0.1.tgz",
@ -19325,6 +19493,11 @@
"escape-string-regexp": "^1.0.2"
}
},
"stubs": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/stubs/-/stubs-3.0.0.tgz",
"integrity": "sha1-6NK6H6nJBXAwPAMLaQD31fiavls="
},
"superagent": {
"version": "3.8.3",
"resolved": "https://registry.npmjs.org/superagent/-/superagent-3.8.3.tgz",
@ -19611,6 +19784,25 @@
}
}
},
"teeny-request": {
"version": "7.1.1",
"resolved": "https://registry.npmjs.org/teeny-request/-/teeny-request-7.1.1.tgz",
"integrity": "sha512-iwY6rkW5DDGq8hE2YgNQlKbptYpY5Nn2xecjQiNjOXWbKzPGUfmeUBCSQbbr306d7Z7U2N0TPl+/SwYRfua1Dg==",
"requires": {
"http-proxy-agent": "^4.0.0",
"https-proxy-agent": "^5.0.0",
"node-fetch": "^2.6.1",
"stream-events": "^1.0.5",
"uuid": "^8.0.0"
},
"dependencies": {
"uuid": {
"version": "8.3.2",
"resolved": "https://registry.npmjs.org/uuid/-/uuid-8.3.2.tgz",
"integrity": "sha512-+NYs2QeMWy+GWFOEm9xnn6HCDp0l7QBD7ml8zLUmJ+93Q5NF0NocErnwkTkXVFNiX3/fpC6afS8Dhb/gz7R7eg=="
}
}
},
"temp-dir": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/temp-dir/-/temp-dir-2.0.0.tgz",

View file

@ -50,6 +50,7 @@
"dependencies": {
"@azure/ms-rest-js": "2.5.1",
"@google-cloud/pubsub": "^2.13.0",
"@google-cloud/translate": "^6.2.6",
"@microsoft/microsoft-graph-client": "2.2.1",
"@semantic-release/changelog": "5.0.1",
"@semantic-release/exec": "5.0.0",
@ -89,6 +90,7 @@
"nexmo": "2.9.1",
"node-cron": "3.0.0",
"opn": "6.0.0",
"pdf-extraction": "^1.0.2",
"phone": "2.4.21",
"pragmatismo-io-framework": "1.0.20",
"prism-media": "1.3.1",

View file

@ -62,6 +62,7 @@ const fs = require('fs');
const SpeechToTextV1 = require('ibm-watson/speech-to-text/v1');
const { IamAuthenticator } = require('ibm-watson/auth');
const marked = require('marked');
const { Translate } = require('@google-cloud/translate').v2;
/**
* Provides basic services for handling messages and dispatching to back-end
@ -771,37 +772,60 @@ export class GBConversationalService {
}
text = text.replace('¿', '');
let options = {
method: 'POST',
baseUrl: endPoint,
url: 'translate',
qs: {
'api-version': '3.0',
to: [language]
},
headers: {
'Ocp-Apim-Subscription-Key': key,
'Ocp-Apim-Subscription-Region': 'westeurope',
'Content-type': 'application/json',
'X-ClientTraceId': uuidv4().toString()
},
body: [
{
text: text
}
],
json: true
};
if (min.instance.googleProjectId) {
// Instantiates a client
try {
const translate = new Translate({
projectId: min.instance.googleProjectId,
credentials: { client_email: min.instance.googleClientEmail, private_key: min.instance.googlePrivateKey.replace(/\\n/gm, '\n') }
});
const results = await request(options);
try {
return results[0].translations[0].text;
} catch (error) {
const msg = `Error calling Translator service layer. Error is: ${error}.`;
const [translation] = await translate.translate(text, language);
return Promise.reject(new Error(msg));
return translation;
} catch (error) {
const msg = `Error calling Google Translator service layer. Error is: ${error}.`;
return Promise.reject(new Error(msg));
}
}
else {
let options = {
method: 'POST',
baseUrl: endPoint,
url: 'translate',
qs: {
'api-version': '3.0',
to: [language]
},
headers: {
'Ocp-Apim-Subscription-Key': key,
'Ocp-Apim-Subscription-Region': 'westeurope',
'Content-type': 'application/json',
'X-ClientTraceId': uuidv4().toString()
},
body: [
{
text: text
}
],
json: true
};
try {
const results = await request(options);
return results[0].translations[0].text;
} catch (error) {
const msg = `Error calling MSFT Translator service layer. Error is: ${error}.`;
return Promise.reject(new Error(msg));
}
}
}

View file

@ -65,6 +65,7 @@ import { GuaribasAnswer, GuaribasQuestion, GuaribasSubject } from '../models';
import { GBConfigService } from './../../core.gbapp/services/GBConfigService';
const request = require('request-promise-native');
const textract = require('textract');
const pdf = require("pdf-extraction");
/**
* Result for query on KB data.
@ -613,21 +614,28 @@ export class KBService implements IGBKBService {
const files = await walkPromise(urlJoin(localPath, 'docs'));
await CollectionUtil.asyncForEach(files, async file => {
if (file !== null && file.name.endsWith('.docx')) {
let content = await this.getTextFromWord(Path.join(file.root, file.name));
content = await min.conversationalService.translate(min, content, 'en');
if (content) {
await GuaribasAnswer.create({
instanceId: instance.instanceId,
content: content,
format: '.docx',
media: file.name,
packageId: packageId
});
let content = null;
let filePath = Path.join(file.root, file.name);
if (file !== null) {
if (file.name.endsWith('.docx')) {
content = await this.getTextFromFile(filePath);
} else if (file.name.endsWith('.pdf')) {
const read = await pdf(Fs.readFileSync(filePath));
content = read.text;
}
}
if (content) {
content = await min.conversationalService.translate(min, content, 'en');
await GuaribasAnswer.create({
instanceId: instance.instanceId,
content: content,
format: '.docx',
media: file.name,
packageId: packageId
});
}
});
}
@ -762,7 +770,7 @@ export class KBService implements IGBKBService {
return await request.post(options);
}
private async getTextFromWord(filename: string) {
private async getTextFromFile(filename: string) {
return new Promise<string>(async (resolve, reject) => {
textract.fromFileWithPath(filename, { preserveLineBreaks: true }, (error, text) => {
if (error) {