From ba796c86a796a07a08e83671ebfc1985f14aca4b Mon Sep 17 00:00:00 2001 From: Rodrigo Rodriguez Date: Mon, 26 Jul 2021 10:19:56 -0300 Subject: [PATCH] new(all): General Bots Reading Comprehension for .pdf. --- package-lock.json | 194 +++++++++++++++++- package.json | 2 + .../services/GBConversationalService.ts | 78 ++++--- packages/kb.gbapp/services/KBService.ts | 36 ++-- 4 files changed, 268 insertions(+), 42 deletions(-) diff --git a/package-lock.json b/package-lock.json index 902da420..ffb3313e 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,6 +1,6 @@ { "name": "botserver", - "version": "2.0.126", + "version": "2.0.127", "lockfileVersion": 1, "requires": true, "dependencies": { @@ -1491,6 +1491,46 @@ "integrity": "sha512-DwS94K+M0vtG+cymxH0rslJr09qpdjyOLdCjmpKcG/nNiZQfMA1ybAaFEmwk9UaVlUG9STENFeQwyrLevJB+7g==", "dev": true }, + "@google-cloud/common": { + "version": "3.7.0", + "resolved": "https://registry.npmjs.org/@google-cloud/common/-/common-3.7.0.tgz", + "integrity": "sha512-oFgpKLjH9JTOAyQd3kB36iSuH8wNSpDKb1TywlB6zcsG0xmJFxLutmfPhz03KUxRMNQOZ1K1Gc9BYvJifVnGVA==", + "requires": { + "@google-cloud/projectify": "^2.0.0", + "@google-cloud/promisify": "^2.0.0", + "arrify": "^2.0.1", + "duplexify": "^4.1.1", + "ent": "^2.2.0", + "extend": "^3.0.2", + "google-auth-library": "^7.0.2", + "retry-request": "^4.2.2", + "teeny-request": "^7.0.0" + }, + "dependencies": { + "arrify": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/arrify/-/arrify-2.0.1.tgz", + "integrity": "sha512-3duEwti880xqi4eAMN8AyR4a0ByT90zoYdLlevfrvU43vb0YZwZVfxOgxWrLXXXpyugL0hNZc9G6BiB5B3nUug==" + }, + "debug": { + "version": "4.3.2", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.2.tgz", + "integrity": "sha512-mOp8wKcvj7XxC78zLgw/ZA+6TSgkoE2C/ienthhRD298T7UNwAg9diBpLRxC0mOezLl4B0xV7M0cCO6P/O0Xhw==", + "requires": { + "ms": "2.1.2" + } + }, + "retry-request": { + "version": "4.2.2", + "resolved": "https://registry.npmjs.org/retry-request/-/retry-request-4.2.2.tgz", + "integrity": "sha512-xA93uxUD/rogV7BV59agW/JHPGXeREMWiZc9jhcwY4YdZ7QOtC7qbomYg0n4wyk2lJhggjvKvhNX8wln/Aldhg==", + "requires": { + "debug": "^4.1.1", + "extend": "^3.0.2" + } + } + } + }, "@google-cloud/paginator": { "version": "3.0.5", "resolved": "https://registry.npmjs.org/@google-cloud/paginator/-/paginator-3.0.5.tgz", @@ -1556,6 +1596,94 @@ } } }, + "@google-cloud/translate": { + "version": "6.2.6", + "resolved": "https://registry.npmjs.org/@google-cloud/translate/-/translate-6.2.6.tgz", + "integrity": "sha512-DzXly5s9RtkVCkPk/AxjZV2HQ4b4eN2Dvg+8x8d8Yk/tnKpU1IOCX6lWUGIONMNHMKbN6ITydXj+quo92tvZOg==", + "requires": { + "@google-cloud/common": "^3.0.0", + "@google-cloud/promisify": "^2.0.0", + "arrify": "^2.0.0", + "extend": "^3.0.2", + "google-gax": "^2.17.1", + "is-html": "^2.0.0", + "protobufjs": "^6.8.8" + }, + "dependencies": { + "arrify": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/arrify/-/arrify-2.0.1.tgz", + "integrity": "sha512-3duEwti880xqi4eAMN8AyR4a0ByT90zoYdLlevfrvU43vb0YZwZVfxOgxWrLXXXpyugL0hNZc9G6BiB5B3nUug==" + }, + "google-auth-library": { + "version": "7.3.0", + "resolved": "https://registry.npmjs.org/google-auth-library/-/google-auth-library-7.3.0.tgz", + "integrity": "sha512-MPeeMlnsYnoiiVFMwX3hgaS684aiXrSqKoDP+xL4Ejg4Z0qLvIeg4XsaChemyFI8ZUO7ApwDAzNtgmhWSDNh5w==", + "requires": { + "arrify": "^2.0.0", + "base64-js": "^1.3.0", + "ecdsa-sig-formatter": "^1.0.11", + "fast-text-encoding": "^1.0.0", + "gaxios": "^4.0.0", + "gcp-metadata": "^4.2.0", + "gtoken": "^5.0.4", + "jws": "^4.0.0", + "lru-cache": "^6.0.0" + } + }, + "google-gax": { + "version": "2.19.0", + "resolved": "https://registry.npmjs.org/google-gax/-/google-gax-2.19.0.tgz", + "integrity": "sha512-2a6WY+p6YMVMmwXmkRqiLreXx67xHDZhkmflcL8aDUkl1csx9ywxEI01veoDXy6T1l0JJD6zLbl5TIbWimmXrw==", + "requires": { + "@grpc/grpc-js": "~1.3.0", + "@grpc/proto-loader": "^0.6.1", + "@types/long": "^4.0.0", + "abort-controller": "^3.0.0", + "duplexify": "^4.0.0", + "fast-text-encoding": "^1.0.3", + "google-auth-library": "^7.3.0", + "is-stream-ended": "^0.1.4", + "node-fetch": "^2.6.1", + "object-hash": "^2.1.1", + "protobufjs": "^6.10.2", + "retry-request": "^4.0.0" + } + }, + "jwa": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/jwa/-/jwa-2.0.0.tgz", + "integrity": "sha512-jrZ2Qx916EA+fq9cEAeCROWPTfCwi1IVHqT2tapuqLEVVDKFDENFw1oL+MwrTvH6msKxsd1YTDVw6uKEcsrLEA==", + "requires": { + "buffer-equal-constant-time": "1.0.1", + "ecdsa-sig-formatter": "1.0.11", + "safe-buffer": "^5.0.1" + } + }, + "jws": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/jws/-/jws-4.0.0.tgz", + "integrity": "sha512-KDncfTmOZoOMTFG4mBlG0qUIOlc03fmzH+ru6RgYVZhPkyiy/92Owlt/8UEN+a4TXR1FQetfIpJE8ApdvdVxTg==", + "requires": { + "jwa": "^2.0.0", + "safe-buffer": "^5.0.1" + } + }, + "lru-cache": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-6.0.0.tgz", + "integrity": "sha512-Jo6dJ04CmSjuznwJSS3pUeWmd/H0ffTlkXXgwZi+eq1UCmqQwCh+eLsYOYCwY991i2Fah4h1BEMCx4qThGbsiA==", + "requires": { + "yallist": "^4.0.0" + } + }, + "yallist": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-4.0.0.tgz", + "integrity": "sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A==" + } + } + }, "@grpc/grpc-js": { "version": "1.3.2", "resolved": "https://registry.npmjs.org/@grpc/grpc-js/-/grpc-js-1.3.2.tgz", @@ -8199,6 +8327,11 @@ "once": "^1.4.0" } }, + "ent": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/ent/-/ent-2.2.0.tgz", + "integrity": "sha1-6WQhkyWiHQX0RGai9obtbOX13R0=" + }, "entities": { "version": "1.1.2", "resolved": "https://registry.npmjs.org/entities/-/entities-1.1.2.tgz", @@ -10146,6 +10279,11 @@ "integrity": "sha512-aI5tKwNTBzOZApHIynaAwecLBv8TlZTEy/P4Sj2SzzAhBrGuI8yGZ0UIXVPQzOHGS+to2mjb04iy6VWt/8+d8A==", "dev": true }, + "html-tags": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/html-tags/-/html-tags-3.1.0.tgz", + "integrity": "sha512-1qYz89hW3lFDEazhjW0yVAV87lw8lVkrJocr72XmBkMKsoSVJCQx3W8BXsC7hO2qAt8BoVjYjtAcZ9perqGnNg==" + }, "htmlparser2": { "version": "3.10.1", "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-3.10.1.tgz", @@ -10889,6 +11027,14 @@ "integrity": "sha512-gyPJuv83bHMpocVYoqof5VDiZveEoGoFL8m3BXNb2VW8Xs+rz9kqO8LOQ5DH6EsuvilT1ApazU0pyl+ytbPtlw==", "dev": true }, + "is-html": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/is-html/-/is-html-2.0.0.tgz", + "integrity": "sha512-S+OpgB5i7wzIue/YSE5hg0e5ZYfG3hhpNh9KGl6ayJ38p7ED6wxQLd1TV91xHpcTvw90KMJ9EwN3F/iNflHBVg==", + "requires": { + "html-tags": "^3.0.0" + } + }, "is-installed-globally": { "version": "0.1.0", "resolved": "https://registry.npmjs.org/is-installed-globally/-/is-installed-globally-0.1.0.tgz", @@ -13219,6 +13365,11 @@ "lodash.toarray": "^4.4.0" } }, + "node-ensure": { + "version": "0.0.0", + "resolved": "https://registry.npmjs.org/node-ensure/-/node-ensure-0.0.0.tgz", + "integrity": "sha1-7K52QVDemYYexcgQ/V0Jaxg5Mqc=" + }, "node-fetch": { "version": "2.6.1", "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.6.1.tgz", @@ -16306,6 +16457,15 @@ "pinkie-promise": "^2.0.0" } }, + "pdf-extraction": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/pdf-extraction/-/pdf-extraction-1.0.2.tgz", + "integrity": "sha512-wVA4HvsvaNYVAH6wp0Tt5+AUHV3XIGM2KQMlOyblsn0YDSUKtTwCJq87F7vIbBnnKsc3noSpL/Bx/sfB1ZqpLA==", + "requires": { + "debug": "^3.1.0", + "node-ensure": "^0.0.0" + } + }, "pdf-text-extract": { "version": "1.3.1", "resolved": "https://registry.npmjs.org/pdf-text-extract/-/pdf-text-extract-1.3.1.tgz", @@ -19210,6 +19370,14 @@ "readable-stream": "^2.0.2" } }, + "stream-events": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/stream-events/-/stream-events-1.0.5.tgz", + "integrity": "sha512-E1GUzBSgvct8Jsb3v2X15pjzN1tYebtbLaMg+eBOUOAxgbLoSbT2NS91ckc5lJD1KfLjId+jXJRgo0qnV5Nerg==", + "requires": { + "stubs": "^3.0.0" + } + }, "stream-shift": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/stream-shift/-/stream-shift-1.0.1.tgz", @@ -19325,6 +19493,11 @@ "escape-string-regexp": "^1.0.2" } }, + "stubs": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/stubs/-/stubs-3.0.0.tgz", + "integrity": "sha1-6NK6H6nJBXAwPAMLaQD31fiavls=" + }, "superagent": { "version": "3.8.3", "resolved": "https://registry.npmjs.org/superagent/-/superagent-3.8.3.tgz", @@ -19611,6 +19784,25 @@ } } }, + "teeny-request": { + "version": "7.1.1", + "resolved": "https://registry.npmjs.org/teeny-request/-/teeny-request-7.1.1.tgz", + "integrity": "sha512-iwY6rkW5DDGq8hE2YgNQlKbptYpY5Nn2xecjQiNjOXWbKzPGUfmeUBCSQbbr306d7Z7U2N0TPl+/SwYRfua1Dg==", + "requires": { + "http-proxy-agent": "^4.0.0", + "https-proxy-agent": "^5.0.0", + "node-fetch": "^2.6.1", + "stream-events": "^1.0.5", + "uuid": "^8.0.0" + }, + "dependencies": { + "uuid": { + "version": "8.3.2", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-8.3.2.tgz", + "integrity": "sha512-+NYs2QeMWy+GWFOEm9xnn6HCDp0l7QBD7ml8zLUmJ+93Q5NF0NocErnwkTkXVFNiX3/fpC6afS8Dhb/gz7R7eg==" + } + } + }, "temp-dir": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/temp-dir/-/temp-dir-2.0.0.tgz", diff --git a/package.json b/package.json index 9ab8130b..9fba6b75 100644 --- a/package.json +++ b/package.json @@ -50,6 +50,7 @@ "dependencies": { "@azure/ms-rest-js": "2.5.1", "@google-cloud/pubsub": "^2.13.0", + "@google-cloud/translate": "^6.2.6", "@microsoft/microsoft-graph-client": "2.2.1", "@semantic-release/changelog": "5.0.1", "@semantic-release/exec": "5.0.0", @@ -89,6 +90,7 @@ "nexmo": "2.9.1", "node-cron": "3.0.0", "opn": "6.0.0", + "pdf-extraction": "^1.0.2", "phone": "2.4.21", "pragmatismo-io-framework": "1.0.20", "prism-media": "1.3.1", diff --git a/packages/core.gbapp/services/GBConversationalService.ts b/packages/core.gbapp/services/GBConversationalService.ts index f833470b..fa6167e2 100644 --- a/packages/core.gbapp/services/GBConversationalService.ts +++ b/packages/core.gbapp/services/GBConversationalService.ts @@ -62,6 +62,7 @@ const fs = require('fs'); const SpeechToTextV1 = require('ibm-watson/speech-to-text/v1'); const { IamAuthenticator } = require('ibm-watson/auth'); const marked = require('marked'); +const { Translate } = require('@google-cloud/translate').v2; /** * Provides basic services for handling messages and dispatching to back-end @@ -771,37 +772,60 @@ export class GBConversationalService { } text = text.replace('¿', ''); - let options = { - method: 'POST', - baseUrl: endPoint, - url: 'translate', - qs: { - 'api-version': '3.0', - to: [language] - }, - headers: { - 'Ocp-Apim-Subscription-Key': key, - 'Ocp-Apim-Subscription-Region': 'westeurope', - 'Content-type': 'application/json', - 'X-ClientTraceId': uuidv4().toString() - }, - body: [ - { - text: text - } - ], - json: true - }; + if (min.instance.googleProjectId) { + // Instantiates a client - try { + const translate = new Translate({ + projectId: min.instance.googleProjectId, + credentials: { client_email: min.instance.googleClientEmail, private_key: min.instance.googlePrivateKey.replace(/\\n/gm, '\n') } + }); - const results = await request(options); + try { - return results[0].translations[0].text; - } catch (error) { - const msg = `Error calling Translator service layer. Error is: ${error}.`; + const [translation] = await translate.translate(text, language); - return Promise.reject(new Error(msg)); + return translation; + } catch (error) { + const msg = `Error calling Google Translator service layer. Error is: ${error}.`; + + return Promise.reject(new Error(msg)); + } + + } + else { + + let options = { + method: 'POST', + baseUrl: endPoint, + url: 'translate', + qs: { + 'api-version': '3.0', + to: [language] + }, + headers: { + 'Ocp-Apim-Subscription-Key': key, + 'Ocp-Apim-Subscription-Region': 'westeurope', + 'Content-type': 'application/json', + 'X-ClientTraceId': uuidv4().toString() + }, + body: [ + { + text: text + } + ], + json: true + }; + + try { + + const results = await request(options); + + return results[0].translations[0].text; + } catch (error) { + const msg = `Error calling MSFT Translator service layer. Error is: ${error}.`; + + return Promise.reject(new Error(msg)); + } } } diff --git a/packages/kb.gbapp/services/KBService.ts b/packages/kb.gbapp/services/KBService.ts index e2415a00..f0ea6f3f 100644 --- a/packages/kb.gbapp/services/KBService.ts +++ b/packages/kb.gbapp/services/KBService.ts @@ -65,6 +65,7 @@ import { GuaribasAnswer, GuaribasQuestion, GuaribasSubject } from '../models'; import { GBConfigService } from './../../core.gbapp/services/GBConfigService'; const request = require('request-promise-native'); const textract = require('textract'); +const pdf = require("pdf-extraction"); /** * Result for quey on KB data. @@ -613,21 +614,28 @@ export class KBService implements IGBKBService { const files = await walkPromise(urlJoin(localPath, 'docs')); await CollectionUtil.asyncForEach(files, async file => { - if (file !== null && file.name.endsWith('.docx')) { - let content = await this.getTextFromWord(Path.join(file.root, file.name)); - - content = await min.conversationalService.translate(min, content, 'en'); - - if (content) { - await GuaribasAnswer.create({ - instanceId: instance.instanceId, - content: content, - format: '.docx', - media: file.name, - packageId: packageId - }); + let content = null; + let filePath = Path.join(file.root, file.name); + if (file !== null) { + if (file.name.endsWith('.docx')) { + content = await this.getTextFromFile(filePath); + } else if (file.name.endsWith('.pdf')) { + const read = await pdf(Fs.readFileSync(filePath)); + content = read.text; } } + + if (content) { + content = await min.conversationalService.translate(min, content, 'en'); + await GuaribasAnswer.create({ + instanceId: instance.instanceId, + content: content, + format: '.docx', + media: file.name, + packageId: packageId + }); + } + }); } @@ -762,7 +770,7 @@ export class KBService implements IGBKBService { return await request.post(options); } - private async getTextFromWord(filename: string) { + private async getTextFromFile(filename: string) { return new Promise(async (resolve, reject) => { textract.fromFileWithPath(filename, { preserveLineBreaks: true }, (error, text) => { if (error) {