new(kb.gbapp): #259 .docx import done - first part.
This commit is contained in:
parent
fc55dda78c
commit
5d112db576
2 changed files with 84 additions and 12 deletions
|
@ -118,6 +118,7 @@
|
||||||
"ms-rest-azure": "3.0.0",
|
"ms-rest-azure": "3.0.0",
|
||||||
"nexmo": "2.9.1",
|
"nexmo": "2.9.1",
|
||||||
"node-cron": "3.0.2",
|
"node-cron": "3.0.2",
|
||||||
|
"node-html-parser": "^6.1.5",
|
||||||
"node-nlp": "4.24.0",
|
"node-nlp": "4.24.0",
|
||||||
"node-tesseract-ocr": "2.2.1",
|
"node-tesseract-ocr": "2.2.1",
|
||||||
"npm": "9.1.2",
|
"npm": "9.1.2",
|
||||||
|
|
|
@ -62,6 +62,7 @@ import { GBDeployer } from '../../core.gbapp/services/GBDeployer.js';
|
||||||
import { CSService } from '../../customer-satisfaction.gbapp/services/CSService.js';
|
import { CSService } from '../../customer-satisfaction.gbapp/services/CSService.js';
|
||||||
import { GuaribasAnswer, GuaribasQuestion, GuaribasSubject } from '../models/index.js';
|
import { GuaribasAnswer, GuaribasQuestion, GuaribasSubject } from '../models/index.js';
|
||||||
import { GBConfigService } from './../../core.gbapp/services/GBConfigService.js';
|
import { GBConfigService } from './../../core.gbapp/services/GBConfigService.js';
|
||||||
|
import { parse } from 'node-html-parser';
|
||||||
import textract from 'textract';
|
import textract from 'textract';
|
||||||
import pdf from 'pdf-extraction';
|
import pdf from 'pdf-extraction';
|
||||||
import { GBSSR } from '../../core.gbapp/services/GBSSR.js';
|
import { GBSSR } from '../../core.gbapp/services/GBSSR.js';
|
||||||
|
@ -70,6 +71,7 @@ import mammoth from 'mammoth';
|
||||||
import { url } from 'inspector';
|
import { url } from 'inspector';
|
||||||
import { min } from 'lodash';
|
import { min } from 'lodash';
|
||||||
import { GBAdminService } from '../../admin.gbapp/services/GBAdminService.js';
|
import { GBAdminService } from '../../admin.gbapp/services/GBAdminService.js';
|
||||||
|
import { text } from 'body-parser';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Result for quey on KB data.
|
* Result for quey on KB data.
|
||||||
|
@ -639,6 +641,7 @@ export class KBService implements IGBKBService {
|
||||||
*/
|
*/
|
||||||
public async importRemainingArticles(localPath: string, instance: IGBInstance, packageId: number): Promise<any> {
|
public async importRemainingArticles(localPath: string, instance: IGBInstance, packageId: number): Promise<any> {
|
||||||
const files = await walkPromise(urlJoin(localPath, 'articles'));
|
const files = await walkPromise(urlJoin(localPath, 'articles'));
|
||||||
|
const data = { questions: [], answers: [] };
|
||||||
|
|
||||||
await CollectionUtil.asyncForEach(files, async file => {
|
await CollectionUtil.asyncForEach(files, async file => {
|
||||||
if (file !== null && file.name.endsWith('.md')) {
|
if (file !== null && file.name.endsWith('.md')) {
|
||||||
|
@ -660,11 +663,11 @@ export class KBService implements IGBKBService {
|
||||||
} else if (file !== null && file.name.endsWith('.docx')) {
|
} else if (file !== null && file.name.endsWith('.docx')) {
|
||||||
const gbaiName = `${instance.botId}.gbai`;
|
const gbaiName = `${instance.botId}.gbai`;
|
||||||
const gbkbName = `${instance.botId}.gbkb`;
|
const gbkbName = `${instance.botId}.gbkb`;
|
||||||
const localName = Path.join('work', gbaiName, gbkbName, file.name);
|
const localName = Path.join('work', gbaiName, gbkbName, 'articles', file.name);
|
||||||
const buffer = Fs.readFileSync(localName, { encoding: null });
|
const buffer = Fs.readFileSync(localName, { encoding: null });
|
||||||
var options = {
|
var options = {
|
||||||
buffer: buffer,
|
buffer: buffer,
|
||||||
convertImage: mammoth.images.imgElement(async image => {
|
convertImage: async image => {
|
||||||
const localName = Path.join(
|
const localName = Path.join(
|
||||||
'work',
|
'work',
|
||||||
gbaiName,
|
gbaiName,
|
||||||
|
@ -675,19 +678,87 @@ export class KBService implements IGBKBService {
|
||||||
const buffer = await image.read();
|
const buffer = await image.read();
|
||||||
Fs.writeFileSync(localName, buffer, { encoding: null });
|
Fs.writeFileSync(localName, buffer, { encoding: null });
|
||||||
return { src: url };
|
return { src: url };
|
||||||
})
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
const answer = await mammoth.convertToHtml(options);
|
let state = 0;
|
||||||
await GuaribasAnswer.create(<GuaribasAnswer>{
|
let previousState = state;
|
||||||
|
const next = (root, el, data) => {
|
||||||
|
// If it is root, change to the first item.
|
||||||
|
|
||||||
|
if (el.parentNode == null) {
|
||||||
|
el = el.firstChild;
|
||||||
|
}
|
||||||
|
let value = el.innerHTML;
|
||||||
|
const isHeader = el => el.rawTagName.startsWith('h') && el.rawTagName.length === 2;
|
||||||
|
|
||||||
|
// Handle questions from H* elements.
|
||||||
|
|
||||||
|
if (state === 0) {
|
||||||
|
const question = {
|
||||||
|
from: 'document',
|
||||||
|
to: '',
|
||||||
|
subject1: '',
|
||||||
|
subject2: '',
|
||||||
|
subject3: '',
|
||||||
|
subject4: '',
|
||||||
|
content: value.replace(/["]+/g, ''),
|
||||||
instanceId: instance.instanceId,
|
instanceId: instance.instanceId,
|
||||||
content: answer.value,
|
skipIndex: 0,
|
||||||
format: '.docx',
|
packageId: packageId
|
||||||
|
};
|
||||||
|
data.questions.push(question);
|
||||||
|
previousState = state;
|
||||||
|
state = 1;
|
||||||
|
|
||||||
|
// Everything else is content for that Header.
|
||||||
|
} else if (state === 1) {
|
||||||
|
|
||||||
|
// If next element is null, the tree has been passed, so
|
||||||
|
// finish the append of other elements between the last Header
|
||||||
|
// and the end of the document.
|
||||||
|
|
||||||
|
if (!el.nextSibling || isHeader(el.nextSibling)) {
|
||||||
|
const answer = {
|
||||||
|
instanceId: instance.instanceId,
|
||||||
|
content: value,
|
||||||
|
format: '.html',
|
||||||
media: file.name,
|
media: file.name,
|
||||||
packageId: packageId,
|
packageId: packageId,
|
||||||
prevId: 0 // https://github.com/GeneralBots/BotServer/issues/312
|
prevId: 0
|
||||||
});
|
};
|
||||||
|
|
||||||
|
data.answers.push(answer);
|
||||||
|
|
||||||
|
state = 0;
|
||||||
|
|
||||||
|
// Otherwise, just append content to insert later.
|
||||||
|
|
||||||
|
} else {
|
||||||
|
value += value;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Goes to the next node, as it is all same level nodes.
|
||||||
|
|
||||||
|
if (el.nextSibling) {
|
||||||
|
next(root, el.nextSibling, data);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
const html = await mammoth.convertToHtml(options);
|
||||||
|
const root = parse(html.value);
|
||||||
|
next(root, root, data);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Persist to storage.
|
||||||
|
|
||||||
|
const answersCreated = await GuaribasAnswer.bulkCreate(data.answers);
|
||||||
|
let i = 0;
|
||||||
|
await CollectionUtil.asyncForEach(data.questions, async question => {
|
||||||
|
question.answerId = answersCreated[i++].answerId;
|
||||||
|
});
|
||||||
|
return await GuaribasQuestion.bulkCreate(data.questions);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue