feat(KeywordsExpressions, DialogKeywords, AdminDialog, GBDeployer, KBService): enhance functionality and improve file handling
All checks were successful
GBCI / build (push) Successful in 1m20s
This commit is contained in:
parent 1ef6cbb032
commit f078881b0d
7 changed files with 122 additions and 38 deletions
@@ -293,6 +293,8 @@ class AdminDialog extends IGBDialog {
       await min.conversationalService.sendText(min, step, `Starting publishing for ${botId} packages...`);
       packages.push(`${botId}.gbot`);
       packages.push(`${botId}.gbtheme`);
+      packages.push(`${botId}.gbdrive`);
+      packages.push(`${botId}.gbdata`);
       packages.push(`${botId}.gbkb`);
       packages.push(`${botId}.gbdialog`);
       skipError = true;
@@ -305,6 +307,8 @@ class AdminDialog extends IGBDialog {
       if (
         packageName.toLowerCase() === 'gbdialog' ||
+        packageName.toLowerCase() === 'gbdrive' ||
+        packageName.toLowerCase() === 'gbdata' ||
        packageName.toLowerCase() === 'gbkb' ||
        packageName.toLowerCase() === 'gbot' ||
        packageName.toLowerCase() === 'gbtheme'
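
Note: both AdminDialog hunks extend the same set of recognized package suffixes. A minimal sketch of that set as a reusable allow-list (the constant and helper names are illustrative, not part of AdminDialog):

// Sketch only: the package kinds that publish/undeploy now recognize,
// mirroring the two || chains above.
const PACKAGE_KINDS = new Set(['gbot', 'gbtheme', 'gbdrive', 'gbdata', 'gbkb', 'gbdialog']);

function isKnownPackageKind(packageName: string): boolean {
  return PACKAGE_KINDS.has(packageName.toLowerCase());
}
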
@@ -721,8 +721,9 @@ export class DialogKeywords {
     proc.roles = role;

     // Checks access.

-    const filters = ['People.xlsx', `${role}=x`, `id=${user.userSystemId}`];
+    const file = process.env.GB_MODE === 'legacy' ? 'People.xlsx' : 'people.csv';
+    const filters = [file, `${role}=x`, `id=${user.userSystemId}`];
     const people = await sys.find({ pid, handle: null, args: filters });

     if (!people) {
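
The access check now picks its people data source by GB_MODE. A minimal sketch of that selection in isolation (function names are illustrative; only the file names and the filter shape come from the diff):

// Sketch: legacy mode keeps the People.xlsx sheet, any other mode uses people.csv.
function peopleSourceFile(): string {
  return process.env.GB_MODE === 'legacy' ? 'People.xlsx' : 'people.csv';
}

// Build the same role filter triple used by the keyword above.
function roleFilters(role: string, userSystemId: string): string[] {
  return [peopleSourceFile(), `${role}=x`, `id=${userSystemId}`];
}
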
@@ -522,6 +522,14 @@ export class KeywordsExpressions {
       }
     ];

+    keywords[i++] = [
+      /^\s*(LOG)(\s*)(.*)/gim,
+      ($0, $1, $2, $3) => {
+        const params = this.getParams($3, ['obj']);
+        return `await sys.log ({pid: pid, ${params}})`;
+      }
+    ];
+
     keywords[i++] = [
       /^\s*(.*)\=\s*(DIR)(\s*)(.*)/gim,
       ($0, $1, $2, $3, $4) => {
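
The new rule transpiles a BASIC LOG line into a sys.log call. A small standalone sketch of that rewrite (getParamsLike is a simplified stand-in for KeywordsExpressions.getParams, which is assumed here to name the single argument obj):

const LOG_RULE = /^\s*(LOG)(\s*)(.*)/gim;

// Simplified stand-in for getParams($3, ['obj']).
function getParamsLike(raw: string): string {
  return `obj: ${raw.trim()}`;
}

function transpileLog(line: string): string {
  return line.replace(LOG_RULE, (_m, _kw, _ws, rest) => `await sys.log ({pid: pid, ${getParamsLike(rest)}})`);
}

// transpileLog('LOG "publishing started"')
// => 'await sys.log ({pid: pid, obj: "publishing started"})'
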
@@ -510,7 +510,7 @@ export class GBDeployer implements IGBDeployer {
     localPath: string,
     remotePath: string,
     baseUrl: string = null,
-    client = null
+    client = null, onlyTextFiles = false
   ): Promise<any> {
     const storageMode = process.env.GB_MODE;

@@ -547,6 +547,19 @@ export class GBDeployer implements IGBDeployer {
           }
         }

+        if (onlyTextFiles && !obj.name.endsWith('.txt') || !obj.name.endsWith('.json')
+          && !obj.name.endsWith('.csv') && !obj.name.endsWith('.xlsx') && !obj.name.endsWith('.xls')
+          && !obj.name.endsWith('.xlsm') && !obj.name.endsWith('.xlsb') && !obj.name.endsWith('.xml')
+          && !obj.name.endsWith('.html') && !obj.name.endsWith('.htm') && !obj.name.endsWith('.md')
+          && !obj.name.endsWith('.docx') && !obj.name.endsWith('.pdf') && !obj.name.endsWith('.txt')
+          && !obj.name.endsWith('.doc') && !obj.name.endsWith('.pptx') && !obj.name.endsWith('.ppt')
+
+        ) {
+
+          download = false;
+        }
+
+
         if (download) {
           await minioClient.fGetObject(bucketName, obj.name, itemPath);
           await fs.utimes(itemPath, new Date(), new Date(obj.lastModified));
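
The new guard skips non-text objects when onlyTextFiles is set. A minimal sketch of the same filter written as an explicit allow-list (the helper name is an assumption, and the grouping reflects the apparent intent, since the committed condition mixes && and || without parentheses):

const TEXT_LIKE_EXTENSIONS = [
  '.txt', '.json', '.csv', '.xlsx', '.xls', '.xlsm', '.xlsb', '.xml',
  '.html', '.htm', '.md', '.docx', '.pdf', '.doc', '.pptx', '.ppt'
];

// Sketch: true when the object name ends with one of the text-like suffixes above.
function isTextLike(name: string): boolean {
  const lower = name.toLowerCase();
  return TEXT_LIKE_EXTENSIONS.some(ext => lower.endsWith(ext));
}

// Hypothetical use inside the MinIO listing loop:
// if (onlyTextFiles && !isTextLike(obj.name)) { download = false; }
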
@@ -673,9 +686,20 @@ export class GBDeployer implements IGBDeployer {

     if (GBConfigService.get('GB_MODE') === 'local') {
       const filePath = path.join(GBConfigService.get('STORAGE_LIBRARY'), gbai, packageName);
-      await GBUtil.copyIfNewerRecursive(filePath, packageWorkFolder);
+      if (packageType === '.gbdrive' || packageType === '.gbdata') {
+        await GBUtil.copyIfNewerRecursive(filePath, packageWorkFolder, true);
+      }else {
+        await GBUtil.copyIfNewerRecursive(filePath, packageWorkFolder, false);
+      }
     } else {
-      await this.downloadFolder(min, path.join('work', `${gbai}`), packageName);
+
+      if (packageType === '.gbdrive' || packageType === '.gbdata') {
+        await this.downloadFolder(min, path.join('work', `${gbai}`), packageName, undefined, undefined, true);
+      }
+      else
+      {
+        await this.downloadFolder(min, path.join('work', `${gbai}`), packageName);
+      }
     }
   }

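
In both branches the decision is the same: .gbdrive and .gbdata packages sync only text-like files. A compact sketch of that decision factored out (the SyncFn shape is assumed for illustration):

type SyncFn = (source: string, target: string, onlyTextFiles: boolean) => Promise<void>;

// Sketch: derive the flag once from the package type, then hand it to either
// GBUtil.copyIfNewerRecursive (local mode) or downloadFolder (storage mode).
async function syncPackage(packageType: string, source: string, target: string, sync: SyncFn): Promise<void> {
  const onlyTextFiles = packageType === '.gbdrive' || packageType === '.gbdata';
  await sync(source, target, onlyTextFiles);
}
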
@@ -711,6 +735,10 @@ export class GBDeployer implements IGBDeployer {
     // Deploy platform packages here accordingly to their extension.

     switch (packageType) {
+      case '.gbdrive':
+        break;
+      case '.gbdata':
+        break;
       case '.gbot':
         // Extracts configuration information from .gbot files.
@@ -1148,7 +1148,7 @@ export class KBService implements IGBKBService {
       const logoPath = path.join(packagePath, 'cache', logoFilename);
       await (image as any).write(logoPath);
       await min.core['setConfig'](min, 'Logo', logoFilename);

     }

     // Extract dominant colors from the screenshot
@@ -1179,7 +1179,6 @@ export class KBService implements IGBKBService {
       files = files.concat(await this.crawl(min, website, visited, 0, maxDepth, page, websiteIgnoreUrls, maxDocuments));

       await browser.close();

-

     GBLogEx.info(min, `Vectorizing ${files.length} file(s)...`);
@@ -1200,7 +1199,7 @@ export class KBService implements IGBKBService {
       try {
         const document = await this.loadAndSplitFile(file);
         const flattenedDocuments = document.reduce((acc, val) => acc.concat(val), []);
-        await min['vectorStore'].addDocuments(flattenedDocuments);
+        // await min['vectorStore'].addDocuments(flattenedDocuments);
       } catch (error) {
         GBLogEx.info(min, `Ignore processing of ${file}. ${GBUtil.toYAML(error)}`);
       }
@@ -1211,16 +1210,37 @@ export class KBService implements IGBKBService {

     files = await walkPromise(urlJoin(localPath, 'docs'));

+    // const gbdrive = path.join(process.env.PWD, 'work', GBUtil.getGBAIPath(min.botId, 'gbdrive'));
+    // files = files.concat(await walkPromise(gbdrive));
+
+    const gbdata = path.join(process.env.PWD, 'work', GBUtil.getGBAIPath(min.botId, 'gbdata'));
+    files = files.concat(await walkPromise(gbdata));
+

     if (files[0]) {
       shouldSave = true;
-      GBLogEx.info(min, `Add embeddings from .gbkb: ${files.length} files being processed...`);
+      GBLogEx.info(min, `Add embeddings from packages, ${files.length} files being processed...`);
       await CollectionUtil.asyncForEach(files, async file => {
         let content = null;
         let filePath = path.join(file.root, file.name);
-        const document = await this.loadAndSplitFile(filePath);
-        const flattenedDocuments = document.reduce((acc, val) => acc.concat(val), []);
-        await min['vectorStore'].addDocuments(flattenedDocuments);
+        try {
+
+          if (file.name.endsWith('.csv')) {
+            // Read first 1000 lines of CSV file
+            const csvContent = await fs.readFile(filePath, 'utf8');
+            const lines = csvContent.split('\n').slice(0, 200).join('\n');
+            await fs.writeFile(filePath, lines, 'utf8');
+            content = lines;
+          }
+
+          const document = await this.loadAndSplitFile(filePath);
+          // TODO: Add full filename.
+          const flattenedDocuments = document.reduce((acc, val) => acc.concat(val), []);
+          await min['vectorStore'].addDocuments(flattenedDocuments);
+          GBLogEx.info(min, `Added ${filePath} to vector store.`);
+        } catch (error) {
+          GBLogEx.info(min, `Ignore processing of ${file}. ${GBUtil.toYAML(error)}`);
+        }
       });
     }
     if (shouldSave && min['vectorStore']) {
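
Before embedding, CSV files are truncated in place so only their head is vectorized. A minimal sketch of that step on its own (the 200-line cap and the fs/promises calls mirror the diff; the example path is hypothetical):

import fs from 'fs/promises';

// Sketch: keep only the first 200 lines of a CSV before it is split and embedded.
async function truncateCsvForEmbedding(filePath: string, maxLines = 200): Promise<string> {
  const csvContent = await fs.readFile(filePath, 'utf8');
  const lines = csvContent.split('\n').slice(0, maxLines).join('\n');
  await fs.writeFile(filePath, lines, 'utf8');
  return lines;
}

// Hypothetical call: await truncateCsvForEmbedding('work/example.gbai/example.gbdata/people.csv');
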
@@ -132,6 +132,8 @@ export class GBLLMOutputParser extends BaseLLMOutputParser<ExpectedOutput> {

     let { sources, text } = res;

+    let securityEnabled = false;
+
     if (!sources) {

       GBLogEx.verbose(this.min, `LLM JSON output sources is NULL.`);
@@ -139,6 +141,12 @@ export class GBLLMOutputParser extends BaseLLMOutputParser<ExpectedOutput> {
     else {
       await CollectionUtil.asyncForEach(sources, async source => {
         let found = false;
+
+        if (securityEnabled) {
+          GBLogEx.info(this.min, `LLM JSON output security enabled.`);
+
+        }
+
         if (source && source.file.endsWith('.pdf')) {
           const gbaiName = GBUtil.getGBAIPath(this.min.botId, 'gbkb');
           const localName = path.join(process.env.PWD, 'work', gbaiName, 'docs', source.file);
@@ -181,7 +189,7 @@ export class ChatServices {
     if (sanitizedQuestion === '' || !vectorStore) {
       return '';
     }
-    let documents = await vectorStore.similaritySearch(sanitizedQuestion, numDocuments * 10);
+    let documents = await vectorStore.similaritySearch(sanitizedQuestion, numDocuments );
     const uniqueDocuments = {};
     const MAX_DOCUMENTS = numDocuments;

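
With the over-fetch factor removed, the retriever asks the store for exactly numDocuments candidates, and the uniqueDocuments / MAX_DOCUMENTS block that follows keeps at most that many distinct chunks. A rough sketch of that kind of post-processing (the exact keying used by ChatServices may differ; the document shape is reduced to pageContent for illustration):

// Sketch: drop duplicate chunks by content and cap the result at maxDocuments.
function dedupeAndCap<T extends { pageContent: string }>(docs: T[], maxDocuments: number): T[] {
  const seen = new Set<string>();
  const result: T[] = [];
  for (const doc of docs) {
    if (!seen.has(doc.pageContent) && result.length < maxDocuments) {
      seen.add(doc.pageContent);
      result.push(doc);
    }
  }
  return result;
}
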
src/util.ts (65 lines changed)
@@ -12,7 +12,7 @@ import SwaggerClient from 'swagger-client';
 import fs from 'fs/promises';
 import { GBConfigService } from '../packages/core.gbapp/services/GBConfigService.js';
 import path from 'path';
 import bcrypt from 'bcrypt';
 const saltRounds = 10; // The higher the number, the more secure but slower
 import { VerbosityLevel, getDocument } from 'pdfjs-dist/legacy/build/pdf.mjs';
 import urljoin from 'url-join';
@@ -30,7 +30,7 @@ import { QueryTypes } from '@sequelize/core';
 */
 export class GBUtil {


   // When creating/updating a user (hashing before saving to DB)
   public static async hashPassword(password) {
     try {
@@ -41,7 +41,7 @@ export class GBUtil {
       throw err;
     }
   }

   // When comparing passwords (like during login)
   public static async comparePassword(inputPassword, hashedPassword) {
     try {
@@ -221,7 +221,7 @@ export class GBUtil {
   * @param {string} dest - The destination path.
   * @returns {Promise<void>} A promise that resolves when the copy operation is complete.
   */
-  public static async copyIfNewerRecursive(src: string, dest: string): Promise<void> {
+  public static async copyIfNewerRecursive(src: string, dest: string, onlyTextFiles): Promise<void> {
     // Check if the source exists
     if (!(await GBUtil.exists(src))) {
       return;
@@ -242,22 +242,37 @@ export class GBUtil {
        const destEntry = path.join(dest, entry);

        // Recursively copy each entry
-        await this.copyIfNewerRecursive(srcEntry, destEntry);
+        await this.copyIfNewerRecursive(srcEntry, destEntry ,onlyTextFiles);
      }
    } else {
-      // Source is a file, check if we need to copy it
-      if (await GBUtil.exists(dest)) {
-        const srcStat = await fs.stat(src);
-        const destStat = await fs.stat(dest);
-
-        // Copy only if the source file is newer than the destination file
-        if (srcStat.mtime > destStat.mtime) {
+      let skip = false;
+
+      if (onlyTextFiles && !(
+        src.endsWith('.txt') || src.endsWith('.json')
+        || src.endsWith('.csv') || src.endsWith('.xlsx') || src.endsWith('.xls')
+        || src.endsWith('.xlsm') || src.endsWith('.xlsb') || src.endsWith('.xml')
+        || src.endsWith('.html') || src.endsWith('.htm') || src.endsWith('.md')
+        || src.endsWith('.docx') || src.endsWith('.pdf')
+        || src.endsWith('.doc') || src.endsWith('.pptx') || src.endsWith('.ppt'))) {
+        skip = true;
+      }
+
+      if (!skip) {
+        // Source is a file, check if we need to copy it
+        if (await GBUtil.exists(dest)) {
+          const srcStat = await fs.stat(src);
+          const destStat = await fs.stat(dest);
+          // Copy only if the source file is newer than the destination file
+          if (srcStat.mtime > destStat.mtime) {
+            await fs.cp(src, dest, { force: true });
+          }
+        } else {
+          // Destination file doesn't exist, so copy it
          await fs.cp(src, dest, { force: true });
        }
-      } else {
-        // Destination file doesn't exist, so copy it
-        await fs.cp(src, dest, { force: true });
      }

    }
  }
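
A usage sketch for the extended signature (the paths are placeholders; the boolean mirrors how GBDeployer passes the new onlyTextFiles flag in the hunks above):

// Hypothetical calls: mirror a .gbdata package copying only text-like files,
// and a .gbdialog package copying everything.
await GBUtil.copyIfNewerRecursive('library/example.gbai/example.gbdata', 'work/example.gbai/example.gbdata', true);
await GBUtil.copyIfNewerRecursive('library/example.gbai/example.gbdialog', 'work/example.gbai/example.gbdialog', false);
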
@@ -392,32 +407,32 @@ export class GBUtil {
       /^index$/i,
       /^table of contents$/i,
     ];

     // Check if page is mostly dots, numbers or blank
     const isDotLeaderPage = text.replace(/\s+/g, '').match(/\.{10,}/);
     const isNumbersPage = text.replace(/\s+/g, '').match(/^\d+$/);
     const isBlankPage = text.trim().length === 0;

     // Check if page has actual content
     const wordCount = text.trim().split(/\s+/).length;
     const hasMinimalContent = wordCount > 10;

     // Check if page matches any non-content patterns
     const isNonContent = nonContentPatterns.some(pattern =>
       pattern.test(text.trim())
     );

     // Page is valid content if:
     // - Not mostly dots/numbers/blank
     // - Has minimal word count
     // - Doesn't match non-content patterns
     return !isDotLeaderPage &&
       !isNumbersPage &&
       !isBlankPage &&
       hasMinimalContent &&
       !isNonContent;
   }



 }