feat(KeywordsExpressions, DialogKeywords, AdminDialog, GBDeployer, KBService): enhance functionality and improve file handling
All checks were successful
GBCI / build (push) Successful in 1m20s

This commit is contained in:
Rodrigo Rodriguez (Pragmatismo) 2025-05-11 10:09:32 -03:00
parent 1ef6cbb032
commit f078881b0d
7 changed files with 122 additions and 38 deletions

View file

@ -293,6 +293,8 @@ class AdminDialog extends IGBDialog {
await min.conversationalService.sendText(min, step, `Starting publishing for ${botId} packages...`);
packages.push(`${botId}.gbot`);
packages.push(`${botId}.gbtheme`);
packages.push(`${botId}.gbdrive`);
packages.push(`${botId}.gbdata`);
packages.push(`${botId}.gbkb`);
packages.push(`${botId}.gbdialog`);
skipError = true;
@ -305,6 +307,8 @@ class AdminDialog extends IGBDialog {
if (
packageName.toLowerCase() === 'gbdialog' ||
packageName.toLowerCase() === 'gbdrive' ||
packageName.toLowerCase() === 'gbdata' ||
packageName.toLowerCase() === 'gbkb' ||
packageName.toLowerCase() === 'gbot' ||
packageName.toLowerCase() === 'gbtheme'

View file

@ -721,8 +721,9 @@ export class DialogKeywords {
proc.roles = role;
// Checks access.
const filters = ['People.xlsx', `${role}=x`, `id=${user.userSystemId}`];
const file = process.env.GB_MODE === 'legacy' ? 'People.xlsx' : 'people.csv';
const filters = [file, `${role}=x`, `id=${user.userSystemId}`];
const people = await sys.find({ pid, handle: null, args: filters });
if (!people) {

View file

@ -522,6 +522,14 @@ export class KeywordsExpressions {
}
];
keywords[i++] = [
/^\s*(LOG)(\s*)(.*)/gim,
($0, $1, $2, $3) => {
const params = this.getParams($3, ['obj']);
return `await sys.log ({pid: pid, ${params}})`;
}
];
keywords[i++] = [
/^\s*(.*)\=\s*(DIR)(\s*)(.*)/gim,
($0, $1, $2, $3, $4) => {

View file

@ -510,7 +510,7 @@ export class GBDeployer implements IGBDeployer {
localPath: string,
remotePath: string,
baseUrl: string = null,
client = null
client = null, onlyTextFiles = false
): Promise<any> {
const storageMode = process.env.GB_MODE;
@ -547,6 +547,19 @@ export class GBDeployer implements IGBDeployer {
}
}
// Restrict the download to text/document files only when the caller asked for it.
// The extension allow-list is negated as a single group: without the grouping,
// `&&` binds tighter than `||`, so the filter would also run when `onlyTextFiles`
// is false and would reject everything but `.txt` when it is true.
// The match is case-sensitive, like the equivalent check in GBUtil.copyIfNewerRecursive.
if (
  onlyTextFiles &&
  ![
    '.txt', '.json', '.csv',
    '.xlsx', '.xls', '.xlsm', '.xlsb',
    '.xml', '.html', '.htm', '.md',
    '.docx', '.doc', '.pdf',
    '.pptx', '.ppt'
  ].some(extension => obj.name.endsWith(extension))
) {
  download = false;
}
if (download) {
await minioClient.fGetObject(bucketName, obj.name, itemPath);
await fs.utimes(itemPath, new Date(), new Date(obj.lastModified));
@ -673,9 +686,20 @@ export class GBDeployer implements IGBDeployer {
if (GBConfigService.get('GB_MODE') === 'local') {
const filePath = path.join(GBConfigService.get('STORAGE_LIBRARY'), gbai, packageName);
await GBUtil.copyIfNewerRecursive(filePath, packageWorkFolder);
if (packageType === '.gbdrive' || packageType === '.gbdata') {
await GBUtil.copyIfNewerRecursive(filePath, packageWorkFolder, true);
}else {
await GBUtil.copyIfNewerRecursive(filePath, packageWorkFolder, false);
}
} else {
await this.downloadFolder(min, path.join('work', `${gbai}`), packageName);
if (packageType === '.gbdrive' || packageType === '.gbdata') {
await this.downloadFolder(min, path.join('work', `${gbai}`), packageName, undefined, undefined, true);
}
else
{
await this.downloadFolder(min, path.join('work', `${gbai}`), packageName);
}
}
}
@ -711,6 +735,10 @@ export class GBDeployer implements IGBDeployer {
// Deploy platform packages here according to their extension.
switch (packageType) {
case '.gbdrive':
break;
case '.gbdata':
break;
case '.gbot':
// Extracts configuration information from .gbot files.

View file

@ -1148,7 +1148,7 @@ export class KBService implements IGBKBService {
const logoPath = path.join(packagePath, 'cache', logoFilename);
await (image as any).write(logoPath);
await min.core['setConfig'](min, 'Logo', logoFilename);
}
// Extract dominant colors from the screenshot
@ -1179,7 +1179,6 @@ export class KBService implements IGBKBService {
files = files.concat(await this.crawl(min, website, visited, 0, maxDepth, page, websiteIgnoreUrls, maxDocuments));
await browser.close();
GBLogEx.info(min, `Vectorizing ${files.length} file(s)...`);
@ -1200,7 +1199,7 @@ export class KBService implements IGBKBService {
try {
const document = await this.loadAndSplitFile(file);
const flattenedDocuments = document.reduce((acc, val) => acc.concat(val), []);
await min['vectorStore'].addDocuments(flattenedDocuments);
// await min['vectorStore'].addDocuments(flattenedDocuments);
} catch (error) {
GBLogEx.info(min, `Ignore processing of ${file}. ${GBUtil.toYAML(error)}`);
}
@ -1211,16 +1210,37 @@ export class KBService implements IGBKBService {
files = await walkPromise(urlJoin(localPath, 'docs'));
// const gbdrive = path.join(process.env.PWD, 'work', GBUtil.getGBAIPath(min.botId, 'gbdrive'));
// files = files.concat(await walkPromise(gbdrive));
const gbdata = path.join(process.env.PWD, 'work', GBUtil.getGBAIPath(min.botId, 'gbdata'));
files = files.concat(await walkPromise(gbdata));
if (files[0]) {
shouldSave = true;
GBLogEx.info(min, `Add embeddings from .gbkb: ${files.length} files being processed...`);
GBLogEx.info(min, `Add embeddings from packages, ${files.length} files being processed...`);
await CollectionUtil.asyncForEach(files, async file => {
let content = null;
let filePath = path.join(file.root, file.name);
try {
const document = await this.loadAndSplitFile(filePath);
const flattenedDocuments = document.reduce((acc, val) => acc.concat(val), []);
await min['vectorStore'].addDocuments(flattenedDocuments);
if (file.name.endsWith('.csv')) {
// Keep only the first 200 lines of the CSV file (the file on disk is truncated in place)
const csvContent = await fs.readFile(filePath, 'utf8');
const lines = csvContent.split('\n').slice(0, 200).join('\n');
await fs.writeFile(filePath, lines, 'utf8');
content = lines;
}
const document = await this.loadAndSplitFile(filePath);
// TODO: Add full filename.
const flattenedDocuments = document.reduce((acc, val) => acc.concat(val), []);
await min['vectorStore'].addDocuments(flattenedDocuments);
GBLogEx.info(min, `Added ${filePath} to vector store.`);
} catch (error) {
GBLogEx.info(min, `Ignore processing of ${file}. ${GBUtil.toYAML(error)}`);
}
});
}
if (shouldSave && min['vectorStore']) {

View file

@ -132,6 +132,8 @@ export class GBLLMOutputParser extends BaseLLMOutputParser<ExpectedOutput> {
let { sources, text } = res;
let securityEnabled = false;
if (!sources) {
GBLogEx.verbose(this.min, `LLM JSON output sources is NULL.`);
@ -139,6 +141,12 @@ export class GBLLMOutputParser extends BaseLLMOutputParser<ExpectedOutput> {
else {
await CollectionUtil.asyncForEach(sources, async source => {
let found = false;
if (securityEnabled) {
GBLogEx.info(this.min, `LLM JSON output security enabled.`);
}
if (source && source.file.endsWith('.pdf')) {
const gbaiName = GBUtil.getGBAIPath(this.min.botId, 'gbkb');
const localName = path.join(process.env.PWD, 'work', gbaiName, 'docs', source.file);
@ -181,7 +189,7 @@ export class ChatServices {
if (sanitizedQuestion === '' || !vectorStore) {
return '';
}
let documents = await vectorStore.similaritySearch(sanitizedQuestion, numDocuments * 10);
let documents = await vectorStore.similaritySearch(sanitizedQuestion, numDocuments );
const uniqueDocuments = {};
const MAX_DOCUMENTS = numDocuments;

View file

@ -12,7 +12,7 @@ import SwaggerClient from 'swagger-client';
import fs from 'fs/promises';
import { GBConfigService } from '../packages/core.gbapp/services/GBConfigService.js';
import path from 'path';
import bcrypt from 'bcrypt';
import bcrypt from 'bcrypt';
const saltRounds = 10; // The higher the number, the more secure but slower
import { VerbosityLevel, getDocument } from 'pdfjs-dist/legacy/build/pdf.mjs';
import urljoin from 'url-join';
@ -30,7 +30,7 @@ import { QueryTypes } from '@sequelize/core';
*/
export class GBUtil {
// When creating/updating a user (hashing before saving to DB)
public static async hashPassword(password) {
try {
@ -41,7 +41,7 @@ export class GBUtil {
throw err;
}
}
// When comparing passwords (like during login)
public static async comparePassword(inputPassword, hashedPassword) {
try {
@ -221,7 +221,7 @@ export class GBUtil {
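* @param {string} dest - The destination path.
* @param {boolean} onlyTextFiles - When truthy, files whose names do not end with one of the listed text/document extensions are skipped.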
* @returns {Promise<void>} A promise that resolves when the copy operation is complete.
*/
public static async copyIfNewerRecursive(src: string, dest: string): Promise<void> {
public static async copyIfNewerRecursive(src: string, dest: string, onlyTextFiles): Promise<void> {
// Check if the source exists
if (!(await GBUtil.exists(src))) {
return;
@ -242,22 +242,37 @@ export class GBUtil {
const destEntry = path.join(dest, entry);
// Recursively copy each entry
await this.copyIfNewerRecursive(srcEntry, destEntry);
await this.copyIfNewerRecursive(srcEntry, destEntry ,onlyTextFiles);
}
} else {
// Source is a file, check if we need to copy it
if (await GBUtil.exists(dest)) {
const srcStat = await fs.stat(src);
const destStat = await fs.stat(dest);
// Copy only if the source file is newer than the destination file
if (srcStat.mtime > destStat.mtime) {
let skip = false;
if (onlyTextFiles && !(
src.endsWith('.txt') || src.endsWith('.json')
|| src.endsWith('.csv') || src.endsWith('.xlsx') || src.endsWith('.xls')
|| src.endsWith('.xlsm') || src.endsWith('.xlsb') || src.endsWith('.xml')
|| src.endsWith('.html') || src.endsWith('.htm') || src.endsWith('.md')
|| src.endsWith('.docx') || src.endsWith('.pdf')
|| src.endsWith('.doc') || src.endsWith('.pptx') || src.endsWith('.ppt'))) {
skip = true;
}
if (!skip) {
// Source is a file, check if we need to copy it
if (await GBUtil.exists(dest)) {
const srcStat = await fs.stat(src);
const destStat = await fs.stat(dest);
// Copy only if the source file is newer than the destination file
if (srcStat.mtime > destStat.mtime) {
await fs.cp(src, dest, { force: true });
}
} else {
// Destination file doesn't exist, so copy it
await fs.cp(src, dest, { force: true });
}
} else {
// Destination file doesn't exist, so copy it
await fs.cp(src, dest, { force: true });
}
}
}
@ -392,32 +407,32 @@ export class GBUtil {
/^index$/i,
/^table of contents$/i,
];
// Check if page is mostly dots, numbers or blank
const isDotLeaderPage = text.replace(/\s+/g, '').match(/\.{10,}/);
const isNumbersPage = text.replace(/\s+/g, '').match(/^\d+$/);
const isBlankPage = text.trim().length === 0;
// Check if page has actual content
const wordCount = text.trim().split(/\s+/).length;
const hasMinimalContent = wordCount > 10;
// Check if page matches any non-content patterns
const isNonContent = nonContentPatterns.some(pattern =>
const isNonContent = nonContentPatterns.some(pattern =>
pattern.test(text.trim())
);
// Page is valid content if:
// - Not mostly dots/numbers/blank
// - Has minimal word count
// - Doesn't match non-content patterns
return !isDotLeaderPage &&
!isNumbersPage &&
!isBlankPage &&
hasMinimalContent &&
!isNonContent;
return !isDotLeaderPage &&
!isNumbersPage &&
!isBlankPage &&
hasMinimalContent &&
!isNonContent;
}
}