diff --git a/packages/admin.gbapp/dialogs/AdminDialog.ts b/packages/admin.gbapp/dialogs/AdminDialog.ts index 88733a50..d859ceb0 100644 --- a/packages/admin.gbapp/dialogs/AdminDialog.ts +++ b/packages/admin.gbapp/dialogs/AdminDialog.ts @@ -293,6 +293,8 @@ class AdminDialog extends IGBDialog { await min.conversationalService.sendText(min, step, `Starting publishing for ${botId} packages...`); packages.push(`${botId}.gbot`); packages.push(`${botId}.gbtheme`); + packages.push(`${botId}.gbdrive`); + packages.push(`${botId}.gbdata`); packages.push(`${botId}.gbkb`); packages.push(`${botId}.gbdialog`); skipError = true; @@ -305,6 +307,8 @@ class AdminDialog extends IGBDialog { if ( packageName.toLowerCase() === 'gbdialog' || + packageName.toLowerCase() === 'gbdrive' || + packageName.toLowerCase() === 'gbdata' || packageName.toLowerCase() === 'gbkb' || packageName.toLowerCase() === 'gbot' || packageName.toLowerCase() === 'gbtheme' diff --git a/packages/basic.gblib/services/DialogKeywords.ts b/packages/basic.gblib/services/DialogKeywords.ts index 92813675..f2429269 100644 --- a/packages/basic.gblib/services/DialogKeywords.ts +++ b/packages/basic.gblib/services/DialogKeywords.ts @@ -721,8 +721,9 @@ export class DialogKeywords { proc.roles = role; // Checks access. - - const filters = ['People.xlsx', `${role}=x`, `id=${user.userSystemId}`]; + + const file = process.env.GB_MODE === 'legacy' ? 'People.xlsx' : 'people.csv'; + const filters = [file, `${role}=x`, `id=${user.userSystemId}`]; const people = await sys.find({ pid, handle: null, args: filters }); if (!people) { diff --git a/packages/basic.gblib/services/KeywordsExpressions.ts b/packages/basic.gblib/services/KeywordsExpressions.ts index 107a07ee..c6cc8e02 100644 --- a/packages/basic.gblib/services/KeywordsExpressions.ts +++ b/packages/basic.gblib/services/KeywordsExpressions.ts @@ -522,6 +522,14 @@ export class KeywordsExpressions { } ]; + keywords[i++] = [ + /^\s*(LOG)(\s*)(.*)/gim, + ($0, $1, $2, $3) => { + const params = this.getParams($3, ['obj']); + return `await sys.log ({pid: pid, ${params}})`; + } + ]; + keywords[i++] = [ /^\s*(.*)\=\s*(DIR)(\s*)(.*)/gim, ($0, $1, $2, $3, $4) => { diff --git a/packages/core.gbapp/services/GBDeployer.ts b/packages/core.gbapp/services/GBDeployer.ts index d7633602..22f25cc8 100644 --- a/packages/core.gbapp/services/GBDeployer.ts +++ b/packages/core.gbapp/services/GBDeployer.ts @@ -510,7 +510,7 @@ export class GBDeployer implements IGBDeployer { localPath: string, remotePath: string, baseUrl: string = null, - client = null + client = null, onlyTextFiles = false ): Promise { const storageMode = process.env.GB_MODE; @@ -547,6 +547,19 @@ export class GBDeployer implements IGBDeployer { } } + if (onlyTextFiles && !obj.name.endsWith('.txt') || !obj.name.endsWith('.json') + && !obj.name.endsWith('.csv') && !obj.name.endsWith('.xlsx') && !obj.name.endsWith('.xls') + && !obj.name.endsWith('.xlsm') && !obj.name.endsWith('.xlsb') && !obj.name.endsWith('.xml') + && !obj.name.endsWith('.html') && !obj.name.endsWith('.htm') && !obj.name.endsWith('.md') + && !obj.name.endsWith('.docx') && !obj.name.endsWith('.pdf') && !obj.name.endsWith('.txt') + && !obj.name.endsWith('.doc') && !obj.name.endsWith('.pptx') && !obj.name.endsWith('.ppt') + + ) { + + download = false; + } + + if (download) { await minioClient.fGetObject(bucketName, obj.name, itemPath); await fs.utimes(itemPath, new Date(), new Date(obj.lastModified)); @@ -673,9 +686,20 @@ export class GBDeployer implements IGBDeployer { if (GBConfigService.get('GB_MODE') === 'local') { const filePath = path.join(GBConfigService.get('STORAGE_LIBRARY'), gbai, packageName); - await GBUtil.copyIfNewerRecursive(filePath, packageWorkFolder); + if (packageType === '.gbdrive' || packageType === '.gbdata') { + await GBUtil.copyIfNewerRecursive(filePath, packageWorkFolder, true); + }else { + await GBUtil.copyIfNewerRecursive(filePath, packageWorkFolder, false); + } } else { - await this.downloadFolder(min, path.join('work', `${gbai}`), packageName); + + if (packageType === '.gbdrive' || packageType === '.gbdata') { + await this.downloadFolder(min, path.join('work', `${gbai}`), packageName, undefined, undefined, true); + } + else + { + await this.downloadFolder(min, path.join('work', `${gbai}`), packageName); + } } } @@ -711,6 +735,10 @@ export class GBDeployer implements IGBDeployer { // Deploy platform packages here accordingly to their extension. switch (packageType) { + case '.gbdrive': + break; + case '.gbdata': + break; case '.gbot': // Extracts configuration information from .gbot files. diff --git a/packages/kb.gbapp/services/KBService.ts b/packages/kb.gbapp/services/KBService.ts index a27eadd0..b81cc0b7 100644 --- a/packages/kb.gbapp/services/KBService.ts +++ b/packages/kb.gbapp/services/KBService.ts @@ -1148,7 +1148,7 @@ export class KBService implements IGBKBService { const logoPath = path.join(packagePath, 'cache', logoFilename); await (image as any).write(logoPath); await min.core['setConfig'](min, 'Logo', logoFilename); - + } // Extract dominant colors from the screenshot @@ -1179,7 +1179,6 @@ export class KBService implements IGBKBService { files = files.concat(await this.crawl(min, website, visited, 0, maxDepth, page, websiteIgnoreUrls, maxDocuments)); await browser.close(); - GBLogEx.info(min, `Vectorizing ${files.length} file(s)...`); @@ -1200,7 +1199,7 @@ export class KBService implements IGBKBService { try { const document = await this.loadAndSplitFile(file); const flattenedDocuments = document.reduce((acc, val) => acc.concat(val), []); - await min['vectorStore'].addDocuments(flattenedDocuments); + // await min['vectorStore'].addDocuments(flattenedDocuments); } catch (error) { GBLogEx.info(min, `Ignore processing of ${file}. ${GBUtil.toYAML(error)}`); } @@ -1211,16 +1210,37 @@ export class KBService implements IGBKBService { files = await walkPromise(urlJoin(localPath, 'docs')); + // const gbdrive = path.join(process.env.PWD, 'work', GBUtil.getGBAIPath(min.botId, 'gbdrive')); + // files = files.concat(await walkPromise(gbdrive)); + + const gbdata = path.join(process.env.PWD, 'work', GBUtil.getGBAIPath(min.botId, 'gbdata')); + files = files.concat(await walkPromise(gbdata)); + + if (files[0]) { shouldSave = true; - GBLogEx.info(min, `Add embeddings from .gbkb: ${files.length} files being processed...`); + GBLogEx.info(min, `Add embeddings from packages, ${files.length} files being processed...`); await CollectionUtil.asyncForEach(files, async file => { let content = null; let filePath = path.join(file.root, file.name); + try { - const document = await this.loadAndSplitFile(filePath); - const flattenedDocuments = document.reduce((acc, val) => acc.concat(val), []); - await min['vectorStore'].addDocuments(flattenedDocuments); + if (file.name.endsWith('.csv')) { + // Read first 1000 lines of CSV file + const csvContent = await fs.readFile(filePath, 'utf8'); + const lines = csvContent.split('\n').slice(0, 200).join('\n'); + await fs.writeFile(filePath, lines, 'utf8'); + content = lines; + } + + const document = await this.loadAndSplitFile(filePath); + // TODO: Add full filename. + const flattenedDocuments = document.reduce((acc, val) => acc.concat(val), []); + await min['vectorStore'].addDocuments(flattenedDocuments); + GBLogEx.info(min, `Added ${filePath} to vector store.`); + } catch (error) { + GBLogEx.info(min, `Ignore processing of ${file}. ${GBUtil.toYAML(error)}`); + } }); } if (shouldSave && min['vectorStore']) { diff --git a/packages/llm.gblib/services/ChatServices.ts b/packages/llm.gblib/services/ChatServices.ts index 59934b12..d51d89e5 100644 --- a/packages/llm.gblib/services/ChatServices.ts +++ b/packages/llm.gblib/services/ChatServices.ts @@ -132,6 +132,8 @@ export class GBLLMOutputParser extends BaseLLMOutputParser { let { sources, text } = res; + let securityEnabled = false; + if (!sources) { GBLogEx.verbose(this.min, `LLM JSON output sources is NULL.`); @@ -139,6 +141,12 @@ export class GBLLMOutputParser extends BaseLLMOutputParser { else { await CollectionUtil.asyncForEach(sources, async source => { let found = false; + + if (securityEnabled) { + GBLogEx.info(this.min, `LLM JSON output security enabled.`); + + } + if (source && source.file.endsWith('.pdf')) { const gbaiName = GBUtil.getGBAIPath(this.min.botId, 'gbkb'); const localName = path.join(process.env.PWD, 'work', gbaiName, 'docs', source.file); @@ -181,7 +189,7 @@ export class ChatServices { if (sanitizedQuestion === '' || !vectorStore) { return ''; } - let documents = await vectorStore.similaritySearch(sanitizedQuestion, numDocuments * 10); + let documents = await vectorStore.similaritySearch(sanitizedQuestion, numDocuments ); const uniqueDocuments = {}; const MAX_DOCUMENTS = numDocuments; diff --git a/src/util.ts b/src/util.ts index 5d961dd6..5eef7c2b 100644 --- a/src/util.ts +++ b/src/util.ts @@ -12,7 +12,7 @@ import SwaggerClient from 'swagger-client'; import fs from 'fs/promises'; import { GBConfigService } from '../packages/core.gbapp/services/GBConfigService.js'; import path from 'path'; -import bcrypt from 'bcrypt'; +import bcrypt from 'bcrypt'; const saltRounds = 10; // The higher the number, the more secure but slower import { VerbosityLevel, getDocument } from 'pdfjs-dist/legacy/build/pdf.mjs'; import urljoin from 'url-join'; @@ -30,7 +30,7 @@ import { QueryTypes } from '@sequelize/core'; */ export class GBUtil { - + // When creating/updating a user (hashing before saving to DB) public static async hashPassword(password) { try { @@ -41,7 +41,7 @@ export class GBUtil { throw err; } } - + // When comparing passwords (like during login) public static async comparePassword(inputPassword, hashedPassword) { try { @@ -221,7 +221,7 @@ export class GBUtil { * @param {string} dest - The destination path. * @returns {Promise} A promise that resolves when the copy operation is complete. */ - public static async copyIfNewerRecursive(src: string, dest: string): Promise { + public static async copyIfNewerRecursive(src: string, dest: string, onlyTextFiles): Promise { // Check if the source exists if (!(await GBUtil.exists(src))) { return; @@ -242,22 +242,37 @@ export class GBUtil { const destEntry = path.join(dest, entry); // Recursively copy each entry - await this.copyIfNewerRecursive(srcEntry, destEntry); + await this.copyIfNewerRecursive(srcEntry, destEntry ,onlyTextFiles); } } else { - // Source is a file, check if we need to copy it - if (await GBUtil.exists(dest)) { - const srcStat = await fs.stat(src); - const destStat = await fs.stat(dest); - // Copy only if the source file is newer than the destination file - if (srcStat.mtime > destStat.mtime) { + let skip = false; + + if (onlyTextFiles && !( + src.endsWith('.txt') || src.endsWith('.json') + || src.endsWith('.csv') || src.endsWith('.xlsx') || src.endsWith('.xls') + || src.endsWith('.xlsm') || src.endsWith('.xlsb') || src.endsWith('.xml') + || src.endsWith('.html') || src.endsWith('.htm') || src.endsWith('.md') + || src.endsWith('.docx') || src.endsWith('.pdf') + || src.endsWith('.doc') || src.endsWith('.pptx') || src.endsWith('.ppt'))) { + skip = true; + } + + if (!skip) { + // Source is a file, check if we need to copy it + if (await GBUtil.exists(dest)) { + const srcStat = await fs.stat(src); + const destStat = await fs.stat(dest); + // Copy only if the source file is newer than the destination file + if (srcStat.mtime > destStat.mtime) { + await fs.cp(src, dest, { force: true }); + } + } else { + // Destination file doesn't exist, so copy it await fs.cp(src, dest, { force: true }); } - } else { - // Destination file doesn't exist, so copy it - await fs.cp(src, dest, { force: true }); } + } } @@ -392,32 +407,32 @@ export class GBUtil { /^index$/i, /^table of contents$/i, ]; - + // Check if page is mostly dots, numbers or blank const isDotLeaderPage = text.replace(/\s+/g, '').match(/\.{10,}/); const isNumbersPage = text.replace(/\s+/g, '').match(/^\d+$/); const isBlankPage = text.trim().length === 0; - + // Check if page has actual content const wordCount = text.trim().split(/\s+/).length; const hasMinimalContent = wordCount > 10; - + // Check if page matches any non-content patterns - const isNonContent = nonContentPatterns.some(pattern => + const isNonContent = nonContentPatterns.some(pattern => pattern.test(text.trim()) ); - + // Page is valid content if: // - Not mostly dots/numbers/blank // - Has minimal word count // - Doesn't match non-content patterns - return !isDotLeaderPage && - !isNumbersPage && - !isBlankPage && - hasMinimalContent && - !isNonContent; + return !isDotLeaderPage && + !isNumbersPage && + !isBlankPage && + hasMinimalContent && + !isNonContent; } - + }