feat(KeywordsExpressions, DialogKeywords, AdminDialog, GBDeployer, KBService): enhance functionality and improve file handling
All checks were successful
GBCI / build (push) Successful in 1m20s
This commit is contained in:
parent 1ef6cbb032
commit f078881b0d
7 changed files with 122 additions and 38 deletions
@@ -293,6 +293,8 @@ class AdminDialog extends IGBDialog {
      await min.conversationalService.sendText(min, step, `Starting publishing for ${botId} packages...`);
      packages.push(`${botId}.gbot`);
      packages.push(`${botId}.gbtheme`);
+     packages.push(`${botId}.gbdrive`);
+     packages.push(`${botId}.gbdata`);
      packages.push(`${botId}.gbkb`);
      packages.push(`${botId}.gbdialog`);
      skipError = true;
@@ -305,6 +307,8 @@ class AdminDialog extends IGBDialog {
      if (
        packageName.toLowerCase() === 'gbdialog' ||
+       packageName.toLowerCase() === 'gbdrive' ||
+       packageName.toLowerCase() === 'gbdata' ||
        packageName.toLowerCase() === 'gbkb' ||
        packageName.toLowerCase() === 'gbot' ||
        packageName.toLowerCase() === 'gbtheme'
@@ -721,8 +721,9 @@ export class DialogKeywords {
      proc.roles = role;

      // Checks access.
-     const filters = ['People.xlsx', `${role}=x`, `id=${user.userSystemId}`];
+     const file = process.env.GB_MODE === 'legacy' ? 'People.xlsx' : 'people.csv';
+     const filters = [file, `${role}=x`, `id=${user.userSystemId}`];
      const people = await sys.find({ pid, handle: null, args: filters });

      if (!people) {
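Read on its own, the change picks the people sheet by runtime mode and keeps the same role filter. A minimal standalone sketch of that lookup follows; the hasRole helper and its parameter names are illustrative, not part of the commit, and sys.find is assumed to accept the filter triple exactly as shown in the hunk.

// Hypothetical helper mirroring the access check above; sys, pid and the
// filter format come from the hunk, the function itself is illustrative.
async function hasRole(sys: any, pid: number, role: string, userSystemId: string): Promise<boolean> {
  const file = process.env.GB_MODE === 'legacy' ? 'People.xlsx' : 'people.csv';
  const filters = [file, `${role}=x`, `id=${userSystemId}`];
  const people = await sys.find({ pid, handle: null, args: filters });
  return !!people;
}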
@@ -522,6 +522,14 @@ export class KeywordsExpressions {
        }
      ];

+     keywords[i++] = [
+       /^\s*(LOG)(\s*)(.*)/gim,
+       ($0, $1, $2, $3) => {
+         const params = this.getParams($3, ['obj']);
+         return `await sys.log ({pid: pid, ${params}})`;
+       }
+     ];
+
      keywords[i++] = [
        /^\s*(.*)\=\s*(DIR)(\s*)(.*)/gim,
        ($0, $1, $2, $3, $4) => {
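For orientation, a minimal sketch of the rewrite the new LOG entry performs on a script line, assuming getParams maps the trailing text to a single obj parameter; transpileLog is an illustrative name, not repository code.

// Hypothetical sketch: what the LOG keyword turns a script line into.
// The regex and the sys.log call shape come from the hunk above.
function transpileLog(line: string): string {
  return line.replace(/^\s*(LOG)(\s*)(.*)/gim, ($0: string, $1: string, $2: string, $3: string) =>
    `await sys.log ({pid: pid, obj: ${$3}})`);
}

// transpileLog('LOG order') -> 'await sys.log ({pid: pid, obj: order})'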
@@ -510,7 +510,7 @@ export class GBDeployer implements IGBDeployer {
    localPath: string,
    remotePath: string,
    baseUrl: string = null,
-   client = null
+   client = null, onlyTextFiles = false
  ): Promise<any> {
    const storageMode = process.env.GB_MODE;
@@ -547,6 +547,19 @@ export class GBDeployer implements IGBDeployer {
        }
      }

+     if (onlyTextFiles && !obj.name.endsWith('.txt') || !obj.name.endsWith('.json')
+       && !obj.name.endsWith('.csv') && !obj.name.endsWith('.xlsx') && !obj.name.endsWith('.xls')
+       && !obj.name.endsWith('.xlsm') && !obj.name.endsWith('.xlsb') && !obj.name.endsWith('.xml')
+       && !obj.name.endsWith('.html') && !obj.name.endsWith('.htm') && !obj.name.endsWith('.md')
+       && !obj.name.endsWith('.docx') && !obj.name.endsWith('.pdf') && !obj.name.endsWith('.txt')
+       && !obj.name.endsWith('.doc') && !obj.name.endsWith('.pptx') && !obj.name.endsWith('.ppt')
+     ) {
+       download = false;
+     }
+
      if (download) {
        await minioClient.fGetObject(bucketName, obj.name, itemPath);
        await fs.utimes(itemPath, new Date(), new Date(obj.lastModified));
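Taken together, the new branch is meant to skip non-document objects when onlyTextFiles is set. A minimal allow-list sketch with the same intent follows; the shouldDownload helper and the TEXT_EXTENSIONS set are assumptions for illustration, not repository code.

// Illustrative only: an extension allow-list equivalent in intent to the
// onlyTextFiles filter above.
import path from 'path';

const TEXT_EXTENSIONS = new Set([
  '.txt', '.json', '.csv', '.xlsx', '.xls', '.xlsm', '.xlsb', '.xml',
  '.html', '.htm', '.md', '.docx', '.pdf', '.doc', '.pptx', '.ppt'
]);

function shouldDownload(name: string, onlyTextFiles: boolean): boolean {
  if (!onlyTextFiles) {
    return true;
  }
  return TEXT_EXTENSIONS.has(path.extname(name).toLowerCase());
}

// shouldDownload('report.pdf', true) === true
// shouldDownload('video.mp4', true) === false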
@@ -673,9 +686,20 @@ export class GBDeployer implements IGBDeployer {

      if (GBConfigService.get('GB_MODE') === 'local') {
        const filePath = path.join(GBConfigService.get('STORAGE_LIBRARY'), gbai, packageName);
-       await GBUtil.copyIfNewerRecursive(filePath, packageWorkFolder);
+       if (packageType === '.gbdrive' || packageType === '.gbdata') {
+         await GBUtil.copyIfNewerRecursive(filePath, packageWorkFolder, true);
+       } else {
+         await GBUtil.copyIfNewerRecursive(filePath, packageWorkFolder, false);
+       }
      } else {
-       await this.downloadFolder(min, path.join('work', `${gbai}`), packageName);
+       if (packageType === '.gbdrive' || packageType === '.gbdata') {
+         await this.downloadFolder(min, path.join('work', `${gbai}`), packageName, undefined, undefined, true);
+       } else {
+         await this.downloadFolder(min, path.join('work', `${gbai}`), packageName);
+       }
      }
    }
@@ -711,6 +735,10 @@ export class GBDeployer implements IGBDeployer {
      // Deploy platform packages here accordingly to their extension.
      switch (packageType) {
+       case '.gbdrive':
+         break;
+       case '.gbdata':
+         break;
        case '.gbot':
          // Extracts configuration information from .gbot files.
@@ -1148,7 +1148,7 @@ export class KBService implements IGBKBService {
      const logoPath = path.join(packagePath, 'cache', logoFilename);
      await (image as any).write(logoPath);
      await min.core['setConfig'](min, 'Logo', logoFilename);
    }

    // Extract dominant colors from the screenshot
@@ -1179,7 +1179,6 @@ export class KBService implements IGBKBService {
      files = files.concat(await this.crawl(min, website, visited, 0, maxDepth, page, websiteIgnoreUrls, maxDocuments));

      await browser.close();

      GBLogEx.info(min, `Vectorizing ${files.length} file(s)...`);
@@ -1200,7 +1199,7 @@ export class KBService implements IGBKBService {
      try {
        const document = await this.loadAndSplitFile(file);
        const flattenedDocuments = document.reduce((acc, val) => acc.concat(val), []);
-       await min['vectorStore'].addDocuments(flattenedDocuments);
+       // await min['vectorStore'].addDocuments(flattenedDocuments);
      } catch (error) {
        GBLogEx.info(min, `Ignore processing of ${file}. ${GBUtil.toYAML(error)}`);
      }
@@ -1211,16 +1210,37 @@ export class KBService implements IGBKBService {

      files = await walkPromise(urlJoin(localPath, 'docs'));

      // const gbdrive = path.join(process.env.PWD, 'work', GBUtil.getGBAIPath(min.botId, 'gbdrive'));
      // files = files.concat(await walkPromise(gbdrive));

+     const gbdata = path.join(process.env.PWD, 'work', GBUtil.getGBAIPath(min.botId, 'gbdata'));
+     files = files.concat(await walkPromise(gbdata));

      if (files[0]) {
        shouldSave = true;
-       GBLogEx.info(min, `Add embeddings from .gbkb: ${files.length} files being processed...`);
+       GBLogEx.info(min, `Add embeddings from packages, ${files.length} files being processed...`);
        await CollectionUtil.asyncForEach(files, async file => {
          let content = null;
          let filePath = path.join(file.root, file.name);
          try {
-           const document = await this.loadAndSplitFile(filePath);
-           const flattenedDocuments = document.reduce((acc, val) => acc.concat(val), []);
-           await min['vectorStore'].addDocuments(flattenedDocuments);
+           if (file.name.endsWith('.csv')) {
+             // Read first 1000 lines of CSV file
+             const csvContent = await fs.readFile(filePath, 'utf8');
+             const lines = csvContent.split('\n').slice(0, 200).join('\n');
+             await fs.writeFile(filePath, lines, 'utf8');
+             content = lines;
+           }
+
+           const document = await this.loadAndSplitFile(filePath);
+           // TODO: Add full filename.
+           const flattenedDocuments = document.reduce((acc, val) => acc.concat(val), []);
+           await min['vectorStore'].addDocuments(flattenedDocuments);
+           GBLogEx.info(min, `Added ${filePath} to vector store.`);
          } catch (error) {
            GBLogEx.info(min, `Ignore processing of ${file}. ${GBUtil.toYAML(error)}`);
          }
        });
      }
      if (shouldSave && min['vectorStore']) {
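As a standalone illustration, the CSV cap applied above amounts to rewriting the file with its first lines only before it is split and embedded; note the slice keeps 200 lines even though the adjacent comment mentions 1000. The truncateCsv helper below is illustrative, not repository code.

// Illustrative sketch of the in-place CSV cap used before embedding.
import fs from 'fs/promises';

async function truncateCsv(filePath: string, maxLines = 200): Promise<string> {
  const csvContent = await fs.readFile(filePath, 'utf8');
  const lines = csvContent.split('\n').slice(0, maxLines).join('\n');
  await fs.writeFile(filePath, lines, 'utf8');
  return lines;
}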
@@ -132,6 +132,8 @@ export class GBLLMOutputParser extends BaseLLMOutputParser<ExpectedOutput> {

    let { sources, text } = res;

+   let securityEnabled = false;
+
    if (!sources) {
      GBLogEx.verbose(this.min, `LLM JSON output sources is NULL.`);
@@ -139,6 +141,12 @@ export class GBLLMOutputParser extends BaseLLMOutputParser<ExpectedOutput> {
    else {
      await CollectionUtil.asyncForEach(sources, async source => {
        let found = false;

+       if (securityEnabled) {
+         GBLogEx.info(this.min, `LLM JSON output security enabled.`);
+       }
+
        if (source && source.file.endsWith('.pdf')) {
          const gbaiName = GBUtil.getGBAIPath(this.min.botId, 'gbkb');
          const localName = path.join(process.env.PWD, 'work', gbaiName, 'docs', source.file);
@@ -181,7 +189,7 @@ export class ChatServices {
    if (sanitizedQuestion === '' || !vectorStore) {
      return '';
    }
-   let documents = await vectorStore.similaritySearch(sanitizedQuestion, numDocuments * 10);
+   let documents = await vectorStore.similaritySearch(sanitizedQuestion, numDocuments);
    const uniqueDocuments = {};
    const MAX_DOCUMENTS = numDocuments;
src/util.ts
@@ -12,7 +12,7 @@ import SwaggerClient from 'swagger-client';
import fs from 'fs/promises';
import { GBConfigService } from '../packages/core.gbapp/services/GBConfigService.js';
import path from 'path';
import bcrypt from 'bcrypt';
const saltRounds = 10; // The higher the number, the more secure but slower
import { VerbosityLevel, getDocument } from 'pdfjs-dist/legacy/build/pdf.mjs';
import urljoin from 'url-join';
@@ -30,7 +30,7 @@ import { QueryTypes } from '@sequelize/core';
 */
export class GBUtil {

  // When creating/updating a user (hashing before saving to DB)
  public static async hashPassword(password) {
    try {
@@ -41,7 +41,7 @@ export class GBUtil {
      throw err;
    }
  }

  // When comparing passwords (like during login)
  public static async comparePassword(inputPassword, hashedPassword) {
    try {
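A minimal usage sketch of these helpers, assuming hashPassword and comparePassword delegate to bcrypt.hash and bcrypt.compare with the saltRounds constant imported above; the registerUser and login functions are illustrative callers, not part of the commit.

// Illustrative only: expected call pattern for the password helpers.
import bcrypt from 'bcrypt';

const saltRounds = 10;

async function registerUser(plainPassword: string): Promise<string> {
  // Store only the hash; bcrypt embeds the salt in the result string.
  return bcrypt.hash(plainPassword, saltRounds);
}

async function login(inputPassword: string, storedHash: string): Promise<boolean> {
  return bcrypt.compare(inputPassword, storedHash);
}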
@@ -221,7 +221,7 @@ export class GBUtil {
   * @param {string} dest - The destination path.
   * @returns {Promise<void>} A promise that resolves when the copy operation is complete.
   */
-  public static async copyIfNewerRecursive(src: string, dest: string): Promise<void> {
+  public static async copyIfNewerRecursive(src: string, dest: string, onlyTextFiles): Promise<void> {
    // Check if the source exists
    if (!(await GBUtil.exists(src))) {
      return;
@@ -242,22 +242,37 @@ export class GBUtil {
        const destEntry = path.join(dest, entry);

        // Recursively copy each entry
-       await this.copyIfNewerRecursive(srcEntry, destEntry);
+       await this.copyIfNewerRecursive(srcEntry, destEntry, onlyTextFiles);
      }
    } else {
-     // Source is a file, check if we need to copy it
-     if (await GBUtil.exists(dest)) {
-       const srcStat = await fs.stat(src);
-       const destStat = await fs.stat(dest);
-
-       // Copy only if the source file is newer than the destination file
-       if (srcStat.mtime > destStat.mtime) {
-         await fs.cp(src, dest, { force: true });
-       }
-     } else {
-       // Destination file doesn't exist, so copy it
-       await fs.cp(src, dest, { force: true });
-     }
+     let skip = false;
+
+     if (onlyTextFiles && !(
+       src.endsWith('.txt') || src.endsWith('.json')
+       || src.endsWith('.csv') || src.endsWith('.xlsx') || src.endsWith('.xls')
+       || src.endsWith('.xlsm') || src.endsWith('.xlsb') || src.endsWith('.xml')
+       || src.endsWith('.html') || src.endsWith('.htm') || src.endsWith('.md')
+       || src.endsWith('.docx') || src.endsWith('.pdf')
+       || src.endsWith('.doc') || src.endsWith('.pptx') || src.endsWith('.ppt'))) {
+       skip = true;
+     }
+
+     if (!skip) {
+       // Source is a file, check if we need to copy it
+       if (await GBUtil.exists(dest)) {
+         const srcStat = await fs.stat(src);
+         const destStat = await fs.stat(dest);
+         // Copy only if the source file is newer than the destination file
+         if (srcStat.mtime > destStat.mtime) {
+           await fs.cp(src, dest, { force: true });
+         }
+       } else {
+         // Destination file doesn't exist, so copy it
+         await fs.cp(src, dest, { force: true });
+       }
+     }
    }
  }
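An illustrative call, assuming GBUtil is imported from src/util.ts and the third argument behaves as in the hunk above; the bot id "mybot" and the syncBotData wrapper are made up for the example.

// Illustrative only: syncing a .gbdata folder into the work area while
// skipping anything that is not a text/document file.
import path from 'path';
import { GBUtil } from './util.js';

async function syncBotData(storageLibrary: string): Promise<void> {
  const src = path.join(storageLibrary, 'mybot.gbai', 'mybot.gbdata');
  const dest = path.join('work', 'mybot.gbai', 'mybot.gbdata');

  // true => copy only newer files with document extensions.
  await GBUtil.copyIfNewerRecursive(src, dest, true);
}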
@@ -392,32 +407,32 @@ export class GBUtil {
      /^index$/i,
      /^table of contents$/i,
    ];

    // Check if page is mostly dots, numbers or blank
    const isDotLeaderPage = text.replace(/\s+/g, '').match(/\.{10,}/);
    const isNumbersPage = text.replace(/\s+/g, '').match(/^\d+$/);
    const isBlankPage = text.trim().length === 0;

    // Check if page has actual content
    const wordCount = text.trim().split(/\s+/).length;
    const hasMinimalContent = wordCount > 10;

    // Check if page matches any non-content patterns
    const isNonContent = nonContentPatterns.some(pattern =>
      pattern.test(text.trim())
    );

    // Page is valid content if:
    // - Not mostly dots/numbers/blank
    // - Has minimal word count
    // - Doesn't match non-content patterns
    return !isDotLeaderPage &&
      !isNumbersPage &&
      !isBlankPage &&
      hasMinimalContent &&
      !isNonContent;
  }
}