fix(llm.gblib): Talk to data local db use fix.

This commit is contained in:
Rodrigo Rodriguez 2024-11-24 12:57:14 -03:00
parent 5b69a12d74
commit 29ddb89d2a
2 changed files with 50 additions and 0 deletions

View file

@ -196,11 +196,18 @@ export class ChatServices {
const doc = uniqueDocuments[filePaths];
const metadata = doc.metadata;
const filename = path.basename(metadata.source);
if (!GBUtil.isContentPage(doc.pageContent)){
continue;
}
let page = 0;
if (metadata.source.endsWith('.pdf')) {
page = await ChatServices.findPageForText(metadata.source, doc.pageContent);
}
output = `${output}\n\n\n\nUse also the following context which is coming from Source Document: ${filename} at page: ${page ? page : 'entire document'
}
(you will fill the JSON sources collection field later),

View file

@ -360,4 +360,47 @@ export class GBUtil {
const randomDelay = Math.floor(Math.random() * (max - min + 1) + min) * 1000;
await new Promise(resolve => setTimeout(resolve, randomDelay));
}
/**
 * Heuristically decides whether a chunk of extracted document text is real
 * content, as opposed to a blank page, a digits-only page, a dot-leader
 * (table-of-contents style) page, or a section such as an index, glossary
 * or bibliography.
 *
 * @param text Raw text of the page or chunk.
 * @returns `true` when the text has more than 10 whitespace-separated words
 *   and none of the non-content heuristics match; `false` otherwise,
 *   including for blank, missing or non-string input.
 */
public static isContentPage(text: string): boolean {
  // Missing text is treated as "not content" instead of throwing on .replace().
  if (typeof text !== 'string') {
    return false;
  }

  const trimmed = text.trim();
  if (trimmed.length === 0) {
    // Blank page.
    return false;
  }

  // Section headings that indicate non-content pages. They are matched
  // against the first line only (see below).
  const nonContentPatterns = [
    /^index$/i,
    /^contents$/i,
    /^table of contents$/i,
    /^appendix/i,
    /^glossary$/i,
    /^bibliography$/i,
    /^references$/i,
    /^acknowledgments?$/i,
    /^copyright/i,
    /^about the author/i
  ];

  // Whitespace is removed so spaced-out leaders (". . . . .") and digit
  // groups ("12 34") are detected as well.
  const compact = trimmed.replace(/\s+/g, '');
  const isDotLeaderPage = /\.{10,}/.test(compact);
  const isNumbersPage = /^\d+$/.test(compact);

  // Require a minimum amount of actual wording.
  const wordCount = trimmed.split(/\s+/).length;
  const hasMinimalContent = wordCount > 10;

  // Test the heading patterns against the first line rather than the whole
  // page: a `$`-anchored pattern can only match the whole page when the page
  // is one to three words long, which the word-count check already rejects,
  // so a "Table of Contents" heading followed by entries would slip through.
  const [headingLine = ''] = trimmed.split(/\r?\n/, 1);
  const heading = headingLine.trim();
  const isNonContent = nonContentPatterns.some(pattern => pattern.test(heading));

  return !isDotLeaderPage && !isNumbersPage && hasMinimalContent && !isNonContent;
}
}