fix(llm.gblib): Fix local DB usage in Talk to Data.
This commit is contained in:
parent
5b69a12d74
commit
29ddb89d2a
2 changed files with 50 additions and 0 deletions
|
@ -196,11 +196,18 @@ export class ChatServices {
|
|||
const doc = uniqueDocuments[filePaths];
|
||||
const metadata = doc.metadata;
|
||||
const filename = path.basename(metadata.source);
|
||||
|
||||
if (!GBUtil.isContentPage(doc.pageContent)){
|
||||
continue;
|
||||
}
|
||||
|
||||
let page = 0;
|
||||
if (metadata.source.endsWith('.pdf')) {
|
||||
page = await ChatServices.findPageForText(metadata.source, doc.pageContent);
|
||||
}
|
||||
|
||||
|
||||
|
||||
output = `${output}\n\n\n\nUse also the following context which is coming from Source Document: ${filename} at page: ${page ? page : 'entire document'
|
||||
}
|
||||
(you will fill the JSON sources collection field later),
|
||||
|
|
43
src/util.ts
43
src/util.ts
|
@ -360,4 +360,47 @@ export class GBUtil {
|
|||
const randomDelay = Math.floor(Math.random() * (max - min + 1) + min) * 1000;
|
||||
await new Promise(resolve => setTimeout(resolve, randomDelay));
|
||||
}
|
||||
|
||||
/**
 * Heuristically decides whether a piece of extracted document text is real
 * content, as opposed to a table of contents, index, blank page and similar.
 *
 * The text is rejected (returns `false`) when any of the following holds:
 * - it is not a string, or it is empty / whitespace-only;
 * - with all whitespace removed it contains a run of 10 or more dots
 *   (dot leaders such as "Chapter 1 . . . . . . . . . . 5");
 * - with all whitespace removed it consists solely of digits;
 * - it has 10 or fewer whitespace-separated words;
 * - its first line is a known non-content heading ("Index",
 *   "Table of Contents", "Appendix A", "Copyright ...", etc.).
 *
 * @param text Raw text of the page or chunk to classify.
 * @returns `true` when the text looks like real content, `false` otherwise.
 */
public static isContentPage(text: string): boolean {
  // Defensive guard: the value comes from document loaders, so a missing
  // page content is treated as "no content" rather than raising a TypeError.
  if (typeof text !== 'string') {
    return false;
  }

  const trimmed = text.trim();
  if (trimmed.length === 0) {
    // Blank page.
    return false;
  }

  // Headings that indicate a non-content page.
  const nonContentPatterns = [
    /^index$/i,
    /^contents$/i,
    /^table of contents$/i,
    /^appendix/i,
    /^glossary$/i,
    /^bibliography$/i,
    /^references$/i,
    /^acknowledgments?$/i,
    /^copyright/i,
    /^about the author/i
  ];

  // Whitespace is stripped first so that spaced leaders (". . . .") and
  // digits split across lines are detected as well.
  const compact = trimmed.replace(/\s+/g, '');
  const isDotLeaderPage = /\.{10,}/.test(compact);
  const isNumbersPage = /^\d+$/.test(compact);

  // Require more than 10 words before a text counts as content.
  const wordCount = trimmed.split(/\s+/).length;
  const hasMinimalContent = wordCount > 10;

  // Match the patterns against the first line only. Testing the whole text
  // made the fully anchored patterns (e.g. /^index$/) unreachable: anything
  // they matched was already rejected by the word-count check above.
  const lineBreakAt = trimmed.search(/[\r\n]/);
  const heading = (lineBreakAt === -1 ? trimmed : trimmed.slice(0, lineBreakAt)).trim();
  const isNonContent = nonContentPatterns.some(pattern => pattern.test(heading));

  return !isDotLeaderPage && !isNumbersPage && hasMinimalContent && !isNonContent;
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue