fix(llm.gblib): Fix local DB use in Talk to data.
This commit is contained in:
parent
5b69a12d74
commit
29ddb89d2a
2 changed files with 50 additions and 0 deletions
|
@ -196,11 +196,18 @@ export class ChatServices {
|
||||||
const doc = uniqueDocuments[filePaths];
|
const doc = uniqueDocuments[filePaths];
|
||||||
const metadata = doc.metadata;
|
const metadata = doc.metadata;
|
||||||
const filename = path.basename(metadata.source);
|
const filename = path.basename(metadata.source);
|
||||||
|
|
||||||
|
if (!GBUtil.isContentPage(doc.pageContent)){
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
let page = 0;
|
let page = 0;
|
||||||
if (metadata.source.endsWith('.pdf')) {
|
if (metadata.source.endsWith('.pdf')) {
|
||||||
page = await ChatServices.findPageForText(metadata.source, doc.pageContent);
|
page = await ChatServices.findPageForText(metadata.source, doc.pageContent);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
output = `${output}\n\n\n\nUse also the following context which is coming from Source Document: ${filename} at page: ${page ? page : 'entire document'
|
output = `${output}\n\n\n\nUse also the following context which is coming from Source Document: ${filename} at page: ${page ? page : 'entire document'
|
||||||
}
|
}
|
||||||
(you will fill the JSON sources collection field later),
|
(you will fill the JSON sources collection field later),
|
||||||
|
|
43
src/util.ts
43
src/util.ts
|
@ -360,4 +360,47 @@ export class GBUtil {
|
||||||
const randomDelay = Math.floor(Math.random() * (max - min + 1) + min) * 1000;
|
const randomDelay = Math.floor(Math.random() * (max - min + 1) + min) * 1000;
|
||||||
await new Promise(resolve => setTimeout(resolve, randomDelay));
|
await new Promise(resolve => setTimeout(resolve, randomDelay));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Heuristically decides whether a piece of extracted document text is
 * real content, as opposed to front/back matter such as an index, a table
 * of contents, a bibliography, a page of page numbers or a blank page.
 *
 * A text is considered content only when all of the following hold:
 * - it is a non-empty string;
 * - with whitespace removed, it contains no run of 10 or more dots
 *   (dot leaders, as used in tables of contents);
 * - with whitespace removed, it is not made of digits only;
 * - it has more than 10 whitespace-separated words;
 * - its first line (the page heading) does not match a known
 *   non-content title.
 *
 * @param text Raw text of the page or chunk to classify.
 * @returns `true` when the text looks like real content, `false` otherwise.
 */
public static isContentPage(text: string): boolean {
  // The value comes from documents loaded at runtime; a missing page text
  // is treated as "not content" instead of throwing on `.trim()`.
  if (typeof text !== 'string') {
    return false;
  }

  const trimmed = text.trim();
  if (trimmed.length === 0) {
    // Blank page.
    return false;
  }

  // Headings that indicate non-content pages.
  const nonContentPatterns = [
    /^index$/i,
    /^contents$/i,
    /^table of contents$/i,
    /^appendix/i,
    /^glossary$/i,
    /^bibliography$/i,
    /^references$/i,
    /^acknowledgments?$/i,
    /^copyright/i,
    /^about the author/i
  ];

  // Dot leaders and digit-only pages are detected on the text with all
  // whitespace removed, so ". . . . ." and "12 13 14" are caught as well.
  const compact = trimmed.replace(/\s+/g, '');
  const isDotLeaderPage = /\.{10,}/.test(compact);
  const isNumbersPage = /^\d+$/.test(compact);

  // Pages with 10 words or fewer carry too little information to be useful.
  const wordCount = trimmed.split(/\s+/).length;
  const hasMinimalContent = wordCount > 10;

  // Match the patterns against the heading (first line) only. Testing the
  // `$`-anchored patterns against the whole page could only ever match pages
  // of one to three words, which the word-count rule already rejects, so a
  // page titled "Table of Contents" followed by entries was never detected.
  const lineBreakAt = trimmed.search(/[\r\n]/);
  const heading = (lineBreakAt === -1 ? trimmed : trimmed.slice(0, lineBreakAt)).trim();
  const isNonContent = nonContentPatterns.some(pattern => pattern.test(heading));

  return !isDotLeaderPage && !isNumbersPage && hasMinimalContent && !isNonContent;
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Reference in a new issue