diff --git a/package.json b/package.json index aac583f1..a34d8a3b 100644 --- a/package.json +++ b/package.json @@ -170,6 +170,7 @@ "pdf-extraction": "1.0.2", "pdf-parse": "1.1.1", "pdf-to-png-converter": "3.1.0", + "pdfjs-dist": "4.0.379", "pdfkit": "0.13.0", "phone": "3.1.30", "pizzip": "3.1.3", diff --git a/packages/gpt.gblib/services/ChatServices.ts b/packages/gpt.gblib/services/ChatServices.ts index 8407b142..42ea784c 100644 --- a/packages/gpt.gblib/services/ChatServices.ts +++ b/packages/gpt.gblib/services/ChatServices.ts @@ -48,13 +48,18 @@ import { GBConfigService } from '../../core.gbapp/services/GBConfigService.js'; import { GuaribasSubject } from '../../kb.gbapp/models/index.js'; import { Serialized } from "@langchain/core/load/serializable"; import { BaseCallbackHandler } from "@langchain/core/callbacks/base"; +import { pdfToPng, PngPageOutput } from 'pdf-to-png-converter'; import { DynamicStructuredTool } from "@langchain/core/tools"; import { BaseLLMOutputParser, OutputParserException, } from "@langchain/core/output_parsers"; import { ChatGeneration, Generation } from "@langchain/core/outputs"; -import { LunaryHandler } from "@langchain/community/callbacks/handlers/lunary"; +import { GBAdminService } from '../../admin.gbapp/services/GBAdminService.js'; +import { GBServer } from '../../../src/app.js'; +import urlJoin from 'url-join'; +import { getDocument } from "pdfjs-dist/legacy/build/pdf.mjs"; + export interface CustomOutputParserFields { } export type ExpectedOutput = string; @@ -126,23 +131,85 @@ export class CustomLLMOutputParser extends BaseLLMOutputParser { export class ChatServices { + private async pdfPageAsImage(min, filename, pageNumber) { + + const data = Fs.readFileSync(filename); + + // Converts the PDF to PNG. + + const pngPages: PngPageOutput[] = await pdfToPng(data, { + disableFontFace: false, + useSystemFonts: false, + viewportScale: 2.0, + pagesToProcess: [1], + strictPagesToProcess: false, + verbosityLevel: 0 + }); + + const gbaiName = DialogKeywords.getGBAIPath(min.botId, 'gbdata'); + + // Prepare an image on cache and return the GBFILE information. + + const localName = Path.join('work', gbaiName, 'cache', `img${GBAdminService.getRndReadableIdentifier()}.png`); + if (pngPages.length > 0) { + const buffer = pngPages[pageNumber - 1].content; + const url = urlJoin(GBServer.globals.publicAddress, min.botId, 'cache', Path.basename(localName)); + Fs.writeFileSync(localName, buffer, { encoding: null }); + return { localName: localName, url: url, data: buffer }; + } + } + private static async getRelevantContext( vectorStore: HNSWLib, sanitizedQuestion: string, numDocuments: number = 10 ): Promise { + if (sanitizedQuestion === '') { return ''; } const documents = await vectorStore.similaritySearch(sanitizedQuestion, numDocuments); - return documents - .map((doc) => doc.pageContent) - .join(', ') - .trim() - .replaceAll('\n', ' '); + let output = ''; + + await CollectionUtil.asyncForEach(documents, async (doc) => { + + const metadata = doc.metadata; + const filename = Path.basename(metadata.source); + const page = await ChatServices.findPageForTextInterval(doc.metadata.source, + metadata.loc.lines.from, metadata.loc.lines.to); + + output = `${output}\n\n\n\nThe following context is coming from ${filename} at page: ${page}, + memorize this block among document information and return when you are refering this part of content:\n\n\n\n ${doc.pageContent} \n\n\n\n.`; + }); + return output; } + public static async findPageForTextInterval(pdfPath, startLine, endLine) { + const data = new Uint8Array(Fs.readFileSync(pdfPath)); + const pdf = await getDocument({ data }).promise; + + // Loop através de cada página para encontrar o intervalo de texto + for (let i = 1; i <= pdf.numPages; i++) { + const page = await pdf.getPage(i); + const textContent = await page.getTextContent(); + const text = textContent.items.map(item => item['str']).join('\n'); + + const lines = text.split('\n'); + const numLines = lines.length; + + // Verificar se o intervalo de texto está nesta página + const nextPageLine = (i === pdf.numPages) ? numLines : endLine; + if (startLine <= numLines && endLine >= 0 && startLine <= nextPageLine) { + return i; + } + + startLine -= numLines; + endLine -= numLines; + } + + return -1; // Intervalo de texto não encontrado + } /** * Generate text * @@ -234,6 +301,11 @@ export class ChatServices { const combineDocumentsPrompt = ChatPromptTemplate.fromMessages([ AIMessagePromptTemplate.fromTemplate( ` + This is a sectioned context. + + Very important: When answering, *mention in the answer* the PDF filename and page number related to each block of information used to answer. + Eg.: filename.pdf, page 3 - filename2.pdf, page 55. + \n\n{context}\n\n And using \n\n{chat_history}\n\n @@ -278,7 +350,7 @@ export class ChatServices { }, context: async (output: string) => { const c = await ChatServices.getRelevantContext(docsContext, output); - return `${systemPrompt} \n ${c ? 'Use this context to answer:\n' + c: 'answer just with user question.'}`; + return `${systemPrompt} \n ${c ? 'Use this context to answer:\n' + c : 'answer just with user question.'}`; }, },