fix(core.gbapp): Fix deployer group.

This commit is contained in:
Rodrigo Rodriguez 2024-03-20 00:42:44 -03:00
parent 063c149b94
commit 14b172187e
2 changed files with 80 additions and 7 deletions

View file

@ -170,6 +170,7 @@
"pdf-extraction": "1.0.2", "pdf-extraction": "1.0.2",
"pdf-parse": "1.1.1", "pdf-parse": "1.1.1",
"pdf-to-png-converter": "3.1.0", "pdf-to-png-converter": "3.1.0",
"pdfjs-dist": "4.0.379",
"pdfkit": "0.13.0", "pdfkit": "0.13.0",
"phone": "3.1.30", "phone": "3.1.30",
"pizzip": "3.1.3", "pizzip": "3.1.3",

View file

@ -48,13 +48,18 @@ import { GBConfigService } from '../../core.gbapp/services/GBConfigService.js';
import { GuaribasSubject } from '../../kb.gbapp/models/index.js'; import { GuaribasSubject } from '../../kb.gbapp/models/index.js';
import { Serialized } from "@langchain/core/load/serializable"; import { Serialized } from "@langchain/core/load/serializable";
import { BaseCallbackHandler } from "@langchain/core/callbacks/base"; import { BaseCallbackHandler } from "@langchain/core/callbacks/base";
import { pdfToPng, PngPageOutput } from 'pdf-to-png-converter';
import { DynamicStructuredTool } from "@langchain/core/tools"; import { DynamicStructuredTool } from "@langchain/core/tools";
import { import {
BaseLLMOutputParser, BaseLLMOutputParser,
OutputParserException, OutputParserException,
} from "@langchain/core/output_parsers"; } from "@langchain/core/output_parsers";
import { ChatGeneration, Generation } from "@langchain/core/outputs"; import { ChatGeneration, Generation } from "@langchain/core/outputs";
import { LunaryHandler } from "@langchain/community/callbacks/handlers/lunary"; import { GBAdminService } from '../../admin.gbapp/services/GBAdminService.js';
import { GBServer } from '../../../src/app.js';
import urlJoin from 'url-join';
import { getDocument } from "pdfjs-dist/legacy/build/pdf.mjs";
export interface CustomOutputParserFields { } export interface CustomOutputParserFields { }
export type ExpectedOutput = string; export type ExpectedOutput = string;
@ -126,23 +131,85 @@ export class CustomLLMOutputParser extends BaseLLMOutputParser<ExpectedOutput> {
export class ChatServices { export class ChatServices {
/**
 * Renders a single page of a PDF file to a PNG image and stores it in the
 * bot's cache folder.
 *
 * @param min Bot instance; only `min.botId` is read here.
 * @param filename Path of the PDF file on disk.
 * @param pageNumber 1-based number of the page to render.
 * @returns An object with the local file name, the public URL and the PNG
 * buffer, or `undefined` when the converter produced no image (for example
 * when the requested page does not exist).
 */
private async pdfPageAsImage(min, filename, pageNumber) {
  const data = Fs.readFileSync(filename);

  // Converts only the requested page of the PDF to PNG. Requesting page 1 and
  // then indexing the result by `pageNumber - 1` would yield `undefined` for
  // every page other than the first, because the output only contains the
  // pages that were asked for.
  const pngPages: PngPageOutput[] = await pdfToPng(data, {
    disableFontFace: false,
    useSystemFonts: false,
    viewportScale: 2.0,
    pagesToProcess: [pageNumber],
    strictPagesToProcess: false,
    verbosityLevel: 0
  });

  const gbaiName = DialogKeywords.getGBAIPath(min.botId, 'gbdata');

  // Prepare an image on cache and return the GBFILE information.
  const localName = Path.join('work', gbaiName, 'cache', `img${GBAdminService.getRndReadableIdentifier()}.png`);

  if (pngPages.length === 0) {
    return undefined;
  }

  // Exactly one page was requested, so its rendering is the first entry.
  const buffer = pngPages[0].content;
  const url = urlJoin(GBServer.globals.publicAddress, min.botId, 'cache', Path.basename(localName));

  Fs.writeFileSync(localName, buffer, { encoding: null });

  return { localName: localName, url: url, data: buffer };
}
private static async getRelevantContext( private static async getRelevantContext(
vectorStore: HNSWLib, vectorStore: HNSWLib,
sanitizedQuestion: string, sanitizedQuestion: string,
numDocuments: number = 10 numDocuments: number = 10
): Promise<string> { ): Promise<string> {
if (sanitizedQuestion === '') { if (sanitizedQuestion === '') {
return ''; return '';
} }
const documents = await vectorStore.similaritySearch(sanitizedQuestion, numDocuments); const documents = await vectorStore.similaritySearch(sanitizedQuestion, numDocuments);
return documents let output = '';
.map((doc) => doc.pageContent)
.join(', ') await CollectionUtil.asyncForEach(documents, async (doc) => {
.trim()
.replaceAll('\n', ' '); const metadata = doc.metadata;
const filename = Path.basename(metadata.source);
const page = await ChatServices.findPageForTextInterval(doc.metadata.source,
metadata.loc.lines.from, metadata.loc.lines.to);
output = `${output}\n\n\n\nThe following context is coming from ${filename} at page: ${page},
memorize this block among document information and return when you are refering this part of content:\n\n\n\n ${doc.pageContent} \n\n\n\n.`;
});
return output;
} }
/**
 * Finds the page of a PDF file that contains a given interval of text lines.
 *
 * Pages are scanned in order; the line count of each page is computed from
 * its extracted text items (one item per line) and subtracted from the
 * interval until the interval falls inside the current page.
 * NOTE(review): presumably `startLine`/`endLine` are counted over the text of
 * the whole document as produced by the text splitter - confirm with caller.
 *
 * @param pdfPath Path of the PDF file on disk.
 * @param startLine First line of the interval.
 * @param endLine Last line of the interval.
 * @returns The 1-based page number, or -1 when the interval was not found.
 */
public static async findPageForTextInterval(pdfPath, startLine, endLine) {
  const data = new Uint8Array(Fs.readFileSync(pdfPath));
  const loadingTask = getDocument({ data });

  try {
    const pdf = await loadingTask.promise;

    // Work on local copies so the caller-visible parameters are not mutated.
    let remainingStart = startLine;
    let remainingEnd = endLine;

    // Loop through each page to find the text interval.
    for (let i = 1; i <= pdf.numPages; i++) {
      const page = await pdf.getPage(i);
      const textContent = await page.getTextContent();
      const text = textContent.items.map(item => item['str']).join('\n');
      const numLines = text.split('\n').length;

      // Check whether the text interval is on this page.
      const nextPageLine = (i === pdf.numPages) ? numLines : remainingEnd;
      if (remainingStart <= numLines && remainingEnd >= 0 && remainingStart <= nextPageLine) {
        return i;
      }

      // Make the interval relative to the start of the next page.
      remainingStart -= numLines;
      remainingEnd -= numLines;
    }

    return -1; // Text interval not found.
  } finally {
    // Release the parsed document and its worker resources on every path,
    // including load failures.
    await loadingTask.destroy();
  }
}
/** /**
* Generate text * Generate text
* *
@ -234,6 +301,11 @@ export class ChatServices {
const combineDocumentsPrompt = ChatPromptTemplate.fromMessages([ const combineDocumentsPrompt = ChatPromptTemplate.fromMessages([
AIMessagePromptTemplate.fromTemplate( AIMessagePromptTemplate.fromTemplate(
` `
This is a sectioned context.
Very important: When answering, *mention in the answer* the PDF filename and page number related to each block of information used to answer.
Eg.: filename.pdf, page 3 - filename2.pdf, page 55.
\n\n{context}\n\n \n\n{context}\n\n
And using \n\n{chat_history}\n\n And using \n\n{chat_history}\n\n
@ -278,7 +350,7 @@ export class ChatServices {
}, },
context: async (output: string) => { context: async (output: string) => {
const c = await ChatServices.getRelevantContext(docsContext, output); const c = await ChatServices.getRelevantContext(docsContext, output);
return `${systemPrompt} \n ${c ? 'Use this context to answer:\n' + c: 'answer just with user question.'}`; return `${systemPrompt} \n ${c ? 'Use this context to answer:\n' + c : 'answer just with user question.'}`;
}, },
}, },