2025-11-26 22:54:22 -03:00
|
|
|
use anyhow::Result;
|
Fix tasks UI, WebSocket progress, memory monitoring, and app generator
Tasks UI fixes:
- Fix task list to query auto_tasks table instead of tasks table
- Fix task detail endpoint to use UUID binding for auto_tasks query
- Add proper filter handling: complete, active, awaiting, paused, blocked
- Add TaskStats fields: awaiting, paused, blocked, time_saved
- Add /api/tasks/time-saved endpoint
- Add count-all to stats HTML response
App generator improvements:
- Add AgentActivity struct for detailed terminal-style progress
- Add emit_activity method for rich progress events
- Add detailed logging for LLM calls with timing
- Track files_written, tables_synced, bytes_generated
Memory and performance:
- Add memory_monitor module for tracking RSS and thread activity
- Skip 0-byte files in drive monitor and document processor
- Change DRIVE_MONITOR checking logs from info to trace
- Remove unused profile_section macro
WebSocket progress:
- Ensure TaskProgressEvent includes activity field
- Add with_activity builder method
2025-12-30 22:42:32 -03:00
|
|
|
use log::{debug, error, info, warn};
|
2025-11-26 22:54:22 -03:00
|
|
|
use serde::{Deserialize, Serialize};
|
|
|
|
|
use std::collections::HashMap;
|
|
|
|
|
use std::path::Path;
|
|
|
|
|
use tokio::io::AsyncReadExt;
|
Add video module, RBAC, security features, billing, contacts, dashboards, learn, social, and multiple new modules
Major additions:
- Video editing engine with AI features (transcription, captions, TTS, scene detection)
- RBAC middleware and organization management
- Security enhancements (MFA, passkey, DLP, encryption, audit)
- Billing and subscription management
- Contacts management
- Dashboards module
- Learn/LMS module
- Social features
- Compliance (SOC2, SOP middleware, vulnerability scanner)
- New migrations for RBAC, learn, and video tables
2026-01-08 13:16:17 -03:00
|
|
|
use crate::security::command_guard::SafeCommand;
|
2025-11-26 22:54:22 -03:00
|
|
|
|
|
|
|
|
/// Document formats supported by the processor, detected from file extension.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DocumentFormat {
    PDF,
    DOCX,
    XLSX,
    PPTX,
    TXT,
    MD,
    HTML,
    RTF,
    CSV,
    JSON,
    XML,
}

impl DocumentFormat {
    /// Detects the format from the path's extension (case-insensitive).
    ///
    /// Returns `None` when the path has no extension, the extension is not
    /// valid UTF-8, or it is not one of the recognized formats.
    pub fn from_extension(path: &Path) -> Option<Self> {
        let ext = path.extension()?.to_str()?.to_lowercase();
        Some(match ext.as_str() {
            "pdf" => Self::PDF,
            "docx" => Self::DOCX,
            "xlsx" => Self::XLSX,
            "pptx" => Self::PPTX,
            "txt" => Self::TXT,
            "md" | "markdown" => Self::MD,
            "html" | "htm" => Self::HTML,
            "rtf" => Self::RTF,
            "csv" => Self::CSV,
            "json" => Self::JSON,
            "xml" => Self::XML,
            _ => return None,
        })
    }

    /// Maximum accepted file size (in bytes) for this format.
    ///
    /// Files larger than this are rejected before extraction is attempted.
    pub fn max_size(&self) -> usize {
        const MIB: usize = 1024 * 1024;
        match self {
            Self::PDF => 500 * MIB,
            Self::PPTX => 200 * MIB,
            Self::DOCX | Self::XLSX | Self::TXT | Self::JSON | Self::XML => 100 * MIB,
            Self::HTML | Self::RTF => 50 * MIB,
            Self::MD => 10 * MIB,
            Self::CSV => 1024 * MIB,
        }
    }
}
|
|
|
|
|
|
|
|
|
|
/// Descriptive metadata for a processed document.
///
/// Every field is optional because extractors populate only what the source
/// file actually provides.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentMetadata {
    /// Document title, when present in the file's own metadata.
    pub title: Option<String>,
    /// Author name, when present.
    pub author: Option<String>,
    /// Creation timestamp as a string (format depends on the source file).
    pub creation_date: Option<String>,
    /// Last-modification timestamp as a string.
    pub modification_date: Option<String>,
    /// Number of pages, for paginated formats such as PDF.
    pub page_count: Option<usize>,
    /// Total word count of the extracted text.
    pub word_count: Option<usize>,
    /// Detected or declared document language.
    pub language: Option<String>,
}
|
|
|
|
|
|
|
|
|
|
/// One chunk of extracted document text together with its positional metadata.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TextChunk {
    /// The chunk's text content.
    pub content: String,
    /// Where this chunk came from and where it sits within the document.
    pub metadata: ChunkMetadata,
}
|
|
|
|
|
|
|
|
|
|
/// Provenance and position information for a [`TextChunk`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChunkMetadata {
    /// Full path of the source document.
    pub document_path: String,
    /// Document title (derived from the file stem when created by the chunker).
    pub document_title: Option<String>,
    /// Zero-based index of this chunk within the document.
    pub chunk_index: usize,
    /// Total number of chunks produced for the document.
    pub total_chunks: usize,
    /// Character offset (inclusive) where this chunk starts in the cleaned text.
    pub start_char: usize,
    /// Character offset (exclusive) where this chunk ends in the cleaned text.
    pub end_char: usize,
    /// Source page number, when the format tracks pages; `None` otherwise.
    pub page_number: Option<usize>,
}
|
|
|
|
|
|
|
|
|
|
/// Extracts text from documents and splits it into overlapping chunks
/// (e.g. for embedding/indexing in a knowledge base).
#[derive(Debug)]
pub struct DocumentProcessor {
    /// Target chunk length, in characters.
    chunk_size: usize,
    /// Number of characters shared between consecutive chunks.
    chunk_overlap: usize,
}
|
|
|
|
|
|
|
|
|
|
impl Default for DocumentProcessor {
|
|
|
|
|
fn default() -> Self {
|
|
|
|
|
Self {
|
2025-12-23 18:40:58 -03:00
|
|
|
chunk_size: 1000,
|
|
|
|
|
chunk_overlap: 200,
|
2025-11-26 22:54:22 -03:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl DocumentProcessor {
|
|
|
|
|
pub fn new(chunk_size: usize, chunk_overlap: usize) -> Self {
|
|
|
|
|
Self {
|
|
|
|
|
chunk_size,
|
|
|
|
|
chunk_overlap,
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn chunk_size(&self) -> usize {
|
|
|
|
|
self.chunk_size
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn chunk_overlap(&self) -> usize {
|
|
|
|
|
self.chunk_overlap
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub async fn process_document(&self, file_path: &Path) -> Result<Vec<TextChunk>> {
|
|
|
|
|
if !file_path.exists() {
|
2025-12-26 08:59:25 -03:00
|
|
|
return Err(anyhow::anyhow!("File not found: {}", file_path.display()));
|
2025-11-26 22:54:22 -03:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let metadata = tokio::fs::metadata(file_path).await?;
|
|
|
|
|
let file_size = metadata.len() as usize;
|
|
|
|
|
|
Fix tasks UI, WebSocket progress, memory monitoring, and app generator
Tasks UI fixes:
- Fix task list to query auto_tasks table instead of tasks table
- Fix task detail endpoint to use UUID binding for auto_tasks query
- Add proper filter handling: complete, active, awaiting, paused, blocked
- Add TaskStats fields: awaiting, paused, blocked, time_saved
- Add /api/tasks/time-saved endpoint
- Add count-all to stats HTML response
App generator improvements:
- Add AgentActivity struct for detailed terminal-style progress
- Add emit_activity method for rich progress events
- Add detailed logging for LLM calls with timing
- Track files_written, tables_synced, bytes_generated
Memory and performance:
- Add memory_monitor module for tracking RSS and thread activity
- Skip 0-byte files in drive monitor and document processor
- Change DRIVE_MONITOR checking logs from info to trace
- Remove unused profile_section macro
WebSocket progress:
- Ensure TaskProgressEvent includes activity field
- Add with_activity builder method
2025-12-30 22:42:32 -03:00
|
|
|
if file_size == 0 {
|
|
|
|
|
debug!(
|
|
|
|
|
"Skipping empty file (0 bytes): {}",
|
|
|
|
|
file_path.display()
|
|
|
|
|
);
|
|
|
|
|
return Ok(Vec::new());
|
|
|
|
|
}
|
|
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
let format = DocumentFormat::from_extension(file_path)
|
2025-12-26 08:59:25 -03:00
|
|
|
.ok_or_else(|| anyhow::anyhow!("Unsupported file format: {}", file_path.display()))?;
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
if file_size > format.max_size() {
|
|
|
|
|
return Err(anyhow::anyhow!(
|
|
|
|
|
"File too large: {} bytes (max: {} bytes)",
|
|
|
|
|
file_size,
|
|
|
|
|
format.max_size()
|
|
|
|
|
));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
info!(
|
2025-12-26 08:59:25 -03:00
|
|
|
"Processing document: {} (format: {:?}, size: {} bytes)",
|
|
|
|
|
file_path.display(),
|
|
|
|
|
format,
|
|
|
|
|
file_size
|
2025-11-26 22:54:22 -03:00
|
|
|
);
|
|
|
|
|
|
|
|
|
|
let text = self.extract_text(file_path, format).await?;
|
|
|
|
|
|
2025-12-26 08:59:25 -03:00
|
|
|
let cleaned_text = Self::clean_text(&text);
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
let chunks = self.create_chunks(&cleaned_text, file_path);
|
|
|
|
|
|
|
|
|
|
info!(
|
2025-12-26 08:59:25 -03:00
|
|
|
"Created {} chunks from document: {}",
|
2025-11-26 22:54:22 -03:00
|
|
|
chunks.len(),
|
2025-12-26 08:59:25 -03:00
|
|
|
file_path.display()
|
2025-11-26 22:54:22 -03:00
|
|
|
);
|
|
|
|
|
|
|
|
|
|
Ok(chunks)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async fn extract_text(&self, file_path: &Path, format: DocumentFormat) -> Result<String> {
|
|
|
|
|
match format {
|
|
|
|
|
DocumentFormat::TXT | DocumentFormat::MD => {
|
|
|
|
|
let mut file = tokio::fs::File::open(file_path).await?;
|
|
|
|
|
let mut contents = String::new();
|
|
|
|
|
file.read_to_string(&mut contents).await?;
|
|
|
|
|
Ok(contents)
|
|
|
|
|
}
|
|
|
|
|
DocumentFormat::PDF => self.extract_pdf_text(file_path).await,
|
|
|
|
|
DocumentFormat::DOCX => self.extract_docx_text(file_path).await,
|
|
|
|
|
DocumentFormat::HTML => self.extract_html_text(file_path).await,
|
|
|
|
|
DocumentFormat::CSV => self.extract_csv_text(file_path).await,
|
|
|
|
|
DocumentFormat::JSON => self.extract_json_text(file_path).await,
|
|
|
|
|
_ => {
|
|
|
|
|
warn!(
|
|
|
|
|
"Format {:?} extraction not yet implemented, using fallback",
|
|
|
|
|
format
|
|
|
|
|
);
|
|
|
|
|
self.fallback_text_extraction(file_path).await
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async fn extract_pdf_text(&self, file_path: &Path) -> Result<String> {
|
Add video module, RBAC, security features, billing, contacts, dashboards, learn, social, and multiple new modules
Major additions:
- Video editing engine with AI features (transcription, captions, TTS, scene detection)
- RBAC middleware and organization management
- Security enhancements (MFA, passkey, DLP, encryption, audit)
- Billing and subscription management
- Contacts management
- Dashboards module
- Learn/LMS module
- Social features
- Compliance (SOC2, SOP middleware, vulnerability scanner)
- New migrations for RBAC, learn, and video tables
2026-01-08 13:16:17 -03:00
|
|
|
let file_path_str = file_path.to_string_lossy().to_string();
|
|
|
|
|
let cmd_result = SafeCommand::new("pdftotext")
|
|
|
|
|
.and_then(|c| c.arg("-layout"))
|
|
|
|
|
.and_then(|c| c.arg(&file_path_str))
|
|
|
|
|
.and_then(|c| c.arg("-"));
|
|
|
|
|
|
|
|
|
|
let output = match cmd_result {
|
|
|
|
|
Ok(cmd) => cmd.execute_async().await,
|
|
|
|
|
Err(e) => {
|
|
|
|
|
warn!("Failed to build pdftotext command: {}", e);
|
|
|
|
|
return self.extract_pdf_with_library(file_path);
|
|
|
|
|
}
|
|
|
|
|
};
|
2025-11-26 22:54:22 -03:00
|
|
|
|
|
|
|
|
match output {
|
|
|
|
|
Ok(output) if output.status.success() => {
|
2025-12-26 08:59:25 -03:00
|
|
|
info!(
|
|
|
|
|
"Successfully extracted PDF with pdftotext: {}",
|
|
|
|
|
file_path.display()
|
|
|
|
|
);
|
2025-11-26 22:54:22 -03:00
|
|
|
Ok(String::from_utf8_lossy(&output.stdout).to_string())
|
|
|
|
|
}
|
|
|
|
|
_ => {
|
|
|
|
|
warn!(
|
2025-12-26 08:59:25 -03:00
|
|
|
"pdftotext failed for {}, trying library extraction",
|
|
|
|
|
file_path.display()
|
2025-11-26 22:54:22 -03:00
|
|
|
);
|
feat(autotask): Implement AutoTask system with intent classification and app generation
- Add IntentClassifier with 7 intent types (APP_CREATE, TODO, MONITOR, ACTION, SCHEDULE, GOAL, TOOL)
- Add AppGenerator with LLM-powered app structure analysis
- Add DesignerAI for modifying apps through conversation
- Add app_server for serving generated apps with clean URLs
- Add db_api for CRUD operations on bot database tables
- Add ask_later keyword for pending info collection
- Add migration 6.1.1 with tables: pending_info, auto_tasks, execution_plans, task_approvals, task_decisions, safety_audit_log, generated_apps, intent_classifications, designer_changes
- Write apps to S3 drive and sync to SITE_ROOT for serving
- Clean URL structure: /apps/{app_name}/
- Integrate with DriveMonitor for file sync
Based on Chapter 17 - Autonomous Tasks specification
2025-12-27 21:10:09 -03:00
|
|
|
self.extract_pdf_with_library(file_path)
|
2025-11-26 22:54:22 -03:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
feat(autotask): Implement AutoTask system with intent classification and app generation
- Add IntentClassifier with 7 intent types (APP_CREATE, TODO, MONITOR, ACTION, SCHEDULE, GOAL, TOOL)
- Add AppGenerator with LLM-powered app structure analysis
- Add DesignerAI for modifying apps through conversation
- Add app_server for serving generated apps with clean URLs
- Add db_api for CRUD operations on bot database tables
- Add ask_later keyword for pending info collection
- Add migration 6.1.1 with tables: pending_info, auto_tasks, execution_plans, task_approvals, task_decisions, safety_audit_log, generated_apps, intent_classifications, designer_changes
- Write apps to S3 drive and sync to SITE_ROOT for serving
- Clean URL structure: /apps/{app_name}/
- Integrate with DriveMonitor for file sync
Based on Chapter 17 - Autonomous Tasks specification
2025-12-27 21:10:09 -03:00
|
|
|
fn extract_pdf_with_library(&self, file_path: &Path) -> Result<String> {
|
|
|
|
|
let _ = self; // Suppress unused self warning
|
2026-01-24 22:04:47 -03:00
|
|
|
#[cfg(feature = "drive")]
|
|
|
|
|
{
|
|
|
|
|
use pdf_extract::extract_text;
|
2025-11-26 22:54:22 -03:00
|
|
|
|
2026-01-24 22:04:47 -03:00
|
|
|
match extract_text(file_path) {
|
|
|
|
|
Ok(text) => {
|
|
|
|
|
info!(
|
|
|
|
|
"Successfully extracted PDF with library: {}",
|
|
|
|
|
file_path.display()
|
|
|
|
|
);
|
|
|
|
|
return Ok(text);
|
|
|
|
|
}
|
|
|
|
|
Err(e) => {
|
|
|
|
|
warn!("PDF library extraction failed: {}", e);
|
|
|
|
|
}
|
2025-11-26 22:54:22 -03:00
|
|
|
}
|
|
|
|
|
}
|
2026-01-24 22:04:47 -03:00
|
|
|
|
|
|
|
|
Self::extract_pdf_basic_sync(file_path)
|
2025-11-26 22:54:22 -03:00
|
|
|
}
|
|
|
|
|
|
2025-12-26 08:59:25 -03:00
|
|
|
fn extract_pdf_basic_sync(file_path: &Path) -> Result<String> {
|
2026-01-24 22:04:47 -03:00
|
|
|
#[cfg(feature = "drive")]
|
|
|
|
|
{
|
|
|
|
|
if let Ok(text) = pdf_extract::extract_text(file_path) {
|
|
|
|
|
if !text.is_empty() {
|
|
|
|
|
return Ok(text);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Err(anyhow::anyhow!(
|
|
|
|
|
"Could not extract text from PDF. Please ensure pdftotext is installed."
|
|
|
|
|
))
|
2025-11-26 22:54:22 -03:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async fn extract_docx_text(&self, file_path: &Path) -> Result<String> {
|
Add video module, RBAC, security features, billing, contacts, dashboards, learn, social, and multiple new modules
Major additions:
- Video editing engine with AI features (transcription, captions, TTS, scene detection)
- RBAC middleware and organization management
- Security enhancements (MFA, passkey, DLP, encryption, audit)
- Billing and subscription management
- Contacts management
- Dashboards module
- Learn/LMS module
- Social features
- Compliance (SOC2, SOP middleware, vulnerability scanner)
- New migrations for RBAC, learn, and video tables
2026-01-08 13:16:17 -03:00
|
|
|
let file_path_str = file_path.to_string_lossy().to_string();
|
|
|
|
|
let cmd_result = SafeCommand::new("pandoc")
|
|
|
|
|
.and_then(|c| c.arg("-f"))
|
|
|
|
|
.and_then(|c| c.arg("docx"))
|
|
|
|
|
.and_then(|c| c.arg("-t"))
|
|
|
|
|
.and_then(|c| c.arg("plain"))
|
|
|
|
|
.and_then(|c| c.arg(&file_path_str));
|
|
|
|
|
|
|
|
|
|
let output = match cmd_result {
|
|
|
|
|
Ok(cmd) => cmd.execute_async().await,
|
|
|
|
|
Err(e) => {
|
|
|
|
|
warn!("Failed to build pandoc command: {}", e);
|
|
|
|
|
return self.fallback_text_extraction(file_path).await;
|
|
|
|
|
}
|
|
|
|
|
};
|
2025-11-26 22:54:22 -03:00
|
|
|
|
|
|
|
|
match output {
|
|
|
|
|
Ok(output) if output.status.success() => {
|
|
|
|
|
Ok(String::from_utf8_lossy(&output.stdout).to_string())
|
|
|
|
|
}
|
|
|
|
|
_ => {
|
|
|
|
|
warn!("pandoc failed for DOCX, using fallback");
|
|
|
|
|
self.fallback_text_extraction(file_path).await
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async fn extract_html_text(&self, file_path: &Path) -> Result<String> {
|
|
|
|
|
let contents = tokio::fs::read_to_string(file_path).await?;
|
|
|
|
|
|
|
|
|
|
let text = contents
|
|
|
|
|
.split('<')
|
|
|
|
|
.flat_map(|s| s.split('>').skip(1))
|
|
|
|
|
.collect::<Vec<_>>()
|
|
|
|
|
.join(" ");
|
|
|
|
|
|
|
|
|
|
Ok(text)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async fn extract_csv_text(&self, file_path: &Path) -> Result<String> {
|
|
|
|
|
let contents = tokio::fs::read_to_string(file_path).await?;
|
|
|
|
|
|
|
|
|
|
let mut text = String::new();
|
|
|
|
|
for line in contents.lines() {
|
|
|
|
|
text.push_str(line);
|
|
|
|
|
text.push('\n');
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Ok(text)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async fn extract_json_text(&self, file_path: &Path) -> Result<String> {
|
|
|
|
|
let contents = tokio::fs::read_to_string(file_path).await?;
|
|
|
|
|
|
|
|
|
|
if let Ok(json) = serde_json::from_str::<serde_json::Value>(&contents) {
|
2025-12-26 08:59:25 -03:00
|
|
|
Ok(Self::extract_json_strings(&json))
|
2025-11-26 22:54:22 -03:00
|
|
|
} else {
|
|
|
|
|
Ok(contents)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-26 08:59:25 -03:00
|
|
|
fn extract_json_strings(value: &serde_json::Value) -> String {
|
2025-11-26 22:54:22 -03:00
|
|
|
let mut result = String::new();
|
|
|
|
|
|
|
|
|
|
match value {
|
|
|
|
|
serde_json::Value::String(s) => {
|
|
|
|
|
result.push_str(s);
|
|
|
|
|
result.push(' ');
|
|
|
|
|
}
|
|
|
|
|
serde_json::Value::Array(arr) => {
|
|
|
|
|
for item in arr {
|
2025-12-26 08:59:25 -03:00
|
|
|
result.push_str(&Self::extract_json_strings(item));
|
2025-11-26 22:54:22 -03:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
serde_json::Value::Object(map) => {
|
|
|
|
|
for (_key, val) in map {
|
2025-12-26 08:59:25 -03:00
|
|
|
result.push_str(&Self::extract_json_strings(val));
|
2025-11-26 22:54:22 -03:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
_ => {}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
result
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async fn fallback_text_extraction(&self, file_path: &Path) -> Result<String> {
|
|
|
|
|
match tokio::fs::read_to_string(file_path).await {
|
|
|
|
|
Ok(contents) => Ok(contents),
|
|
|
|
|
Err(_) => {
|
|
|
|
|
let bytes = tokio::fs::read(file_path).await?;
|
|
|
|
|
Ok(String::from_utf8_lossy(&bytes).to_string())
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-26 08:59:25 -03:00
|
|
|
fn clean_text(text: &str) -> String {
|
2025-11-26 22:54:22 -03:00
|
|
|
let cleaned = text
|
|
|
|
|
.lines()
|
|
|
|
|
.map(|line| line.trim())
|
|
|
|
|
.filter(|line| !line.is_empty())
|
|
|
|
|
.collect::<Vec<_>>()
|
|
|
|
|
.join("\n");
|
|
|
|
|
|
|
|
|
|
cleaned
|
|
|
|
|
.chars()
|
|
|
|
|
.filter(|c| !c.is_control() || c.is_whitespace())
|
|
|
|
|
.collect::<String>()
|
|
|
|
|
.split_whitespace()
|
|
|
|
|
.collect::<Vec<_>>()
|
|
|
|
|
.join(" ")
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn create_chunks(&self, text: &str, file_path: &Path) -> Vec<TextChunk> {
|
|
|
|
|
let mut chunks = Vec::new();
|
|
|
|
|
let chars: Vec<char> = text.chars().collect();
|
|
|
|
|
let total_chars = chars.len();
|
|
|
|
|
|
|
|
|
|
if total_chars == 0 {
|
|
|
|
|
return chunks;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let mut start = 0;
|
|
|
|
|
let mut chunk_index = 0;
|
|
|
|
|
|
|
|
|
|
let step_size = self.chunk_size.saturating_sub(self.chunk_overlap);
|
|
|
|
|
let total_chunks = if step_size > 0 {
|
2025-12-26 08:59:25 -03:00
|
|
|
total_chars.div_ceil(step_size)
|
2025-11-26 22:54:22 -03:00
|
|
|
} else {
|
|
|
|
|
1
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
while start < total_chars {
|
|
|
|
|
let end = std::cmp::min(start + self.chunk_size, total_chars);
|
|
|
|
|
|
|
|
|
|
let mut chunk_end = end;
|
|
|
|
|
if end < total_chars {
|
|
|
|
|
for i in (start..end).rev() {
|
|
|
|
|
if chars[i].is_whitespace() {
|
|
|
|
|
chunk_end = i + 1;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let chunk_content: String = chars[start..chunk_end].iter().collect();
|
|
|
|
|
|
|
|
|
|
chunks.push(TextChunk {
|
|
|
|
|
content: chunk_content,
|
|
|
|
|
metadata: ChunkMetadata {
|
|
|
|
|
document_path: file_path.to_string_lossy().to_string(),
|
|
|
|
|
document_title: file_path
|
|
|
|
|
.file_stem()
|
|
|
|
|
.and_then(|s| s.to_str())
|
|
|
|
|
.map(|s| s.to_string()),
|
|
|
|
|
chunk_index,
|
|
|
|
|
total_chunks,
|
|
|
|
|
start_char: start,
|
|
|
|
|
end_char: chunk_end,
|
2025-12-23 18:40:58 -03:00
|
|
|
page_number: None,
|
2025-11-26 22:54:22 -03:00
|
|
|
},
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
chunk_index += 1;
|
|
|
|
|
|
|
|
|
|
start = if chunk_end >= self.chunk_overlap {
|
|
|
|
|
chunk_end - self.chunk_overlap
|
|
|
|
|
} else {
|
|
|
|
|
chunk_end
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
if start >= total_chars {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
chunks
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub async fn process_kb_folder(
|
|
|
|
|
&self,
|
|
|
|
|
kb_path: &Path,
|
|
|
|
|
) -> Result<HashMap<String, Vec<TextChunk>>> {
|
|
|
|
|
let mut results = HashMap::new();
|
|
|
|
|
|
|
|
|
|
if !kb_path.exists() {
|
|
|
|
|
return Err(anyhow::anyhow!(
|
2025-12-26 08:59:25 -03:00
|
|
|
"Knowledge base folder not found: {}",
|
|
|
|
|
kb_path.display()
|
2025-11-26 22:54:22 -03:00
|
|
|
));
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-26 08:59:25 -03:00
|
|
|
info!("Processing knowledge base folder: {}", kb_path.display());
|
2025-12-23 18:40:58 -03:00
|
|
|
|
2025-11-26 22:54:22 -03:00
|
|
|
self.process_directory_recursive(kb_path, &mut results)
|
|
|
|
|
.await?;
|
|
|
|
|
|
|
|
|
|
info!("Processed {} documents in knowledge base", results.len());
|
|
|
|
|
|
|
|
|
|
Ok(results)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn process_directory_recursive<'a>(
|
|
|
|
|
&'a self,
|
|
|
|
|
dir: &'a Path,
|
|
|
|
|
results: &'a mut HashMap<String, Vec<TextChunk>>,
|
|
|
|
|
) -> std::pin::Pin<Box<dyn std::future::Future<Output = Result<()>> + Send + 'a>> {
|
|
|
|
|
Box::pin(async move {
|
|
|
|
|
let mut entries = tokio::fs::read_dir(dir).await?;
|
|
|
|
|
|
|
|
|
|
while let Some(entry) = entries.next_entry().await? {
|
|
|
|
|
let path = entry.path();
|
|
|
|
|
let metadata = entry.metadata().await?;
|
|
|
|
|
|
|
|
|
|
if metadata.is_dir() {
|
|
|
|
|
self.process_directory_recursive(&path, results).await?;
|
2025-12-26 08:59:25 -03:00
|
|
|
} else if metadata.is_file() && DocumentFormat::from_extension(&path).is_some() {
|
|
|
|
|
match self.process_document(&path).await {
|
|
|
|
|
Ok(chunks) => {
|
|
|
|
|
let key = path.to_string_lossy().to_string();
|
|
|
|
|
results.insert(key, chunks);
|
|
|
|
|
}
|
|
|
|
|
Err(e) => {
|
|
|
|
|
error!("Failed to process document {}: {}", path.display(), e);
|
2025-11-26 22:54:22 -03:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
|
})
|
|
|
|
|
}
|
|
|
|
|
}
|