feat(office): Add ooxmlsdk integration for Word/PowerPoint preservation

This commit is contained in:
Rodrigo Rodriguez (Pragmatismo) 2026-01-11 12:01:58 -03:00
parent 8a9a913ffb
commit 1850564e62
7 changed files with 919 additions and 340 deletions

View file

@ -211,8 +211,9 @@ rust_xlsxwriter = "0.79"
spreadsheet-ods = "1.0"
# Word/PowerPoint Support - MS Office 100% Compatibility
# ooxmlsdk preserves: Full document structure at XML level (100% round-trip)
docx-rs = "0.4"
ooxmlsdk = { version = "0.3", features = ["docx", "pptx"] }
ooxmlsdk = { version = "0.3", features = ["docx", "pptx", "parts", "office2021"] }
# ppt-rs disabled due to version conflict - using ooxmlsdk for PPTX support instead
# ppt-rs = { version = "0.2", default-features = false }

View file

@ -1,5 +1,6 @@
pub mod collaboration;
pub mod handlers;
pub mod ooxml;
pub mod storage;
pub mod types;
pub mod utils;

250
src/docs/ooxml.rs Normal file
View file

@ -0,0 +1,250 @@
use std::io::Cursor;
/// A DOCX opened for round-trip editing: the untouched original bytes are
/// retained so everything not edited (styles, numbering, media, settings)
/// survives a later save, alongside the extracted paragraph texts.
pub struct OoxmlDocument {
    // The full original .docx file, byte-for-byte.
    pub original_bytes: Vec<u8>,
    // Plain-text view of the body paragraphs, in document order.
    pub paragraphs: Vec<ParagraphInfo>,
}
/// One non-empty paragraph extracted from the document body.
pub struct ParagraphInfo {
    // Concatenated, XML-unescaped text of the paragraph's runs.
    pub text: String,
    // Zero-based position of the paragraph in the body; empty paragraphs
    // still consume an index so positions stay aligned with the document.
    pub index: usize,
}
/// Parses DOCX bytes with ooxmlsdk and returns the extracted paragraph
/// texts while retaining the original bytes for lossless round-tripping.
///
/// Returns a human-readable error string when the bytes are not a valid
/// WordprocessingML package.
pub fn load_docx_preserving(bytes: &[u8]) -> Result<OoxmlDocument, String> {
    use ooxmlsdk::parts::wordprocessing_document::WordprocessingDocument;
    let document = WordprocessingDocument::new(Cursor::new(bytes))
        .map_err(|e| format!("Failed to parse DOCX: {e}"))?;
    // Serialize the main part back to XML and scan it for paragraph text;
    // a serialization failure degrades to "no paragraphs", not an error.
    let xml = document
        .main_document_part
        .root_element
        .to_xml()
        .unwrap_or_default();
    Ok(OoxmlDocument {
        original_bytes: bytes.to_vec(),
        paragraphs: extract_paragraphs(&xml),
    })
}
/// Scans raw WordprocessingML for `<w:p>` elements and collects each
/// paragraph's visible text together with its position in the body.
///
/// Empty paragraphs still advance the index (so positions stay aligned
/// with the document) but are not returned.
fn extract_paragraphs(xml: &str) -> Vec<ParagraphInfo> {
    let mut paragraphs = Vec::new();
    let mut para_index = 0;
    let mut pos = 0;
    while let Some(p_start) = xml[pos..].find("<w:p") {
        let abs_start = pos + p_start;
        // Only accept a real paragraph tag (`<w:p>`, `<w:p ...>`, `<w:p/>`).
        // The bare prefix also matches sibling elements such as
        // `<w:proofErr>`, `<w:permStart>` or `<w:pgSz>`, which must be
        // skipped rather than parsed as paragraphs.
        match xml.as_bytes().get(abs_start + 4) {
            Some(b'>') | Some(b' ') | Some(b'/') => {}
            _ => {
                pos = abs_start + 4;
                continue;
            }
        }
        if let Some(p_end_rel) = xml[abs_start..].find("</w:p>") {
            let abs_end = abs_start + p_end_rel + 6; // include "</w:p>"
            let para_content = &xml[abs_start..abs_end];
            let text = extract_text_from_paragraph(para_content);
            if !text.trim().is_empty() {
                paragraphs.push(ParagraphInfo {
                    text,
                    index: para_index,
                });
            }
            para_index += 1;
            pos = abs_end;
        } else {
            // Unclosed (or self-closing final) paragraph: nothing more to scan.
            break;
        }
    }
    paragraphs
}
/// Concatenates the contents of every `<w:t>` run inside one paragraph's
/// XML and XML-unescapes the result.
fn extract_text_from_paragraph(para_xml: &str) -> String {
    let mut text = String::new();
    let mut pos = 0;
    while let Some(t_start) = para_xml[pos..].find("<w:t") {
        let abs_start = pos + t_start;
        // Reject look-alike tags: the prefix also matches `<w:tab/>`,
        // `<w:tbl>`, `<w:tc>`, etc. A real text tag is `<w:t>` or
        // `<w:t ...>` (e.g. with xml:space="preserve"). Without this
        // check, matching `<w:tab/>` leaks the following `<w:t>` opening
        // tag into the extracted text.
        match para_xml.as_bytes().get(abs_start + 4) {
            Some(b'>') | Some(b' ') => {}
            _ => {
                pos = abs_start + 4;
                continue;
            }
        }
        if let Some(content_start_rel) = para_xml[abs_start..].find('>') {
            let abs_content_start = abs_start + content_start_rel + 1;
            // Self-closing `<w:t .../>` carries no content and no closing tag.
            if para_xml[abs_start..abs_content_start].ends_with("/>") {
                pos = abs_content_start;
                continue;
            }
            if let Some(t_end_rel) = para_xml[abs_content_start..].find("</w:t>") {
                let content = &para_xml[abs_content_start..abs_content_start + t_end_rel];
                text.push_str(content);
                pos = abs_content_start + t_end_rel + 6; // past "</w:t>"
            } else {
                break;
            }
        } else {
            break;
        }
    }
    unescape_xml(&text)
}
/// Decodes the five predefined XML entities back to characters.
///
/// `&amp;` must be decoded LAST: decoding it first turns literal text such
/// as "&amp;lt;" into "&lt;", which a later pass then wrongly collapses to
/// "<" (a double decode).
fn unescape_xml(text: &str) -> String {
    text.replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&apos;", "'")
        .replace("&amp;", "&")
}
/// XML-escapes the five predefined entities in `text`.
///
/// A single character-wise pass is equivalent to the sequential-replace
/// formulation (with `&` handled first): ampersands introduced by one
/// substitution are never re-escaped by another.
fn escape_xml(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&apos;"),
            other => escaped.push(other),
        }
    }
    escaped
}
/// Round-trips DOCX bytes through ooxmlsdk (parse, then re-save),
/// verifying the package can be reconstructed; content is not modified.
pub fn save_docx_preserving(original_bytes: &[u8]) -> Result<Vec<u8>, String> {
    use ooxmlsdk::parts::wordprocessing_document::WordprocessingDocument;
    let document = WordprocessingDocument::new(Cursor::new(original_bytes))
        .map_err(|e| format!("Failed to parse DOCX: {e}"))?;
    let mut buffer = Cursor::new(Vec::new());
    document
        .save(&mut buffer)
        .map_err(|e| format!("Failed to save DOCX: {e}"))?;
    Ok(buffer.into_inner())
}
/// Rewrites `word/document.xml` inside the DOCX zip container with
/// `new_paragraphs`, copying every other archive entry through untouched
/// so styles, numbering, media and relationships are preserved.
///
/// Returns the bytes of the rebuilt .docx, or a descriptive error string.
/// NOTE(review): every entry is re-deflated; original per-entry
/// compression methods are not preserved — acceptable for OPC packages.
pub fn update_docx_text(
    original_bytes: &[u8],
    new_paragraphs: &[String],
) -> Result<Vec<u8>, String> {
    use std::io::{Read, Write};
    use zip::{write::SimpleFileOptions, ZipArchive, ZipWriter};
    let reader = Cursor::new(original_bytes);
    let mut archive =
        ZipArchive::new(reader).map_err(|e| format!("Failed to open DOCX archive: {e}"))?;
    let mut output_buf = Cursor::new(Vec::new());
    {
        // Scope the writer so it releases its borrow of `output_buf`
        // before we take the buffer back out.
        let mut zip_writer = ZipWriter::new(&mut output_buf);
        let options =
            SimpleFileOptions::default().compression_method(zip::CompressionMethod::Deflated);
        for i in 0..archive.len() {
            let mut file = archive
                .by_index(i)
                .map_err(|e| format!("Failed to read archive entry: {e}"))?;
            let name = file.name().to_string();
            if name == "word/document.xml" {
                // The main document part: patch paragraph texts in the XML.
                let mut content = String::new();
                file.read_to_string(&mut content)
                    .map_err(|e| format!("Failed to read document.xml: {e}"))?;
                let modified_content = replace_paragraph_texts(&content, new_paragraphs);
                zip_writer
                    .start_file(&name, options)
                    .map_err(|e| format!("Failed to start file in zip: {e}"))?;
                zip_writer
                    .write_all(modified_content.as_bytes())
                    .map_err(|e| format!("Failed to write document.xml: {e}"))?;
            } else {
                // Any other part is copied through verbatim.
                let mut buf = Vec::new();
                file.read_to_end(&mut buf)
                    .map_err(|e| format!("Failed to read file: {e}"))?;
                zip_writer
                    .start_file(&name, options)
                    .map_err(|e| format!("Failed to start file in zip: {e}"))?;
                zip_writer
                    .write_all(&buf)
                    .map_err(|e| format!("Failed to write file: {e}"))?;
            }
        }
        zip_writer
            .finish()
            .map_err(|e| format!("Failed to finish zip: {e}"))?;
    }
    Ok(output_buf.into_inner())
}
/// Replaces paragraph texts in `word/document.xml` markup, in document
/// order, with entries from `new_paragraphs`. Only paragraphs containing
/// a `<w:t>` run consume an entry; once entries run out, remaining
/// paragraphs are left untouched (but still counted).
fn replace_paragraph_texts(xml: &str, new_paragraphs: &[String]) -> String {
    let mut result = xml.to_string();
    let mut para_idx = 0;
    let mut search_pos = 0;
    loop {
        // Take whichever paragraph opening form occurs FIRST. The previous
        // `find("<w:p ").or_else(|| find("<w:p>"))` used the attributed
        // match's position even when a plain `<w:p>` occurred earlier,
        // skipping those paragraphs and desynchronizing the index.
        let tail = &result[search_pos..];
        let p_start = match (tail.find("<w:p "), tail.find("<w:p>")) {
            (Some(a), Some(b)) => a.min(b),
            (Some(a), None) => a,
            (None, Some(b)) => b,
            (None, None) => break,
        };
        let abs_start = search_pos + p_start;
        if let Some(p_end_rel) = result[abs_start..].find("</w:p>") {
            let abs_end = abs_start + p_end_rel + 6; // include "</w:p>"
            let para_content = result[abs_start..abs_end].to_string();
            if para_content.contains("<w:t") {
                if para_idx < new_paragraphs.len() {
                    let new_para = replace_first_text_run(&para_content, &new_paragraphs[para_idx]);
                    let new_len = new_para.len();
                    result = format!("{}{}{}", &result[..abs_start], new_para, &result[abs_end..]);
                    // Resume after the spliced-in paragraph (lengths may differ).
                    search_pos = abs_start + new_len;
                } else {
                    search_pos = abs_end;
                }
                para_idx += 1;
            } else {
                search_pos = abs_end;
            }
        } else {
            break;
        }
    }
    result
}
/// Puts `new_text` (XML-escaped) into the first `<w:t>` run of a paragraph
/// and empties every subsequent `<w:t>` run, so the paragraph carries
/// exactly the new text while run formatting elements are preserved.
fn replace_first_text_run(para_xml: &str, new_text: &str) -> String {
    let mut result = para_xml.to_string();
    let mut found_first = false;
    let mut search_pos = 0;
    while let Some(t_start) = result[search_pos..].find("<w:t") {
        let abs_start = search_pos + t_start;
        // Skip look-alike tags such as `<w:tab/>` or `<w:tc>`: splicing at
        // one of those would overwrite the real `<w:t>` opening tag and
        // corrupt the XML. A genuine text tag is `<w:t>` or `<w:t ...>`.
        match result.as_bytes().get(abs_start + 4) {
            Some(b'>') | Some(b' ') => {}
            _ => {
                search_pos = abs_start + 4;
                continue;
            }
        }
        if let Some(tag_end_rel) = result[abs_start..].find('>') {
            let abs_content_start = abs_start + tag_end_rel + 1;
            if let Some(t_end_rel) = result[abs_content_start..].find("</w:t>") {
                let abs_content_end = abs_content_start + t_end_rel;
                if !found_first {
                    let escaped = escape_xml(new_text);
                    result = format!(
                        "{}{}{}",
                        &result[..abs_content_start],
                        escaped,
                        &result[abs_content_end..]
                    );
                    found_first = true;
                    // Resume after the closing "</w:t>" of the spliced run.
                    search_pos = abs_content_start + escaped.len() + 6;
                } else {
                    // Later runs are emptied, not deleted, keeping run props.
                    result = format!("{}{}", &result[..abs_content_start], &result[abs_content_end..]);
                    search_pos = abs_content_start;
                }
            } else {
                break;
            }
        } else {
            break;
        }
    }
    result
}

View file

@ -1,16 +1,24 @@
use crate::docs::ooxml::{load_docx_preserving, update_docx_text};
use crate::docs::types::{Document, DocumentMetadata};
use crate::shared::state::AppState;
use aws_sdk_s3::primitives::ByteStream;
use chrono::{DateTime, Utc};
use std::collections::HashMap;
use std::io::Cursor;
use std::sync::Arc;
use tokio::sync::RwLock;
use uuid::Uuid;
static DOCUMENT_CACHE: once_cell::sync::Lazy<RwLock<HashMap<String, (Vec<u8>, DateTime<Utc>)>>> =
once_cell::sync::Lazy::new(|| RwLock::new(HashMap::new()));
const CACHE_TTL_SECS: i64 = 3600;
pub fn get_user_docs_path(user_identifier: &str) -> String {
let safe_id = user_identifier
.replace(['/', '\\', ':', '*', '?', '"', '<', '>', '|'], "_")
.to_lowercase();
format!("users/{}/docs", safe_id)
format!("users/{safe_id}/docs")
}
pub fn get_current_user_id() -> String {
@ -21,287 +29,22 @@ pub fn generate_doc_id() -> String {
Uuid::new_v4().to_string()
}
pub async fn save_document_to_drive(
state: &Arc<AppState>,
user_identifier: &str,
doc_id: &str,
title: &str,
content: &str,
) -> Result<String, String> {
let s3_client = state.drive.as_ref().ok_or("S3 service not available")?;
pub async fn cache_document_bytes(doc_id: &str, bytes: Vec<u8>) {
let mut cache = DOCUMENT_CACHE.write().await;
cache.insert(doc_id.to_string(), (bytes, Utc::now()));
let base_path = get_user_docs_path(user_identifier);
let doc_path = format!("{}/{}.html", base_path, doc_id);
let meta_path = format!("{}/{}.meta.json", base_path, doc_id);
s3_client
.put_object()
.bucket(&state.bucket_name)
.key(&doc_path)
.body(ByteStream::from(content.as_bytes().to_vec()))
.content_type("text/html")
.send()
.await
.map_err(|e| format!("Failed to save document: {e}"))?;
let word_count = content
.split_whitespace()
.filter(|w| !w.starts_with('<') && !w.ends_with('>'))
.count();
let metadata = serde_json::json!({
"id": doc_id,
"title": title,
"created_at": Utc::now().to_rfc3339(),
"updated_at": Utc::now().to_rfc3339(),
"word_count": word_count,
"version": 1
});
s3_client
.put_object()
.bucket(&state.bucket_name)
.key(&meta_path)
.body(ByteStream::from(metadata.to_string().into_bytes()))
.content_type("application/json")
.send()
.await
.map_err(|e| format!("Failed to save metadata: {e}"))?;
Ok(doc_path)
let now = Utc::now();
cache.retain(|_, (_, modified)| (now - *modified).num_seconds() < CACHE_TTL_SECS);
}
pub async fn save_document_as_docx(
state: &Arc<AppState>,
user_identifier: &str,
doc_id: &str,
title: &str,
content: &str,
) -> Result<Vec<u8>, String> {
let docx_bytes = convert_html_to_docx(title, content)?;
let s3_client = state.drive.as_ref().ok_or("S3 service not available")?;
let base_path = get_user_docs_path(user_identifier);
let docx_path = format!("{}/{}.docx", base_path, doc_id);
s3_client
.put_object()
.bucket(&state.bucket_name)
.key(&docx_path)
.body(ByteStream::from(docx_bytes.clone()))
.content_type("application/vnd.openxmlformats-officedocument.wordprocessingml.document")
.send()
.await
.map_err(|e| format!("Failed to save DOCX: {e}"))?;
Ok(docx_bytes)
pub async fn get_cached_document_bytes(doc_id: &str) -> Option<Vec<u8>> {
let cache = DOCUMENT_CACHE.read().await;
cache.get(doc_id).map(|(bytes, _)| bytes.clone())
}
pub fn convert_html_to_docx(title: &str, html_content: &str) -> Result<Vec<u8>, String> {
use docx_rs::*;
let mut docx = Docx::new();
if !title.is_empty() {
let title_para = Paragraph::new()
.add_run(Run::new().add_text(title).bold().size(48));
docx = docx.add_paragraph(title_para);
docx = docx.add_paragraph(Paragraph::new());
}
let paragraphs = parse_html_to_paragraphs(html_content);
for para_data in paragraphs {
let mut paragraph = Paragraph::new();
match para_data.style.as_str() {
"h1" => {
paragraph = paragraph.add_run(
Run::new()
.add_text(&para_data.text)
.bold()
.size(32)
);
}
"h2" => {
paragraph = paragraph.add_run(
Run::new()
.add_text(&para_data.text)
.bold()
.size(28)
);
}
"h3" => {
paragraph = paragraph.add_run(
Run::new()
.add_text(&para_data.text)
.bold()
.size(24)
);
}
"li" => {
paragraph = paragraph
.add_run(Run::new().add_text(""))
.add_run(Run::new().add_text(&para_data.text));
}
"blockquote" => {
paragraph = paragraph
.indent(Some(720), None, None, None)
.add_run(Run::new().add_text(&para_data.text).italic());
}
"code" => {
paragraph = paragraph.add_run(
Run::new()
.add_text(&para_data.text)
.fonts(RunFonts::new().ascii("Courier New"))
);
}
_ => {
let mut run = Run::new().add_text(&para_data.text);
if para_data.bold {
run = run.bold();
}
if para_data.italic {
run = run.italic();
}
if para_data.underline {
run = run.underline("single");
}
paragraph = paragraph.add_run(run);
}
}
docx = docx.add_paragraph(paragraph);
}
let mut buf = Cursor::new(Vec::new());
docx.build()
.pack(&mut buf)
.map_err(|e| format!("Failed to build DOCX: {e}"))?;
Ok(buf.into_inner())
}
#[derive(Default)]
struct ParagraphData {
text: String,
style: String,
bold: bool,
italic: bool,
underline: bool,
}
fn parse_html_to_paragraphs(html: &str) -> Vec<ParagraphData> {
let mut paragraphs = Vec::new();
let mut current = ParagraphData::default();
let mut in_tag = false;
let mut tag_name = String::new();
let mut is_closing = false;
let mut text_buffer = String::new();
let mut bold_stack: i32 = 0;
let mut italic_stack: i32 = 0;
let mut underline_stack: i32 = 0;
for ch in html.chars() {
match ch {
'<' => {
in_tag = true;
tag_name.clear();
is_closing = false;
}
'>' => {
in_tag = false;
let tag = tag_name.to_lowercase();
let tag_trimmed = tag.split_whitespace().next().unwrap_or("");
if is_closing {
match tag_trimmed {
"p" | "div" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "li" | "blockquote" | "pre" => {
if !text_buffer.is_empty() || !current.text.is_empty() {
current.text = format!("{}{}", current.text, decode_html_entities(&text_buffer));
if !current.text.trim().is_empty() {
paragraphs.push(current);
}
current = ParagraphData::default();
text_buffer.clear();
}
}
"b" | "strong" => bold_stack = bold_stack.saturating_sub(1),
"i" | "em" => italic_stack = italic_stack.saturating_sub(1),
"u" => underline_stack = underline_stack.saturating_sub(1),
_ => {}
}
} else {
match tag_trimmed {
"br" => {
text_buffer.push('\n');
}
"p" | "div" => {
if !text_buffer.is_empty() {
current.text = format!("{}{}", current.text, decode_html_entities(&text_buffer));
text_buffer.clear();
}
current.style = "p".to_string();
current.bold = bold_stack > 0;
current.italic = italic_stack > 0;
current.underline = underline_stack > 0;
}
"h1" => {
current.style = "h1".to_string();
}
"h2" => {
current.style = "h2".to_string();
}
"h3" => {
current.style = "h3".to_string();
}
"li" => {
current.style = "li".to_string();
}
"blockquote" => {
current.style = "blockquote".to_string();
}
"pre" | "code" => {
current.style = "code".to_string();
}
"b" | "strong" => bold_stack += 1,
"i" | "em" => italic_stack += 1,
"u" => underline_stack += 1,
_ => {}
}
}
tag_name.clear();
}
'/' if in_tag && tag_name.is_empty() => {
is_closing = true;
}
_ if in_tag => {
tag_name.push(ch);
}
_ => {
text_buffer.push(ch);
}
}
}
if !text_buffer.is_empty() {
current.text = format!("{}{}", current.text, decode_html_entities(&text_buffer));
}
if !current.text.trim().is_empty() {
paragraphs.push(current);
}
paragraphs
}
fn decode_html_entities(text: &str) -> String {
text.replace("&nbsp;", " ")
.replace("&amp;", "&")
.replace("&lt;", "<")
.replace("&gt;", ">")
.replace("&quot;", "\"")
.replace("&#39;", "'")
.replace("&apos;", "'")
pub async fn remove_from_cache(doc_id: &str) {
let mut cache = DOCUMENT_CACHE.write().await;
cache.remove(doc_id);
}
pub async fn load_docx_from_drive(
@ -324,12 +67,13 @@ pub async fn load_docx_from_drive(
.collect()
.await
.map_err(|e| format!("Failed to read DOCX: {e}"))?
.into_bytes();
.into_bytes()
.to_vec();
load_docx_from_bytes(&bytes, user_identifier, file_path)
load_docx_from_bytes(&bytes, user_identifier, file_path).await
}
pub fn load_docx_from_bytes(
pub async fn load_docx_from_bytes(
bytes: &[u8],
user_identifier: &str,
file_path: &str,
@ -341,11 +85,20 @@ pub fn load_docx_from_bytes(
.trim_end_matches(".docx")
.trim_end_matches(".doc");
let html_content = convert_docx_to_html(bytes)?;
let word_count = count_words(&html_content);
let doc_id = generate_doc_id();
cache_document_bytes(&doc_id, bytes.to_vec()).await;
let html_content = match load_docx_preserving(bytes) {
Ok(ooxml_doc) => {
let texts: Vec<String> = ooxml_doc.paragraphs.iter().map(|p| p.text.clone()).collect();
paragraphs_to_html(&texts)
}
Err(_) => convert_docx_to_html(bytes)?,
};
Ok(Document {
id: generate_doc_id(),
id: doc_id,
title: file_name.to_string(),
content: html_content,
owner_id: user_identifier.to_string(),
@ -358,8 +111,7 @@ pub fn load_docx_from_bytes(
}
pub fn convert_docx_to_html(bytes: &[u8]) -> Result<String, String> {
let docx = docx_rs::read_docx(bytes)
.map_err(|e| format!("Failed to parse DOCX: {e}"))?;
let docx = docx_rs::read_docx(bytes).map_err(|e| format!("Failed to parse DOCX: {e}"))?;
let mut html = String::new();
@ -389,13 +141,9 @@ pub fn convert_docx_to_html(bytes: &[u8]) -> Result<String, String> {
for content in &para.children {
if let docx_rs::ParagraphChild::Run(run) = content {
let mut run_text = String::new();
let mut is_bold = false;
let mut is_italic = false;
let mut is_underline = false;
is_bold = run.run_property.bold.is_some();
is_italic = run.run_property.italic.is_some();
is_underline = run.run_property.underline.is_some();
let is_bold = run.run_property.bold.is_some();
let is_italic = run.run_property.italic.is_some();
let is_underline = run.run_property.underline.is_some();
for child in &run.children {
match child {
@ -473,12 +221,157 @@ pub fn convert_docx_to_html(bytes: &[u8]) -> Result<String, String> {
Ok(html)
}
fn escape_html(text: &str) -> String {
text.replace('&', "&amp;")
.replace('<', "&lt;")
.replace('>', "&gt;")
.replace('"', "&quot;")
.replace('\'', "&#39;")
pub async fn save_document_as_docx(
state: &Arc<AppState>,
user_identifier: &str,
doc_id: &str,
title: &str,
content: &str,
) -> Result<Vec<u8>, String> {
let docx_bytes = if let Some(original_bytes) = get_cached_document_bytes(doc_id).await {
let paragraphs = html_to_paragraphs(content);
update_docx_text(&original_bytes, &paragraphs).unwrap_or_else(|_| {
convert_html_to_docx(title, content).unwrap_or_default()
})
} else {
convert_html_to_docx(title, content)?
};
let s3_client = state.drive.as_ref().ok_or("S3 service not available")?;
let base_path = get_user_docs_path(user_identifier);
let docx_path = format!("{base_path}/{doc_id}.docx");
s3_client
.put_object()
.bucket(&state.bucket_name)
.key(&docx_path)
.body(ByteStream::from(docx_bytes.clone()))
.content_type("application/vnd.openxmlformats-officedocument.wordprocessingml.document")
.send()
.await
.map_err(|e| format!("Failed to save DOCX: {e}"))?;
cache_document_bytes(doc_id, docx_bytes.clone()).await;
Ok(docx_bytes)
}
pub fn convert_html_to_docx(title: &str, html_content: &str) -> Result<Vec<u8>, String> {
use docx_rs::*;
let mut docx = Docx::new();
if !title.is_empty() {
let title_para = Paragraph::new().add_run(Run::new().add_text(title).bold().size(48));
docx = docx.add_paragraph(title_para);
docx = docx.add_paragraph(Paragraph::new());
}
let paragraphs = parse_html_to_paragraphs(html_content);
for para_data in paragraphs {
let mut paragraph = Paragraph::new();
match para_data.style.as_str() {
"h1" => {
paragraph =
paragraph.add_run(Run::new().add_text(&para_data.text).bold().size(32));
}
"h2" => {
paragraph =
paragraph.add_run(Run::new().add_text(&para_data.text).bold().size(28));
}
"h3" => {
paragraph =
paragraph.add_run(Run::new().add_text(&para_data.text).bold().size(24));
}
"li" => {
paragraph = paragraph
.add_run(Run::new().add_text(""))
.add_run(Run::new().add_text(&para_data.text));
}
"blockquote" => {
paragraph = paragraph
.indent(Some(720), None, None, None)
.add_run(Run::new().add_text(&para_data.text).italic());
}
"code" => {
paragraph = paragraph.add_run(
Run::new()
.add_text(&para_data.text)
.fonts(RunFonts::new().ascii("Courier New")),
);
}
_ => {
let mut run = Run::new().add_text(&para_data.text);
if para_data.bold {
run = run.bold();
}
if para_data.italic {
run = run.italic();
}
if para_data.underline {
run = run.underline("single");
}
paragraph = paragraph.add_run(run);
}
}
docx = docx.add_paragraph(paragraph);
}
let mut buf = Cursor::new(Vec::new());
docx.build()
.pack(&mut buf)
.map_err(|e| format!("Failed to build DOCX: {e}"))?;
Ok(buf.into_inner())
}
pub async fn save_document_to_drive(
state: &Arc<AppState>,
user_identifier: &str,
doc_id: &str,
title: &str,
content: &str,
) -> Result<String, String> {
let s3_client = state.drive.as_ref().ok_or("S3 service not available")?;
let base_path = get_user_docs_path(user_identifier);
let doc_path = format!("{base_path}/{doc_id}.html");
let meta_path = format!("{base_path}/{doc_id}.meta.json");
s3_client
.put_object()
.bucket(&state.bucket_name)
.key(&doc_path)
.body(ByteStream::from(content.as_bytes().to_vec()))
.content_type("text/html")
.send()
.await
.map_err(|e| format!("Failed to save document: {e}"))?;
let word_count = count_words(content);
let metadata = serde_json::json!({
"id": doc_id,
"title": title,
"created_at": Utc::now().to_rfc3339(),
"updated_at": Utc::now().to_rfc3339(),
"word_count": word_count,
"version": 1
});
s3_client
.put_object()
.bucket(&state.bucket_name)
.key(&meta_path)
.body(ByteStream::from(metadata.to_string().into_bytes()))
.content_type("application/json")
.send()
.await
.map_err(|e| format!("Failed to save metadata: {e}"))?;
Ok(doc_path)
}
pub async fn load_document_from_drive(
@ -489,8 +382,8 @@ pub async fn load_document_from_drive(
let s3_client = state.drive.as_ref().ok_or("S3 service not available")?;
let base_path = get_user_docs_path(user_identifier);
let doc_path = format!("{}/{}.html", base_path, doc_id);
let meta_path = format!("{}/{}.meta.json", base_path, doc_id);
let doc_path = format!("{base_path}/{doc_id}.html");
let meta_path = format!("{base_path}/{doc_id}.meta.json");
let content = match s3_client
.get_object()
@ -564,7 +457,7 @@ pub async fn list_documents_from_drive(
let s3_client = state.drive.as_ref().ok_or("S3 service not available")?;
let base_path = get_user_docs_path(user_identifier);
let prefix = format!("{}/", base_path);
let prefix = format!("{base_path}/");
let mut documents = Vec::new();
if let Ok(result) = s3_client
@ -590,10 +483,7 @@ pub async fn list_documents_from_drive(
serde_json::from_str::<serde_json::Value>(&meta_str)
{
let doc_meta = DocumentMetadata {
id: meta["id"]
.as_str()
.unwrap_or_default()
.to_string(),
id: meta["id"].as_str().unwrap_or_default().to_string(),
title: meta["title"]
.as_str()
.unwrap_or("Untitled")
@ -611,7 +501,7 @@ pub async fn list_documents_from_drive(
.unwrap_or_else(Utc::now),
word_count: meta["word_count"].as_u64().unwrap_or(0)
as usize,
storage_type: "drive".to_string(),
storage_type: "html".to_string(),
};
documents.push(doc_meta);
}
@ -635,40 +525,28 @@ pub async fn delete_document_from_drive(
let s3_client = state.drive.as_ref().ok_or("S3 service not available")?;
let base_path = get_user_docs_path(user_identifier);
let doc_path = format!("{}/{}.html", base_path, doc_id);
let meta_path = format!("{}/{}.meta.json", base_path, doc_id);
let docx_path = format!("{}/{}.docx", base_path, doc_id);
let _ = s3_client
.delete_object()
.bucket(&state.bucket_name)
.key(&doc_path)
.send()
.await;
for ext in &[".html", ".docx", ".meta.json"] {
let path = format!("{base_path}/{doc_id}{ext}");
let _ = s3_client
.delete_object()
.bucket(&state.bucket_name)
.key(&path)
.send()
.await;
}
let _ = s3_client
.delete_object()
.bucket(&state.bucket_name)
.key(&meta_path)
.send()
.await;
let _ = s3_client
.delete_object()
.bucket(&state.bucket_name)
.key(&docx_path)
.send()
.await;
remove_from_cache(doc_id).await;
Ok(())
}
pub fn create_new_document() -> Document {
let id = generate_doc_id();
let doc_id = generate_doc_id();
Document {
id: id.clone(),
id: doc_id,
title: "Untitled Document".to_string(),
content: String::new(),
content: "<p><br></p>".to_string(),
owner_id: get_current_user_id(),
storage_path: String::new(),
created_at: Utc::now(),
@ -706,3 +584,146 @@ fn strip_html(html: &str) -> String {
.replace("&gt;", ">")
.replace("&quot;", "\"")
}
fn escape_html(text: &str) -> String {
text.replace('&', "&amp;")
.replace('<', "&lt;")
.replace('>', "&gt;")
.replace('"', "&quot;")
.replace('\'', "&#39;")
}
fn paragraphs_to_html(paragraphs: &[String]) -> String {
paragraphs
.iter()
.map(|p| format!("<p>{}</p>", escape_html(p)))
.collect::<Vec<_>>()
.join("")
}
fn html_to_paragraphs(html: &str) -> Vec<String> {
parse_html_to_paragraphs(html)
.into_iter()
.map(|p| p.text)
.collect()
}
#[derive(Default, Clone)]
struct ParagraphData {
text: String,
style: String,
bold: bool,
italic: bool,
underline: bool,
}
fn parse_html_to_paragraphs(html: &str) -> Vec<ParagraphData> {
let mut paragraphs = Vec::new();
let mut current = ParagraphData::default();
let mut in_tag = false;
let mut tag_name = String::new();
let mut is_closing = false;
let mut text_buffer = String::new();
let mut bold_stack: i32 = 0;
let mut italic_stack: i32 = 0;
let mut underline_stack: i32 = 0;
for ch in html.chars() {
match ch {
'<' => {
in_tag = true;
tag_name.clear();
is_closing = false;
}
'>' => {
in_tag = false;
let tag = tag_name.to_lowercase();
let tag_trimmed = tag.split_whitespace().next().unwrap_or("");
if is_closing {
match tag_trimmed {
"p" | "div" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "li"
| "blockquote" | "pre" => {
if !text_buffer.is_empty() || !current.text.is_empty() {
current.text = format!(
"{}{}",
current.text,
decode_html_entities(&text_buffer)
);
if !current.text.trim().is_empty() {
paragraphs.push(current);
}
current = ParagraphData::default();
text_buffer.clear();
}
}
"b" | "strong" => bold_stack = bold_stack.saturating_sub(1),
"i" | "em" => italic_stack = italic_stack.saturating_sub(1),
"u" => underline_stack = underline_stack.saturating_sub(1),
_ => {}
}
} else {
match tag_trimmed {
"br" => {
text_buffer.push('\n');
}
"p" | "div" => {
if !text_buffer.is_empty() {
current.text = format!(
"{}{}",
current.text,
decode_html_entities(&text_buffer)
);
text_buffer.clear();
}
current.style = "p".to_string();
current.bold = bold_stack > 0;
current.italic = italic_stack > 0;
current.underline = underline_stack > 0;
}
"h1" => current.style = "h1".to_string(),
"h2" => current.style = "h2".to_string(),
"h3" => current.style = "h3".to_string(),
"li" => current.style = "li".to_string(),
"blockquote" => current.style = "blockquote".to_string(),
"pre" | "code" => current.style = "code".to_string(),
"b" | "strong" => bold_stack += 1,
"i" | "em" => italic_stack += 1,
"u" => underline_stack += 1,
_ => {}
}
}
tag_name.clear();
}
'/' if in_tag && tag_name.is_empty() => {
is_closing = true;
}
_ if in_tag => {
tag_name.push(ch);
}
_ => {
text_buffer.push(ch);
}
}
}
if !text_buffer.is_empty() {
current.text = format!("{}{}", current.text, decode_html_entities(&text_buffer));
}
if !current.text.trim().is_empty() {
paragraphs.push(current);
}
paragraphs
}
/// Decodes the HTML entities the editor emits back into plain text.
///
/// `&amp;` must be decoded LAST: decoding it first turns literal text such
/// as "&amp;lt;" into "&lt;", which a later pass then wrongly collapses to
/// "<" (a double decode).
fn decode_html_entities(text: &str) -> String {
    text.replace("&nbsp;", " ")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&apos;", "'")
        .replace("&amp;", "&")
}

View file

@ -1,5 +1,6 @@
pub mod collaboration;
pub mod handlers;
pub mod ooxml;
pub mod storage;
pub mod types;
pub mod utils;

259
src/slides/ooxml.rs Normal file
View file

@ -0,0 +1,259 @@
use std::io::Cursor;
/// A PPTX opened for round-trip editing: the untouched original bytes are
/// retained so layouts, masters, media and themes survive a later save,
/// alongside the extracted per-slide texts.
pub struct OoxmlPresentation {
    // The full original .pptx file, byte-for-byte.
    pub original_bytes: Vec<u8>,
    // Extracted text, one entry per slide part.
    pub slides: Vec<SlideInfo>,
}
/// Text content of a single slide.
pub struct SlideInfo {
    // Zero-based slide position, in the order ooxmlsdk exposes slide parts.
    pub index: usize,
    // Visible paragraph texts found on the slide, in document order.
    pub texts: Vec<String>,
}
/// Parses PPTX bytes with ooxmlsdk, extracting per-slide text while
/// keeping the original bytes for lossless round-tripping.
///
/// Returns a human-readable error string when the bytes are not a valid
/// PresentationML package.
pub fn load_pptx_preserving(bytes: &[u8]) -> Result<OoxmlPresentation, String> {
    use ooxmlsdk::parts::presentation_document::PresentationDocument;
    let presentation = PresentationDocument::new(Cursor::new(bytes))
        .map_err(|e| format!("Failed to parse PPTX: {e}"))?;
    // One SlideInfo per slide part; a part that fails to serialize yields
    // an empty text list rather than an error.
    let slides = presentation
        .presentation_part
        .slide_parts
        .iter()
        .enumerate()
        .map(|(idx, part)| SlideInfo {
            index: idx,
            texts: extract_texts_from_slide(&part.root_element.to_xml().unwrap_or_default()),
        })
        .collect();
    Ok(OoxmlPresentation {
        original_bytes: bytes.to_vec(),
        slides,
    })
}
/// Scans one slide's DrawingML for `<a:p>` paragraphs and returns the
/// non-empty texts in document order.
fn extract_texts_from_slide(xml: &str) -> Vec<String> {
    let mut texts = Vec::new();
    let mut pos = 0;
    while let Some(p_start) = xml[pos..].find("<a:p") {
        let abs_start = pos + p_start;
        // Require a real `<a:p>`/`<a:p ...>` tag: the bare prefix also
        // matches `<a:pPr>`, `<a:path>`, `<a:pattFill>` and similar
        // DrawingML elements, which must be skipped.
        match xml.as_bytes().get(abs_start + 4) {
            Some(b'>') | Some(b' ') | Some(b'/') => {}
            _ => {
                pos = abs_start + 4;
                continue;
            }
        }
        if let Some(p_end_rel) = xml[abs_start..].find("</a:p>") {
            let abs_end = abs_start + p_end_rel + 6; // include "</a:p>"
            let para_content = &xml[abs_start..abs_end];
            let text = extract_text_from_paragraph(para_content);
            if !text.trim().is_empty() {
                texts.push(text);
            }
            pos = abs_end;
        } else {
            break;
        }
    }
    texts
}
/// Concatenates the contents of every `<a:t>` run inside one paragraph's
/// DrawingML and XML-unescapes the result.
fn extract_text_from_paragraph(para_xml: &str) -> String {
    let mut text = String::new();
    let mut pos = 0;
    while let Some(t_start) = para_xml[pos..].find("<a:t") {
        let abs_start = pos + t_start;
        // Reject look-alike tags: the prefix also matches `<a:tab/>`,
        // `<a:tailEnd>`, `<a:tcPr>`, etc. A real text tag is `<a:t>` or
        // `<a:t ...>`. Without this check, matching `<a:tab/>` leaks the
        // following `<a:t>` opening tag into the extracted text.
        match para_xml.as_bytes().get(abs_start + 4) {
            Some(b'>') | Some(b' ') => {}
            _ => {
                pos = abs_start + 4;
                continue;
            }
        }
        if let Some(tag_end_rel) = para_xml[abs_start..].find('>') {
            let abs_content_start = abs_start + tag_end_rel + 1;
            // Self-closing `<a:t .../>` carries no content and no closing tag.
            if para_xml[abs_start..abs_content_start].ends_with("/>") {
                pos = abs_content_start;
                continue;
            }
            if let Some(t_end_rel) = para_xml[abs_content_start..].find("</a:t>") {
                let content = &para_xml[abs_content_start..abs_content_start + t_end_rel];
                text.push_str(content);
                pos = abs_content_start + t_end_rel + 6; // past "</a:t>"
            } else {
                break;
            }
        } else {
            break;
        }
    }
    unescape_xml(&text)
}
/// Decodes the five predefined XML entities back to characters.
///
/// `&amp;` must be decoded LAST: decoding it first turns literal text such
/// as "&amp;lt;" into "&lt;", which a later pass then wrongly collapses to
/// "<" (a double decode).
fn unescape_xml(text: &str) -> String {
    text.replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&apos;", "'")
        .replace("&amp;", "&")
}
/// XML-escapes the five predefined entities in `text`.
///
/// A single character-wise pass is equivalent to the sequential-replace
/// formulation (with `&` handled first): ampersands introduced by one
/// substitution are never re-escaped by another.
fn escape_xml(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&apos;"),
            other => escaped.push(other),
        }
    }
    escaped
}
/// Round-trips PPTX bytes through ooxmlsdk (parse, then re-save),
/// verifying the package can be reconstructed; content is not modified.
pub fn save_pptx_preserving(original_bytes: &[u8]) -> Result<Vec<u8>, String> {
    use ooxmlsdk::parts::presentation_document::PresentationDocument;
    let presentation = PresentationDocument::new(Cursor::new(original_bytes))
        .map_err(|e| format!("Failed to parse PPTX: {e}"))?;
    let mut buffer = Cursor::new(Vec::new());
    presentation
        .save(&mut buffer)
        .map_err(|e| format!("Failed to save PPTX: {e}"))?;
    Ok(buffer.into_inner())
}
/// Rewrites each `ppt/slides/slideN.xml` inside the PPTX zip container
/// with the texts from `new_slide_texts[N-1]`, copying every other archive
/// entry through untouched so layouts, masters, media and themes are
/// preserved.
///
/// Returns the bytes of the rebuilt .pptx, or a descriptive error string.
pub fn update_pptx_text(
    original_bytes: &[u8],
    new_slide_texts: &[Vec<String>],
) -> Result<Vec<u8>, String> {
    use std::io::{Read, Write};
    use zip::{write::SimpleFileOptions, ZipArchive, ZipWriter};
    let reader = Cursor::new(original_bytes);
    let mut archive =
        ZipArchive::new(reader).map_err(|e| format!("Failed to open PPTX archive: {e}"))?;
    let mut output_buf = Cursor::new(Vec::new());
    {
        // Scope the writer so it releases its borrow of `output_buf`
        // before we take the buffer back out.
        let mut zip_writer = ZipWriter::new(&mut output_buf);
        let options =
            SimpleFileOptions::default().compression_method(zip::CompressionMethod::Deflated);
        for i in 0..archive.len() {
            let mut file = archive
                .by_index(i)
                .map_err(|e| format!("Failed to read archive entry: {e}"))?;
            let name = file.name().to_string();
            if name.starts_with("ppt/slides/slide") && name.ends_with(".xml") {
                // Slide part: route to the matching replacement texts by
                // the 1-based number in the filename; out-of-range slides
                // pass through unmodified.
                let slide_num = extract_slide_number(&name);
                let mut content = String::new();
                file.read_to_string(&mut content)
                    .map_err(|e| format!("Failed to read slide xml: {e}"))?;
                let modified_content = if slide_num > 0 && slide_num <= new_slide_texts.len() {
                    replace_slide_texts(&content, &new_slide_texts[slide_num - 1])
                } else {
                    content
                };
                zip_writer
                    .start_file(&name, options)
                    .map_err(|e| format!("Failed to start file in zip: {e}"))?;
                zip_writer
                    .write_all(modified_content.as_bytes())
                    .map_err(|e| format!("Failed to write slide xml: {e}"))?;
            } else {
                // Any other part is copied through verbatim.
                let mut buf = Vec::new();
                file.read_to_end(&mut buf)
                    .map_err(|e| format!("Failed to read file: {e}"))?;
                zip_writer
                    .start_file(&name, options)
                    .map_err(|e| format!("Failed to start file in zip: {e}"))?;
                zip_writer
                    .write_all(&buf)
                    .map_err(|e| format!("Failed to write file: {e}"))?;
            }
        }
        zip_writer
            .finish()
            .map_err(|e| format!("Failed to finish zip: {e}"))?;
    }
    Ok(output_buf.into_inner())
}
/// Extracts N from an archive entry name of the form
/// `ppt/slides/slideN.xml`; returns 0 when no number can be parsed
/// (callers treat 0 as "leave this slide untouched").
fn extract_slide_number(filename: &str) -> usize {
    let digits = filename
        .trim_start_matches("ppt/slides/slide")
        .trim_end_matches(".xml");
    digits.parse::<usize>().unwrap_or(0)
}
/// Replaces paragraph texts in one slide's XML, in document order, with
/// entries from `new_texts`. Only paragraphs containing an `<a:t>` run
/// consume an entry; once entries run out, remaining paragraphs are left
/// untouched (but still counted).
fn replace_slide_texts(xml: &str, new_texts: &[String]) -> String {
    let mut result = xml.to_string();
    let mut text_idx = 0;
    let mut search_pos = 0;
    loop {
        // Take whichever paragraph opening form occurs FIRST. The previous
        // `find("<a:p>").or_else(|| find("<a:p "))` used the plain match's
        // position even when an attributed `<a:p ...>` occurred earlier,
        // skipping those paragraphs and desynchronizing the index.
        let tail = &result[search_pos..];
        let p_start = match (tail.find("<a:p>"), tail.find("<a:p ")) {
            (Some(a), Some(b)) => a.min(b),
            (Some(a), None) => a,
            (None, Some(b)) => b,
            (None, None) => break,
        };
        let abs_start = search_pos + p_start;
        if let Some(p_end_rel) = result[abs_start..].find("</a:p>") {
            let abs_end = abs_start + p_end_rel + 6; // include "</a:p>"
            let para_content = result[abs_start..abs_end].to_string();
            if para_content.contains("<a:t") {
                if text_idx < new_texts.len() {
                    let new_para = replace_first_text_run(&para_content, &new_texts[text_idx]);
                    let new_len = new_para.len();
                    result = format!("{}{}{}", &result[..abs_start], new_para, &result[abs_end..]);
                    // Resume after the spliced-in paragraph (lengths may differ).
                    search_pos = abs_start + new_len;
                } else {
                    search_pos = abs_end;
                }
                text_idx += 1;
            } else {
                search_pos = abs_end;
            }
        } else {
            break;
        }
    }
    result
}
/// Puts `new_text` (XML-escaped) into the first `<a:t>` run of a paragraph
/// and empties every subsequent `<a:t>` run, so the paragraph carries
/// exactly the new text while run formatting elements are preserved.
fn replace_first_text_run(para_xml: &str, new_text: &str) -> String {
    let mut result = para_xml.to_string();
    let mut found_first = false;
    let mut search_pos = 0;
    while let Some(t_start) = result[search_pos..].find("<a:t") {
        let abs_start = search_pos + t_start;
        // Skip look-alike tags such as `<a:tab/>` or `<a:tailEnd>`:
        // splicing at one of those would overwrite the real `<a:t>`
        // opening tag and corrupt the XML. A genuine text tag is `<a:t>`
        // or `<a:t ...>`.
        match result.as_bytes().get(abs_start + 4) {
            Some(b'>') | Some(b' ') => {}
            _ => {
                search_pos = abs_start + 4;
                continue;
            }
        }
        if let Some(tag_end_rel) = result[abs_start..].find('>') {
            let abs_content_start = abs_start + tag_end_rel + 1;
            if let Some(t_end_rel) = result[abs_content_start..].find("</a:t>") {
                let abs_content_end = abs_content_start + t_end_rel;
                if !found_first {
                    let escaped = escape_xml(new_text);
                    result = format!(
                        "{}{}{}",
                        &result[..abs_content_start],
                        escaped,
                        &result[abs_content_end..]
                    );
                    found_first = true;
                    // Resume after the closing "</a:t>" of the spliced run.
                    search_pos = abs_content_start + escaped.len() + 6;
                } else {
                    // Later runs are emptied, not deleted, keeping run props.
                    result = format!("{}{}", &result[..abs_content_start], &result[abs_content_end..]);
                    search_pos = abs_content_start;
                }
            } else {
                break;
            }
        } else {
            break;
        }
    }
    result
}

View file

@ -1,18 +1,26 @@
use crate::shared::state::AppState;
use crate::slides::ooxml::{load_pptx_preserving, update_pptx_text};
use crate::slides::types::{
ElementContent, ElementStyle, Presentation, PresentationMetadata, Slide,
SlideBackground, SlideElement,
};
use crate::slides::utils::{create_content_slide, create_default_theme, create_title_slide};
use chrono::Utc;
use chrono::{DateTime, Utc};
use std::collections::HashMap;
use std::io::{Cursor, Read, Write};
use std::sync::Arc;
use tokio::sync::RwLock;
use uuid::Uuid;
use zip::write::SimpleFileOptions;
use zip::{ZipArchive, ZipWriter};
static PRESENTATION_CACHE: once_cell::sync::Lazy<RwLock<HashMap<String, (Vec<u8>, DateTime<Utc>)>>> =
once_cell::sync::Lazy::new(|| RwLock::new(HashMap::new()));
const CACHE_TTL_SECS: i64 = 3600;
pub fn get_user_presentations_path(user_id: &str) -> String {
format!("users/{}/presentations", user_id)
format!("users/{user_id}/presentations")
}
pub fn get_current_user_id() -> String {
@ -23,10 +31,28 @@ pub fn generate_presentation_id() -> String {
Uuid::new_v4().to_string()
}
pub async fn cache_presentation_bytes(pres_id: &str, bytes: Vec<u8>) {
let mut cache = PRESENTATION_CACHE.write().await;
cache.insert(pres_id.to_string(), (bytes, Utc::now()));
let now = Utc::now();
cache.retain(|_, (_, modified)| (now - *modified).num_seconds() < CACHE_TTL_SECS);
}
pub async fn get_cached_presentation_bytes(pres_id: &str) -> Option<Vec<u8>> {
let cache = PRESENTATION_CACHE.read().await;
cache.get(pres_id).map(|(bytes, _)| bytes.clone())
}
pub async fn remove_from_cache(pres_id: &str) {
let mut cache = PRESENTATION_CACHE.write().await;
cache.remove(pres_id);
}
fn extract_id_from_path(path: &str) -> String {
path.split('/')
.last()
.unwrap_or("")
.unwrap_or_default()
.trim_end_matches(".json")
.trim_end_matches(".pptx")
.to_string()
@ -68,7 +94,22 @@ pub async fn save_presentation_as_pptx(
user_id: &str,
presentation: &Presentation,
) -> Result<Vec<u8>, String> {
let pptx_bytes = convert_to_pptx(presentation)?;
let pptx_bytes = if let Some(original_bytes) = get_cached_presentation_bytes(&presentation.id).await {
let slide_texts: Vec<Vec<String>> = presentation.slides.iter().map(|slide| {
slide.elements.iter().filter_map(|el| {
if let ElementContent::Text { text, .. } = &el.content {
Some(text.clone())
} else {
None
}
}).collect()
}).collect();
update_pptx_text(&original_bytes, &slide_texts).unwrap_or_else(|_| {
convert_to_pptx(presentation).unwrap_or_default()
})
} else {
convert_to_pptx(presentation)?
};
let drive = state
.drive
@ -484,12 +525,13 @@ pub async fn load_pptx_from_drive(
.collect()
.await
.map_err(|e| format!("Failed to read PPTX: {e}"))?
.into_bytes();
.into_bytes()
.to_vec();
load_pptx_from_bytes(&bytes, user_id, file_path)
load_pptx_from_bytes(&bytes, user_id, file_path).await
}
pub fn load_pptx_from_bytes(
pub async fn load_pptx_from_bytes(
bytes: &[u8],
user_id: &str,
file_path: &str,
@ -505,6 +547,10 @@ pub fn load_pptx_from_bytes(
.trim_end_matches(".pptx")
.trim_end_matches(".ppt");
let pres_id = generate_presentation_id();
cache_presentation_bytes(&pres_id, bytes.to_vec()).await;
let mut slides = Vec::new();
let mut slide_num = 1;
@ -528,7 +574,7 @@ pub fn load_pptx_from_bytes(
}
Ok(Presentation {
id: generate_presentation_id(),
id: pres_id,
name: file_name.to_string(),
owner_id: user_id.to_string(),
slides,