diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py
index 19ade96..1fcff58 100644
--- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@@ -1,9 +1,12 @@
 import sys
 import os
 import re
-import unicodedata
-from typing import BinaryIO, Any
+import base64
+import hashlib
+from typing import BinaryIO, Any, Dict, List, Tuple
 from io import BytesIO
+import json
+from bs4 import BeautifulSoup

 from ._html_converter import HtmlConverter
 from ..converter_utils.docx.pre_process import pre_process_docx
@@ -30,7 +33,8 @@ ACCEPTED_FILE_EXTENSIONS = [".docx"]

 class DocxConverter(HtmlConverter):
     """
-    Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
+    Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
+    Extracts images from documents and saves them to document-specific subfolders.
     """

     def __init__(self):
@@ -54,70 +58,113 @@ class DocxConverter(HtmlConverter):
                 return True

         return False
-
-    def _sanitize_filename(self, filename: str) -> str:
-        """
-        Sanitize a filename by removing or replacing problematic characters.
-
-        Args:
-            filename: The original filename
-
-        Returns:
-            A sanitized filename safe for filesystem use
-        """
-        # Step 1: Normalize unicode characters
-        filename = unicodedata.normalize('NFKD', filename)
-
-        # Step 2: Remove invalid characters and replace spaces with underscores
-        # Keep alphanumeric characters, underscores, hyphens, and periods
-        sanitized = re.sub(r'[^\w\-\.]', '_', filename)
-
-        # Step 3: Collapse multiple underscores
-        sanitized = re.sub(r'_+', '_', sanitized)
-
-        # Step 4: Remove leading/trailing underscores
-        sanitized = sanitized.strip('_')
-
-        # Step 5: Ensure we have a valid filename (default if empty)
-        if not sanitized:
-            sanitized = "unnamed"
-
-        return sanitized

     def _get_document_name(self, stream_info: StreamInfo) -> str:
         """
-        Extract document name from StreamInfo
+        Extract document name from StreamInfo and sanitize it
         """
         # First try to extract from filename attribute
         if stream_info.filename:
             basename = os.path.basename(stream_info.filename)
             name, _ = os.path.splitext(basename)
             if name:
-                return self._sanitize_filename(name)
-
+                return name
+
         # If local_path exists, try to extract from local path
         if stream_info.local_path:
             basename = os.path.basename(stream_info.local_path)
             name, _ = os.path.splitext(basename)
             if name:
-                return self._sanitize_filename(name)
-
+                return name
+
         # If URL exists, try to extract from URL
         if stream_info.url:
             basename = os.path.basename(stream_info.url)
             name, _ = os.path.splitext(basename)
             if name:
-                return self._sanitize_filename(name)
-
+                print(f"[DEBUG] Extracted document name from URL: {name}")
+                return name
+
         # Default name
         return "docx_document"

+    def _extract_and_save_images(
+        self, html_content: str, doc_folder: str, assets_folder: str = "assets"
+    ) -> str:
+        """
+        Extract base64 images from HTML content, save them to filesystem, and update HTML with new image paths
+
+        Args:
+            html_content: The HTML content containing images
+            doc_folder: The document-specific folder name
+            assets_folder: The base folder for assets
+
+        Returns:
+            Updated HTML content with image references pointing to saved files
+        """
+        # Parse HTML
+        soup = BeautifulSoup(html_content, "html.parser")
+
+        # Find all images
+        images = soup.find_all("img")
+        if not images:
+            return html_content
+
+        # Create output directory
+        output_dir = os.path.join(assets_folder, doc_folder)
+        os.makedirs(output_dir, exist_ok=True)
+
+        # Process each image
+        for img in images:
+            src = img.get("src", "") or img.get("data-src", "")
+            if not src or not src.startswith("data:image"):
+                continue
+
+            try:
+                # Parse image data
+                mime_type = src.split(";")[0].replace("data:", "")
+
+                # Get file extension
+                ext = {
+                    "image/png": ".png",
+                    "image/jpeg": ".jpg",
+                    "image/jpg": ".jpg",
+                    "image/gif": ".gif",
+                }.get(mime_type, ".png")
+
+                # Extract base64 data
+                encoded_data = src.split(",", 1)[1]
+                image_data = base64.b64decode(encoded_data)
+
+                # Generate unique filename
+                hashname = hashlib.sha256(image_data).hexdigest()[:8]
+                filename = f"image_{hashname}{ext}"
+
+                # Save file
+                filepath = os.path.join(output_dir, filename)
+                with open(filepath, "wb") as f:
+                    f.write(image_data)
+
+                # Update image src in HTML
+                new_src = os.path.join(output_dir, filename).replace("\\", "/")
+                img["src"] = new_src
+
+                # Add alt text if empty
+                if not img.get("alt"):
+                    img["alt"] = f"image_{hashname}"
+
+            except Exception as e:
+                continue
+
+        # Return updated HTML
+        return str(soup)
+
     def convert(
         self,
         file_stream: BinaryIO,
         stream_info: StreamInfo,
         **kwargs: Any,  # Options to pass to the converter
-    ) -> DocumentConverterResult:
+    ) -> DocumentConverterResult:
         # Check dependencies
         if _dependency_exc_info is not None:
             raise MissingDependencyException(
@@ -132,27 +179,37 @@ class DocxConverter(HtmlConverter):
                 _dependency_exc_info[2]
             )

-        # If conversion_name not explicitly provided, try to extract from stream_info
-        if "conversion_name" not in kwargs:
-            conversion_name = self._get_document_name(stream_info)
-            kwargs["conversion_name"] = conversion_name
+        # Get document name
+        doc_name = kwargs.get("conversion_name") or self._get_document_name(stream_info)
+        if hasattr(self, "sanitize_filename"):
+            doc_name = self.sanitize_filename(doc_name)

+        # Get assets folder
+        assets_folder = kwargs.get("image_output_dir", "assets")
+
+        # Convert DOCX to HTML
         style_map = kwargs.get("style_map", None)
         pre_process_stream = pre_process_docx(file_stream)
-
-        # Convert to HTML and pass necessary parameters to HTML converter
-        html_content = mammoth.convert_to_html(pre_process_stream, style_map=style_map).value
-
-        # Create new StreamInfo to pass to HTML converter
-        html_stream_info = stream_info.copy_and_update(
-            mimetype="text/html",
-            extension=".html"
+        html_content = mammoth.convert_to_html(
+            pre_process_stream, style_map=style_map
+        ).value
+
+        # Extract and save images, getting updated HTML with correct image references
+        processed_html = self._extract_and_save_images(
+            html_content, doc_name, assets_folder
         )
-
-        # Use io.BytesIO to create binary stream
-        from io import BytesIO
+
+        # Create a new StreamInfo for the HTML converter
+        html_stream_info = stream_info.copy_and_update(
+            mimetype="text/html", extension=".html"
+        )
+
+        # Use the standard HTML converter to convert to Markdown
+        # We don't need to pass conversion_name because images are already extracted
+        html_kwargs = {k: v for k, v in kwargs.items() if k != "conversion_name"}
+
         return self._html_converter.convert(
-            file_stream=BytesIO(html_content.encode("utf-8")),
+            file_stream=BytesIO(processed_html.encode("utf-8")),
             stream_info=html_stream_info,
-            **kwargs,
-        )
\ No newline at end of file
+            **html_kwargs,
+        )
diff --git a/packages/markitdown/src/markitdown/converters/_markdownify.py b/packages/markitdown/src/markitdown/converters/_markdownify.py
index 914a38a..d98bdfb 100644
--- a/packages/markitdown/src/markitdown/converters/_markdownify.py
+++ b/packages/markitdown/src/markitdown/converters/_markdownify.py
@@ -1,9 +1,5 @@
 import re
 import markdownify
-import os
-import base64
-import hashlib
-import sys
 from typing import Any, Optional
 from urllib.parse import quote, unquote, urlparse, urlunparse

@@ -20,15 +16,9 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
     """

     def __init__(self, **options: Any):
-        # Set default values for image-related options
-        self.image_output_dir = options.get("image_output_dir", "assets")
-        self.conversion_name = options.get("conversion_name")
-
-        # Apply basic options
         options["heading_style"] = options.get("heading_style", markdownify.ATX)
         options["keep_data_uris"] = options.get("keep_data_uris", False)
-
-        # Initialize parent class
+        # Explicitly cast options to the expected type if necessary
         super().__init__(**options)

     def convert_hn(
@@ -99,81 +89,23 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
         convert_as_inline: Optional[bool] = False,
         **kwargs,
     ) -> str:
-        """
-        Process image elements, save data URI format images to filesystem
-        Supports categorized storage in subfolders by document name
-        """
+        """Same as usual converter, but removes data URIs"""
+
         alt = el.attrs.get("alt", None) or ""
-        src = el.attrs.get("src", None) or el.attrs.get("data-src", None) or ""
+        src = el.attrs.get("src", None) or ""
         title = el.attrs.get("title", None) or ""
         title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
-
-        # If in inline mode and not preserved, return alt text
         if (
             convert_as_inline
-            and el.parent.name not in self.options.get("keep_inline_images_in", [])
+            and el.parent.name not in self.options["keep_inline_images_in"]
         ):
             return alt

-        # Process data URI format images
-        if src.startswith("data:image") and not self.options.get("keep_data_uris", False):
-            try:
-                # Parse MIME type
-                mime_type = src.split(";")[0].replace("data:", "")
-
-                # Get file extension
-                ext = {
-                    "image/png": ".png",
-                    "image/jpeg": ".jpg",
-                    "image/jpg": ".jpg",
-                    "image/gif": ".gif"
-                }.get(mime_type, ".png")
-
-                # Decode base64 data
-                encoded = src.split(",")[1]
-                image_data = base64.b64decode(encoded)
-
-                # Generate unique filename
-                hashname = hashlib.sha256(image_data).hexdigest()[:8]
-                filename = f"image_{hashname}{ext}"
-
-                # Determine output directory
-                if hasattr(self, 'conversion_name') and self.conversion_name:
-                    # If conversion_name exists, create subfolder
-                    output_dir = os.path.join(self.image_output_dir, self.conversion_name)
-                else:
-                    # Otherwise use base directory
-                    output_dir = self.image_output_dir
-
-                # Ensure directory exists
-                os.makedirs(output_dir, exist_ok=True)
-
-                # Save image file
-                filepath = os.path.join(output_dir, filename)
-                with open(filepath, "wb") as f:
-                    f.write(image_data)
-
-                # Update src to relative path
-                src = os.path.join(output_dir, filename).replace("\\", "/")
-
-                # If alt text is empty, use the image filename (without extension) as alt text
-                if not alt:
-                    alt = f"image_{hashname}"
-
-            except Exception as e:
-                error_msg = f"Error saving image: {str(e)}"
-                import traceback
-                traceback.print_exc(file=sys.stderr)
-                # If extraction fails, revert to original truncating behavior
-                src = src.split(",")[0] + "..."
-                return f"![{alt}](image_error.png) "
-
-        # Process other data URIs that are not images (truncate them)
-        elif src.startswith("data:") and not self.options.get("keep_data_uris", False):
+        # Remove dataURIs
+        if src.startswith("data:") and not self.options["keep_data_uris"]:
             src = src.split(",")[0] + "..."

-        # Return Markdown format image reference
-        return f"![{alt}]({src}{title_part})"
+        return "![%s](%s%s)" % (alt, src, title_part)

     def convert_soup(self, soup: Any) -> str:
         return super().convert_soup(soup)  # type: ignore