From 17081dae64a8ad5cf55044e09111bcbfa2d879b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=B1=E6=98=8A=E5=A4=A9?= Date: Wed, 30 Apr 2025 17:03:10 +1200 Subject: [PATCH] supportfordata-src --- .../markitdown/converters/_docx_converter.py | 90 ++----------------- .../src/markitdown/converters/_markdownify.py | 82 ++--------------- 2 files changed, 13 insertions(+), 159 deletions(-) diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py index 19ade96..b320695 100644 --- a/packages/markitdown/src/markitdown/converters/_docx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py @@ -1,9 +1,6 @@ import sys -import os -import re -import unicodedata + from typing import BinaryIO, Any -from io import BytesIO from ._html_converter import HtmlConverter from ..converter_utils.docx.pre_process import pre_process_docx @@ -54,71 +51,14 @@ class DocxConverter(HtmlConverter): return True return False - - def _sanitize_filename(self, filename: str) -> str: - """ - Sanitize a filename by removing or replacing problematic characters. - - Args: - filename: The original filename - - Returns: - A sanitized filename safe for filesystem use - """ - # Step 1: Normalize unicode characters - filename = unicodedata.normalize('NFKD', filename) - - # Step 2: Remove invalid characters and replace spaces with underscores - # Keep alphanumeric characters, underscores, hyphens, and periods - sanitized = re.sub(r'[^\w\-\.]', '_', filename) - - # Step 3: Collapse multiple underscores - sanitized = re.sub(r'_+', '_', sanitized) - - # Step 4: Remove leading/trailing underscores - sanitized = sanitized.strip('_') - - # Step 5: Ensure we have a valid filename (default if empty) - if not sanitized: - sanitized = "unnamed" - - return sanitized - - def _get_document_name(self, stream_info: StreamInfo) -> str: - """ - Extract document name from StreamInfo - """ - # First try to extract from filename attribute - if stream_info.filename: - basename = os.path.basename(stream_info.filename) - name, _ = os.path.splitext(basename) - if name: - return self._sanitize_filename(name) - - # If local_path exists, try to extract from local path - if stream_info.local_path: - basename = os.path.basename(stream_info.local_path) - name, _ = os.path.splitext(basename) - if name: - return self._sanitize_filename(name) - - # If URL exists, try to extract from URL - if stream_info.url: - basename = os.path.basename(stream_info.url) - name, _ = os.path.splitext(basename) - if name: - return self._sanitize_filename(name) - - # Default name - return "docx_document" def convert( self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter - ) -> DocumentConverterResult: - # Check dependencies + ) -> DocumentConverterResult: + # Check: the dependencies if _dependency_exc_info is not None: raise MissingDependencyException( MISSING_DEPENDENCY_MESSAGE.format( @@ -132,27 +72,9 @@ class DocxConverter(HtmlConverter): _dependency_exc_info[2] ) - # If conversion_name not explicitly provided, try to extract from stream_info - if "conversion_name" not in kwargs: - conversion_name = self._get_document_name(stream_info) - kwargs["conversion_name"] = conversion_name - style_map = kwargs.get("style_map", None) pre_process_stream = pre_process_docx(file_stream) - - # Convert to HTML and pass necessary parameters to HTML converter - html_content = mammoth.convert_to_html(pre_process_stream, style_map=style_map).value - - # Create new StreamInfo to pass to HTML converter - html_stream_info = stream_info.copy_and_update( - mimetype="text/html", - extension=".html" - ) - - # Use io.BytesIO to create binary stream - from io import BytesIO - return self._html_converter.convert( - file_stream=BytesIO(html_content.encode("utf-8")), - stream_info=html_stream_info, + return self._html_converter.convert_string( + mammoth.convert_to_html(pre_process_stream, style_map=style_map).value, **kwargs, - ) \ No newline at end of file + ) diff --git a/packages/markitdown/src/markitdown/converters/_markdownify.py b/packages/markitdown/src/markitdown/converters/_markdownify.py index 914a38a..ee78541 100644 --- a/packages/markitdown/src/markitdown/converters/_markdownify.py +++ b/packages/markitdown/src/markitdown/converters/_markdownify.py @@ -1,9 +1,5 @@ import re import markdownify -import os -import base64 -import hashlib -import sys from typing import Any, Optional from urllib.parse import quote, unquote, urlparse, urlunparse @@ -20,15 +16,9 @@ class _CustomMarkdownify(markdownify.MarkdownConverter): """ def __init__(self, **options: Any): - # Set default values for image-related options - self.image_output_dir = options.get("image_output_dir", "assets") - self.conversion_name = options.get("conversion_name") - - # Apply basic options options["heading_style"] = options.get("heading_style", markdownify.ATX) options["keep_data_uris"] = options.get("keep_data_uris", False) - - # Initialize parent class + # Explicitly cast options to the expected type if necessary super().__init__(**options) def convert_hn( @@ -99,81 +89,23 @@ class _CustomMarkdownify(markdownify.MarkdownConverter): convert_as_inline: Optional[bool] = False, **kwargs, ) -> str: - """ - Process image elements, save data URI format images to filesystem - Supports categorized storage in subfolders by document name - """ + """Same as usual converter, but removes data URIs""" + alt = el.attrs.get("alt", None) or "" src = el.attrs.get("src", None) or el.attrs.get("data-src", None) or "" title = el.attrs.get("title", None) or "" title_part = ' "%s"' % title.replace('"', r"\"") if title else "" - - # If in inline mode and not preserved, return alt text if ( convert_as_inline - and el.parent.name not in self.options.get("keep_inline_images_in", []) + and el.parent.name not in self.options["keep_inline_images_in"] ): return alt - # Process data URI format images - if src.startswith("data:image") and not self.options.get("keep_data_uris", False): - try: - # Parse MIME type - mime_type = src.split(";")[0].replace("data:", "") - - # Get file extension - ext = { - "image/png": ".png", - "image/jpeg": ".jpg", - "image/jpg": ".jpg", - "image/gif": ".gif" - }.get(mime_type, ".png") - - # Decode base64 data - encoded = src.split(",")[1] - image_data = base64.b64decode(encoded) - - # Generate unique filename - hashname = hashlib.sha256(image_data).hexdigest()[:8] - filename = f"image_{hashname}{ext}" - - # Determine output directory - if hasattr(self, 'conversion_name') and self.conversion_name: - # If conversion_name exists, create subfolder - output_dir = os.path.join(self.image_output_dir, self.conversion_name) - else: - # Otherwise use base directory - output_dir = self.image_output_dir - - # Ensure directory exists - os.makedirs(output_dir, exist_ok=True) - - # Save image file - filepath = os.path.join(output_dir, filename) - with open(filepath, "wb") as f: - f.write(image_data) - - # Update src to relative path - src = os.path.join(output_dir, filename).replace("\\", "/") - - # If alt text is empty, use the image filename (without extension) as alt text - if not alt: - alt = f"image_{hashname}" - - except Exception as e: - error_msg = f"Error saving image: {str(e)}" - import traceback - traceback.print_exc(file=sys.stderr) - # If extraction fails, revert to original truncating behavior - src = src.split(",")[0] + "..." - return f"![{alt}](image_error.png) " - - # Process other data URIs that are not images (truncate them) - elif src.startswith("data:") and not self.options.get("keep_data_uris", False): + # Remove dataURIs + if src.startswith("data:") and not self.options["keep_data_uris"]: src = src.split(",")[0] + "..." - # Return Markdown format image reference - return f"![{alt}]({src}{title_part})" + return "![%s](%s%s)" % (alt, src, title_part) def convert_soup(self, soup: Any) -> str: return super().convert_soup(soup) # type: ignore