pre-commit

2025-04-30 15:21:41 +12:00 · 2025-04-30 15:21:41 +12:00 · d8a8cda4db
commit d8a8cda4db
parent 92f427477a
2 changed files with 46 additions and 40 deletions
--- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@ -54,34 +54,34 @@ class DocxConverter(HtmlConverter):
                return True

        return False
-    
+
    def _sanitize_filename(self, filename: str) -> str:
        """
        Sanitize a filename by removing or replacing problematic characters.
-        
+
        Args:
            filename: The original filename
-            
+
        Returns:
            A sanitized filename safe for filesystem use
        """
        # Step 1: Normalize unicode characters
-        filename = unicodedata.normalize('NFKD', filename)
-        
+        filename = unicodedata.normalize("NFKD", filename)
+
        # Step 2: Remove invalid characters and replace spaces with underscores
        # Keep alphanumeric characters, underscores, hyphens, and periods
-        sanitized = re.sub(r'[^\w\-\.]', '_', filename)
-        
+        sanitized = re.sub(r"[^\w\-\.]", "_", filename)
+
        # Step 3: Collapse multiple underscores
-        sanitized = re.sub(r'_+', '_', sanitized)
-        
+        sanitized = re.sub(r"_+", "_", sanitized)
+
        # Step 4: Remove leading/trailing underscores
-        sanitized = sanitized.strip('_')
-        
+        sanitized = sanitized.strip("_")
+
        # Step 5: Ensure we have a valid filename (default if empty)
        if not sanitized:
            sanitized = "unnamed"
-        
+
        return sanitized

    def _get_document_name(self, stream_info: StreamInfo) -> str:
@ -94,21 +94,21 @@ class DocxConverter(HtmlConverter):
            name, _ = os.path.splitext(basename)
            if name:
                return self._sanitize_filename(name)
-        
+
        # If local_path exists, try to extract from local path
        if stream_info.local_path:
            basename = os.path.basename(stream_info.local_path)
            name, _ = os.path.splitext(basename)
            if name:
                return self._sanitize_filename(name)
-                
+
        # If URL exists, try to extract from URL
        if stream_info.url:
            basename = os.path.basename(stream_info.url)
            name, _ = os.path.splitext(basename)
            if name:
                return self._sanitize_filename(name)
-        
+
        # Default name
        return "docx_document"

@ -117,7 +117,7 @@ class DocxConverter(HtmlConverter):
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
-    ) -> DocumentConverterResult:     
+    ) -> DocumentConverterResult:
        # Check dependencies
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
@ -139,20 +139,22 @@ class DocxConverter(HtmlConverter):

        style_map = kwargs.get("style_map", None)
        pre_process_stream = pre_process_docx(file_stream)
-        
+
        # Convert to HTML and pass necessary parameters to HTML converter
-        html_content = mammoth.convert_to_html(pre_process_stream, style_map=style_map).value
-        
+        html_content = mammoth.convert_to_html(
+            pre_process_stream, style_map=style_map
+        ).value
+
        # Create new StreamInfo to pass to HTML converter
        html_stream_info = stream_info.copy_and_update(
-            mimetype="text/html",
-            extension=".html"
+            mimetype="text/html", extension=".html"
        )
-        
+
        # Use io.BytesIO to create binary stream
        from io import BytesIO
+
        return self._html_converter.convert(
            file_stream=BytesIO(html_content.encode("utf-8")),
            stream_info=html_stream_info,
            **kwargs,
-        )
+        )
--- a/packages/markitdown/src/markitdown/converters/_markdownify.py
+++ b/packages/markitdown/src/markitdown/converters/_markdownify.py
@ -27,7 +27,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
        # Apply basic options
        options["heading_style"] = options.get("heading_style", markdownify.ATX)
        options["keep_data_uris"] = options.get("keep_data_uris", False)
-        
+
        # Initialize parent class
        super().__init__(**options)

@ -107,62 +107,66 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
        src = el.attrs.get("src", None) or el.attrs.get("data-src", None) or ""
        title = el.attrs.get("title", None) or ""
        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
-        
+
        # If in inline mode and not preserved, return alt text
-        if (
-            convert_as_inline
-            and el.parent.name not in self.options.get("keep_inline_images_in", [])
+        if convert_as_inline and el.parent.name not in self.options.get(
+            "keep_inline_images_in", []
        ):
            return alt

        # Process data URI format images
-        if src.startswith("data:image") and not self.options.get("keep_data_uris", False):
+        if src.startswith("data:image") and not self.options.get(
+            "keep_data_uris", False
+        ):
            try:
                # Parse MIME type
                mime_type = src.split(";")[0].replace("data:", "")
-                
+
                # Get file extension
                ext = {
                    "image/png": ".png",
                    "image/jpeg": ".jpg",
                    "image/jpg": ".jpg",
-                    "image/gif": ".gif"
+                    "image/gif": ".gif",
                }.get(mime_type, ".png")
-                
+
                # Decode base64 data
                encoded = src.split(",")[1]
                image_data = base64.b64decode(encoded)
-                
+
                # Generate unique filename
                hashname = hashlib.sha256(image_data).hexdigest()[:8]
                filename = f"image_{hashname}{ext}"
-                
+
                # Determine output directory
-                if hasattr(self, 'conversion_name') and self.conversion_name:
+                if hasattr(self, "conversion_name") and self.conversion_name:
                    # If conversion_name exists, create subfolder
-                    output_dir = os.path.join(self.image_output_dir, self.conversion_name)
+                    output_dir = os.path.join(
+                        self.image_output_dir, self.conversion_name
+                    )
                else:
                    # Otherwise use base directory
                    output_dir = self.image_output_dir
-                
+
                # Ensure directory exists
                os.makedirs(output_dir, exist_ok=True)
-                
+
                # Save image file
                filepath = os.path.join(output_dir, filename)
                with open(filepath, "wb") as f:
                    f.write(image_data)
-                
+
                # Update src to relative path
                src = os.path.join(output_dir, filename).replace("\\", "/")

                # If alt text is empty, use the image filename (without extension) as alt text
                if not alt:
                    alt = f"image_{hashname}"
-                
+
            except Exception as e:
                error_msg = f"Error saving image: {str(e)}"
                import traceback
+
                traceback.print_exc(file=sys.stderr)
                # If extraction fails, revert to original truncating behavior
                src = src.split(",")[0] + "..."