diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py index 19ade96..434c869 100644 --- a/packages/markitdown/src/markitdown/converters/_docx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py @@ -54,34 +54,34 @@ class DocxConverter(HtmlConverter): return True return False - + def _sanitize_filename(self, filename: str) -> str: """ Sanitize a filename by removing or replacing problematic characters. - + Args: filename: The original filename - + Returns: A sanitized filename safe for filesystem use """ # Step 1: Normalize unicode characters - filename = unicodedata.normalize('NFKD', filename) - + filename = unicodedata.normalize("NFKD", filename) + # Step 2: Remove invalid characters and replace spaces with underscores # Keep alphanumeric characters, underscores, hyphens, and periods - sanitized = re.sub(r'[^\w\-\.]', '_', filename) - + sanitized = re.sub(r"[^\w\-\.]", "_", filename) + # Step 3: Collapse multiple underscores - sanitized = re.sub(r'_+', '_', sanitized) - + sanitized = re.sub(r"_+", "_", sanitized) + # Step 4: Remove leading/trailing underscores - sanitized = sanitized.strip('_') - + sanitized = sanitized.strip("_") + # Step 5: Ensure we have a valid filename (default if empty) if not sanitized: sanitized = "unnamed" - + return sanitized def _get_document_name(self, stream_info: StreamInfo) -> str: @@ -94,21 +94,21 @@ class DocxConverter(HtmlConverter): name, _ = os.path.splitext(basename) if name: return self._sanitize_filename(name) - + # If local_path exists, try to extract from local path if stream_info.local_path: basename = os.path.basename(stream_info.local_path) name, _ = os.path.splitext(basename) if name: return self._sanitize_filename(name) - + # If URL exists, try to extract from URL if stream_info.url: basename = os.path.basename(stream_info.url) name, _ = os.path.splitext(basename) if name: return self._sanitize_filename(name) - + # Default name return "docx_document" @@ -117,7 +117,7 @@ class DocxConverter(HtmlConverter): file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter - ) -> DocumentConverterResult: + ) -> DocumentConverterResult: # Check dependencies if _dependency_exc_info is not None: raise MissingDependencyException( @@ -139,20 +139,22 @@ class DocxConverter(HtmlConverter): style_map = kwargs.get("style_map", None) pre_process_stream = pre_process_docx(file_stream) - + # Convert to HTML and pass necessary parameters to HTML converter - html_content = mammoth.convert_to_html(pre_process_stream, style_map=style_map).value - + html_content = mammoth.convert_to_html( + pre_process_stream, style_map=style_map + ).value + # Create new StreamInfo to pass to HTML converter html_stream_info = stream_info.copy_and_update( - mimetype="text/html", - extension=".html" + mimetype="text/html", extension=".html" ) - + # Use io.BytesIO to create binary stream from io import BytesIO + return self._html_converter.convert( file_stream=BytesIO(html_content.encode("utf-8")), stream_info=html_stream_info, **kwargs, - ) \ No newline at end of file + ) diff --git a/packages/markitdown/src/markitdown/converters/_markdownify.py b/packages/markitdown/src/markitdown/converters/_markdownify.py index 914a38a..a2faa85 100644 --- a/packages/markitdown/src/markitdown/converters/_markdownify.py +++ b/packages/markitdown/src/markitdown/converters/_markdownify.py @@ -27,7 +27,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter): # Apply basic options options["heading_style"] = options.get("heading_style", markdownify.ATX) options["keep_data_uris"] = options.get("keep_data_uris", False) - + # Initialize parent class super().__init__(**options) @@ -107,62 +107,66 @@ class _CustomMarkdownify(markdownify.MarkdownConverter): src = el.attrs.get("src", None) or el.attrs.get("data-src", None) or "" title = el.attrs.get("title", None) or "" title_part = ' "%s"' % title.replace('"', r"\"") if title else "" - + # If in inline mode and not preserved, return alt text - if ( - convert_as_inline - and el.parent.name not in self.options.get("keep_inline_images_in", []) + if convert_as_inline and el.parent.name not in self.options.get( + "keep_inline_images_in", [] ): return alt # Process data URI format images - if src.startswith("data:image") and not self.options.get("keep_data_uris", False): + if src.startswith("data:image") and not self.options.get( + "keep_data_uris", False + ): try: # Parse MIME type mime_type = src.split(";")[0].replace("data:", "") - + # Get file extension ext = { "image/png": ".png", "image/jpeg": ".jpg", "image/jpg": ".jpg", - "image/gif": ".gif" + "image/gif": ".gif", }.get(mime_type, ".png") - + # Decode base64 data encoded = src.split(",")[1] image_data = base64.b64decode(encoded) - + # Generate unique filename hashname = hashlib.sha256(image_data).hexdigest()[:8] filename = f"image_{hashname}{ext}" - + # Determine output directory - if hasattr(self, 'conversion_name') and self.conversion_name: + if hasattr(self, "conversion_name") and self.conversion_name: # If conversion_name exists, create subfolder - output_dir = os.path.join(self.image_output_dir, self.conversion_name) + output_dir = os.path.join( + self.image_output_dir, self.conversion_name + ) else: # Otherwise use base directory output_dir = self.image_output_dir - + # Ensure directory exists os.makedirs(output_dir, exist_ok=True) - + # Save image file filepath = os.path.join(output_dir, filename) with open(filepath, "wb") as f: f.write(image_data) - + # Update src to relative path src = os.path.join(output_dir, filename).replace("\\", "/") # If alt text is empty, use the image filename (without extension) as alt text if not alt: alt = f"image_{hashname}" - + except Exception as e: error_msg = f"Error saving image: {str(e)}" import traceback + traceback.print_exc(file=sys.stderr) # If extraction fails, revert to original truncating behavior src = src.split(",")[0] + "..."