diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index 1656eaf..d6f550a 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -702,19 +702,60 @@ class DocxConverter(HtmlConverter):
     """
 
     def sanitize_filename(self, name: str) -> str:
-        """Sanitizes a string to make it a valid file name."""
-        # Normalize whitespace
-        name = re.sub(r"\s+", " ", name.strip())
+        """Sanitizes a string to make it a valid file name across different operating systems."""
+        # Collapse each run of whitespace into a single underscore
+        name = re.sub(r"\s+", "_", name.strip())
+
         # Replace invalid characters with underscores
-        return re.sub(r'[\\/*?:"<>|]', "_", name)
+        name = re.sub(r'[\\/*?:"<>|]', "_", name)
+
+        # Limit the length of the filename to a reasonable length (e.g., 251 characters)
+        max_length = 251
+        if len(name) > max_length:
+            name = name[:max_length]
+
+        # Remove leading and trailing dots and spaces. Done after truncating so
+        # the cut cannot expose a trailing dot, which Windows does not allow.
+        name = name.strip(" .")
+
+        # Never return an empty name: callers would end up with an extension-only file
+        return name or "unnamed"
+
+    def truncate_filename(self, name: str, max_length: int, extension: str = "") -> str:
+        """Truncates the filename so that name + extension fits within max_length."""
+        # Clamp at zero: a negative slice bound counts from the end of the string
+        # and would silently return the wrong prefix
+        max_base_length = max(max_length - len(extension), 0)
+        return name[:max_base_length]
+
+    def unique_filename(self, base_path: str, max_length: int = 251) -> str:
+        """Generates a unique path whose file name stays within the length limit."""
+        # Limit only the file name; truncating the whole path would cut into the
+        # directory part (or let the directory use up the entire budget)
+        directory, filename = os.path.split(base_path)
+        base, ext = os.path.splitext(filename)
+
+        counter = 1
+        candidate = self.truncate_filename(base, max_length, ext) + ext
+        while os.path.exists(os.path.join(directory, candidate)):
+            suffix = f"_{counter}{ext}"
+            # Ensure base is short enough to add the suffix
+            candidate = self.truncate_filename(base, max_length, suffix) + suffix
+            counter += 1
+
+        return os.path.join(directory, candidate)
 
     def convert_image(self, image, output_dir: str) -> dict:
-        """Handles image extraction and saving."""
+        """Handles image extraction and saving with collision avoidance and length limits."""
         os.makedirs(output_dir, exist_ok=True)
 
         raw_name = image.alt_text or f"image_{hash(image)}"
-        image_name = self.sanitize_filename(raw_name) + ".png"
-        image_path = os.path.join(output_dir, image_name)
+        sanitized_name = self.sanitize_filename(raw_name)
+        truncated_name = self.truncate_filename(sanitized_name, 251, ".png")
+        image_path = os.path.join(output_dir, truncated_name + ".png")
+
+        # Ensure unique filename
+        image_path = self.unique_filename(image_path)
 
         try:
             with image.open() as image_bytes:
diff --git a/tests/test_files/test_with_images.docx b/tests/test_files/test_with_images.docx
new file mode 100644
index 0000000..fe34fc6
Binary files /dev/null and b/tests/test_files/test_with_images.docx differ
diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
index 3333bcb..f32022a 100644
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@@ -89,6 +89,16 @@ DOCX_COMMENT_TEST_STRINGS = [
     "Yet another comment in the doc. 55yiyi-asd09",
 ]
 
+DOCX_IMAGES_TEST_STRINGS = [
+    "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
+    "49e168b7-d2ae-407f-a055-2167576f39a1",
+    "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
+    "# Abstract",
+    "# Introduction",
+    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
+    "GitHub_-_microsoft_markitdown__Python_tool_for_converting_files_and_office_documents_to_Markdown.png",
+]
+
 PPTX_TEST_STRINGS = [
     "2cdda5c8-e50e-4db4-b5f0-9722a649f455",
     "04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
@@ -206,6 +216,10 @@ def test_markitdown_local() -> None:
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))
     validate_strings(result, DOCX_TEST_STRINGS)
 
+    # Test DOCX processing, with images
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_with_images.docx"))
+    validate_strings(result, DOCX_IMAGES_TEST_STRINGS)
+
     # Test DOCX processing, with comments
     result = markitdown.convert(
         os.path.join(TEST_FILES_DIR, "test_with_comment.docx"),