From 1b0d1491bed01948e025a1899b6938583c15c72f Mon Sep 17 00:00:00 2001 From: Mauro Druwel Date: Sat, 4 Jan 2025 12:54:03 +0100 Subject: [PATCH] Pre-commit --- src/markitdown/_markitdown.py | 12 +++++++----- tests/test_markitdown.py | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index d6f550a..bf1900c 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -705,18 +705,18 @@ class DocxConverter(HtmlConverter): """Sanitizes a string to make it a valid file name across different operating systems.""" # Normalize underscore name = re.sub(r"\s+", "_", name.strip()) - + # Replace invalid characters with underscores name = re.sub(r'[\\/*?:"<>|]', "_", name) - + # Remove leading and trailing dots and spaces name = name.strip(" .") - + # Limit the length of the filename to a reasonable length (e.g., 251 characters) max_length = 251 if len(name) > max_length: name = name[:max_length] - + return name def truncate_filename(self, name: str, max_length: int, extension: str = "") -> str: @@ -736,7 +736,9 @@ class DocxConverter(HtmlConverter): while os.path.exists(unique_path): suffix = f"_{counter}" # Ensure base is short enough to add the suffix - truncated_base = self.truncate_filename(base, max_length - len(suffix) - len(ext)) + truncated_base = self.truncate_filename( + base, max_length - len(suffix) - len(ext) + ) unique_path = f"{truncated_base}{suffix}{ext}" counter += 1 diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index f32022a..261376e 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -96,7 +96,7 @@ DOCX_IMAGES_TEST_STRINGS = [ "# Abstract", "# Introduction", "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", - "GitHub_-_microsoft_markitdown__Python_tool_for_converting_files_and_office_documents_to_Markdown.png" + "GitHub_-_microsoft_markitdown__Python_tool_for_converting_files_and_office_documents_to_Markdown.png", ] PPTX_TEST_STRINGS = [