Pre-commit

This commit is contained in:
Mauro Druwel 2025-01-04 12:54:03 +01:00
parent 3a6f023f0b
commit 1b0d1491be
2 changed files with 8 additions and 6 deletions

View file

@ -705,18 +705,18 @@ class DocxConverter(HtmlConverter):
"""Sanitizes a string to make it a valid file name across different operating systems.""" """Sanitizes a string to make it a valid file name across different operating systems."""
# Normalize underscore # Normalize underscore
name = re.sub(r"\s+", "_", name.strip()) name = re.sub(r"\s+", "_", name.strip())
# Replace invalid characters with underscores # Replace invalid characters with underscores
name = re.sub(r'[\\/*?:"<>|]', "_", name) name = re.sub(r'[\\/*?:"<>|]', "_", name)
# Remove leading and trailing dots and spaces # Remove leading and trailing dots and spaces
name = name.strip(" .") name = name.strip(" .")
# Limit the length of the filename to a reasonable length (e.g., 251 characters) # Limit the length of the filename to a reasonable length (e.g., 251 characters)
max_length = 251 max_length = 251
if len(name) > max_length: if len(name) > max_length:
name = name[:max_length] name = name[:max_length]
return name return name
def truncate_filename(self, name: str, max_length: int, extension: str = "") -> str: def truncate_filename(self, name: str, max_length: int, extension: str = "") -> str:
@ -736,7 +736,9 @@ class DocxConverter(HtmlConverter):
while os.path.exists(unique_path): while os.path.exists(unique_path):
suffix = f"_{counter}" suffix = f"_{counter}"
# Ensure base is short enough to add the suffix # Ensure base is short enough to add the suffix
truncated_base = self.truncate_filename(base, max_length - len(suffix) - len(ext)) truncated_base = self.truncate_filename(
base, max_length - len(suffix) - len(ext)
)
unique_path = f"{truncated_base}{suffix}{ext}" unique_path = f"{truncated_base}{suffix}{ext}"
counter += 1 counter += 1

View file

@ -96,7 +96,7 @@ DOCX_IMAGES_TEST_STRINGS = [
"# Abstract", "# Abstract",
"# Introduction", "# Introduction",
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"GitHub_-_microsoft_markitdown__Python_tool_for_converting_files_and_office_documents_to_Markdown.png" "GitHub_-_microsoft_markitdown__Python_tool_for_converting_files_and_office_documents_to_Markdown.png",
] ]
PPTX_TEST_STRINGS = [ PPTX_TEST_STRINGS = [