Underscores, length limit, unique name, tests

This commit is contained in:
Mauro Druwel 2025-01-04 12:53:02 +01:00
parent 0a9e1f4d75
commit 3a6f023f0b
3 changed files with 59 additions and 7 deletions

View file

@ -702,19 +702,57 @@ class DocxConverter(HtmlConverter):
""" """
def sanitize_filename(self, name: str, max_length: int = 251) -> str:
    """Sanitize a string so it can be used as a file name on common operating systems.

    The following steps are applied in order:

    1. Surrounding whitespace is removed and every internal run of
       whitespace is collapsed into a single underscore.
    2. Characters that are not allowed in file names (backslash, slash,
       ``*``, ``?``, ``:``, double quote, ``<``, ``>``, ``|``) and ASCII
       control characters are replaced with underscores.
    3. Leading dots and spaces are removed.
    4. The name is cut to at most ``max_length`` characters.
    5. Trailing dots and spaces are removed.

    Args:
        name: The raw string to turn into a file name (without extension).
        max_length: Maximum number of characters to keep. Values below
            zero are treated as zero.

    Returns:
        The sanitized name. It may be empty if nothing usable remains.
    """
    # Collapse whitespace runs into underscores.
    name = re.sub(r"\s+", "_", name.strip())
    # Replace invalid and control characters with underscores.
    name = re.sub(r'[\\/*?:"<>|\x00-\x1f]', "_", name)
    # Drop leading dots/spaces first so they do not use up the length budget.
    name = name.lstrip(" .")
    name = name[: max(max_length, 0)]
    # Strip the tail only after truncating: cutting the string can expose a
    # dot that was previously in the middle of the name.
    return name.rstrip(" .")
def truncate_filename(self, name: str, max_length: int, extension: str = "") -> str:
    """Truncate ``name`` so that ``name + extension`` fits within ``max_length``.

    Args:
        name: The file name without its extension.
        max_length: Maximum allowed length of the name plus the extension.
        extension: Extension that will be appended later (for example
            ``".png"``); its length is reserved out of ``max_length``.

    Returns:
        ``name`` unchanged if it already fits, otherwise its leading
        characters. An empty string is returned when the extension alone
        uses up the whole budget.
    """
    # Clamp at zero: a negative slice bound would drop characters from the
    # end of the name instead of producing an empty base.
    max_base_length = max(max_length - len(extension), 0)
    return name[:max_base_length]
def unique_filename(self, base_path: str, max_length: int = 251) -> str:
    """Return a path that does not exist yet, keeping the file name within a length limit.

    The limit applies to the final path component only (name plus
    extension); the directory part of ``base_path`` is never shortened.
    If the candidate path already exists, ``_1``, ``_2``, ... is appended
    to the name (before the extension) until an unused path is found, and
    the name is shortened as needed to make room for the suffix.

    Args:
        base_path: Desired path, optionally including a directory.
        max_length: Maximum length of the file name including suffix and
            extension.

    Returns:
        ``base_path`` itself when it is free and short enough, otherwise a
        shortened and/or suffixed variant in the same directory.
    """
    filename = os.path.basename(base_path)
    # Keep the directory prefix byte-for-byte (including its separators).
    prefix = base_path[: len(base_path) - len(filename)]
    stem, ext = os.path.splitext(filename)

    def build(suffix: str) -> str:
        # Reserve room for the suffix and the extension; clamp at zero so a
        # tiny limit cannot turn into a negative slice bound.
        budget = max(max_length - len(suffix) - len(ext), 0)
        return f"{prefix}{stem[:budget]}{suffix}{ext}"

    unique_path = build("")
    counter = 1
    while os.path.exists(unique_path):
        unique_path = build(f"_{counter}")
        counter += 1
    return unique_path
def convert_image(self, image, output_dir: str) -> dict: def convert_image(self, image, output_dir: str) -> dict:
"""Handles image extraction and saving.""" """Handles image extraction and saving with collision avoidance and length limits."""
os.makedirs(output_dir, exist_ok=True) os.makedirs(output_dir, exist_ok=True)
raw_name = image.alt_text or f"image_{hash(image)}" raw_name = image.alt_text or f"image_{hash(image)}"
image_name = self.sanitize_filename(raw_name) + ".png" sanitized_name = self.sanitize_filename(raw_name)
image_path = os.path.join(output_dir, image_name) truncated_name = self.truncate_filename(sanitized_name, 251, ".png")
image_path = os.path.join(output_dir, truncated_name + ".png")
# Ensure unique filename
image_path = self.unique_filename(image_path)
try: try:
with image.open() as image_bytes: with image.open() as image_bytes:

BIN
tests/test_files/test_with_images.docx vendored Normal file

Binary file not shown.

View file

@ -89,6 +89,16 @@ DOCX_COMMENT_TEST_STRINGS = [
"Yet another comment in the doc. 55yiyi-asd09", "Yet another comment in the doc. 55yiyi-asd09",
] ]
# Strings checked against the conversion result of test_with_images.docx.
DOCX_IMAGES_TEST_STRINGS = [
    "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
    "49e168b7-d2ae-407f-a055-2167576f39a1",
    "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
    "# Abstract",
    "# Introduction",
    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    "GitHub_-_microsoft_markitdown__Python_tool_for_converting_files_and_office_documents_to_Markdown.png",
]
PPTX_TEST_STRINGS = [ PPTX_TEST_STRINGS = [
"2cdda5c8-e50e-4db4-b5f0-9722a649f455", "2cdda5c8-e50e-4db4-b5f0-9722a649f455",
"04191ea8-5c73-4215-a1d3-1cfb43aaaf12", "04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
@ -206,6 +216,10 @@ def test_markitdown_local() -> None:
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx")) result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))
validate_strings(result, DOCX_TEST_STRINGS) validate_strings(result, DOCX_TEST_STRINGS)
# Test DOCX processing, with images
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_with_images.docx"))
validate_strings(result, DOCX_IMAGES_TEST_STRINGS)
# Test DOCX processing, with comments # Test DOCX processing, with comments
result = markitdown.convert( result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "test_with_comment.docx"), os.path.join(TEST_FILES_DIR, "test_with_comment.docx"),