This commit is contained in:
Mauro Druwel 2025-02-09 17:59:34 +06:00 committed by GitHub
commit 98928e101a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 98 additions and 9 deletions

View file

@ -717,24 +717,99 @@ class PdfConverter(DocumentConverter):
class DocxConverter(HtmlConverter): class DocxConverter(HtmlConverter):
""" """
Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible. Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
""" """
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def sanitize_filename(self, name: str) -> str:
"""Sanitizes a string to make it a valid file name across different operating systems."""
# Normalize underscore
name = re.sub(r"\s+", "_", name.strip())
# Replace invalid characters with underscores
name = re.sub(r'[\\/*?:"<>|]', "_", name)
# Remove leading and trailing dots and spaces
name = name.strip(" .")
# Limit the length of the filename to a reasonable length (e.g., 251 characters)
max_length = 251
if len(name) > max_length:
name = name[:max_length]
return name
def truncate_filename(self, name: str, max_length: int, extension: str = "") -> str:
"""Truncates the filename to ensure the final length is within the limit."""
max_base_length = max_length - len(extension)
if len(name) > max_base_length:
return name[:max_base_length]
return name
def unique_filename(self, base_path: str, max_length: int = 251) -> str:
"""Generates a unique filename while ensuring it stays within the length limit."""
base, ext = os.path.splitext(base_path)
truncated_base = self.truncate_filename(base, max_length, ext)
counter = 1
unique_path = f"{truncated_base}{ext}"
while os.path.exists(unique_path):
suffix = f"_{counter}"
# Ensure base is short enough to add the suffix
truncated_base = self.truncate_filename(
base, max_length - len(suffix) - len(ext)
)
unique_path = f"{truncated_base}{suffix}{ext}"
counter += 1
return unique_path
def convert_image(self, image, output_dir: str) -> dict:
"""Handles image extraction and saving with collision avoidance and length limits."""
os.makedirs(output_dir, exist_ok=True)
image.alt_text = image.alt_text.replace("\n", " ")
raw_name = image.alt_text or f"image_{hash(image)}"
sanitized_name = self.sanitize_filename(raw_name)
truncated_name = self.truncate_filename(sanitized_name, 251, ".png")
image_path = os.path.join(output_dir, truncated_name + ".png")
# Ensure unique filename
image_path = self.unique_filename(image_path)
try:
with image.open() as image_bytes:
with open(image_path, "wb") as img_file:
img_file.write(image_bytes.read())
return {"src": image_path, "alt": image.alt_text}
except Exception:
# Return an empty src if saving fails
return {"src": ""}
def convert(
self, local_path: str, **kwargs
) -> Union[None, DocumentConverterResult]:
# Bail if not a DOCX # Bail if not a DOCX
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
if extension.lower() != ".docx": if extension.lower() != ".docx":
return None return None
result = None try:
with open(local_path, "rb") as docx_file: with open(local_path, "rb") as docx_file:
style_map = kwargs.get("style_map", None) style_map = kwargs.get("style_map")
image_output_dir = kwargs.get("image_output_dir", "images")
result = mammoth.convert_to_html(docx_file, style_map=style_map) mammoth_result = mammoth.convert_to_html(
html_content = result.value docx_file,
result = self._convert(html_content) style_map=style_map,
convert_image=mammoth.images.inline(
lambda img: self.convert_image(img, image_output_dir)
),
)
return result html_content = mammoth_result.value
return self._convert(html_content)
except Exception:
return None
class XlsxConverter(HtmlConverter): class XlsxConverter(HtmlConverter):

BIN
tests/test_files/test_with_images.docx vendored Normal file

Binary file not shown.

View file

@ -89,6 +89,16 @@ DOCX_COMMENT_TEST_STRINGS = [
"Yet another comment in the doc. 55yiyi-asd09", "Yet another comment in the doc. 55yiyi-asd09",
] ]
DOCX_IMAGES_TEST_STRINGS = [
"314b0a30-5b04-470b-b9f7-eed2c2bec74a",
"49e168b7-d2ae-407f-a055-2167576f39a1",
"## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
"# Abstract",
"# Introduction",
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"GitHub_-_microsoft_markitdown__Python_tool_for_converting_files_and_office_documents_to_Markdown.png",
]
PPTX_TEST_STRINGS = [ PPTX_TEST_STRINGS = [
"2cdda5c8-e50e-4db4-b5f0-9722a649f455", "2cdda5c8-e50e-4db4-b5f0-9722a649f455",
"04191ea8-5c73-4215-a1d3-1cfb43aaaf12", "04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
@ -206,6 +216,10 @@ def test_markitdown_local() -> None:
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx")) result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))
validate_strings(result, DOCX_TEST_STRINGS) validate_strings(result, DOCX_TEST_STRINGS)
# Test DOCX processing, with images
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_with_images.docx"))
validate_strings(result, DOCX_IMAGES_TEST_STRINGS)
# Test DOCX processing, with comments # Test DOCX processing, with comments
result = markitdown.convert( result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "test_with_comment.docx"), os.path.join(TEST_FILES_DIR, "test_with_comment.docx"),