Update _markitdown.py to include images in markdown

This commit is contained in:
Mauro Druwel 2024-12-23 15:46:31 +01:00
parent 125e206047
commit 46d0829f78

View file

@ -693,24 +693,54 @@ class PdfConverter(DocumentConverter):
class DocxConverter(HtmlConverter):
"""
Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
"""
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
def sanitize_filename(self, name: str) -> str:
"""Sanitizes a string to make it a valid file name."""
# Normalize whitespace
name = re.sub(r'\s+', " ", name.strip())
# Replace invalid characters with underscores
return re.sub(r'[\\/*?:"<>|]', "_", name)
def convert_image(self, image, output_dir: str) -> dict:
"""Handles image extraction and saving."""
os.makedirs(output_dir, exist_ok=True)
raw_name = image.alt_text or f"image_{hash(image)}"
image_name = self.sanitize_filename(raw_name) + ".png"
image_path = os.path.join(output_dir, image_name)
try:
with image.open() as image_bytes:
with open(image_path, "wb") as img_file:
img_file.write(image_bytes.read())
return {"src": image_path}
except Exception:
# Return an empty src if saving fails
return {"src": ""}
def convert(self, local_path: str, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a DOCX
extension = kwargs.get("file_extension", "")
if extension.lower() != ".docx":
return None
result = None
with open(local_path, "rb") as docx_file:
style_map = kwargs.get("style_map", None)
try:
with open(local_path, "rb") as docx_file:
style_map = kwargs.get("style_map")
image_output_dir = kwargs.get("image_output_dir", "images")
result = mammoth.convert_to_html(docx_file, style_map=style_map)
html_content = result.value
result = self._convert(html_content)
mammoth_result = convert_to_html(
docx_file,
style_map=style_map,
convert_image=images.inline(lambda img: self.convert_image(img, image_output_dir)),
)
return result
html_content = mammoth_result.value
return self._convert(html_content)
except Exception:
return None
class XlsxConverter(HtmlConverter):