Update _markitdown.py to include images in markdown
This commit is contained in:
parent
125e206047
commit
46d0829f78
1 changed files with 39 additions and 9 deletions
|
|
@ -693,24 +693,54 @@ class PdfConverter(DocumentConverter):
|
||||||
|
|
||||||
class DocxConverter(HtmlConverter):
    """
    Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.

    Embedded images are written to an output directory and referenced from the
    generated Markdown by file path.
    """

    def sanitize_filename(self, name: str, fallback: str = "image", max_length: int = 128) -> str:
        """Sanitize a string so it can be used as a single file name component.

        Args:
            name: Arbitrary text, typically an image's alt text.
            fallback: Value returned when nothing usable is left after cleaning.
            max_length: Maximum number of characters kept from ``name``.

        Returns:
            The cleaned name, or ``fallback`` if the cleaned name is empty.
        """
        # Collapse every whitespace run (including newlines and tabs) to one space.
        name = re.sub(r"\s+", " ", name.strip())
        # Replace path separators, characters reserved on common file systems,
        # and remaining control characters (e.g. NUL, which open() rejects).
        name = re.sub(r'[\\/*?:"<>|\x00-\x1f\x7f]', "_", name)
        # Alt text can be a full sentence: cap the length, then drop trailing
        # dots/spaces so the result is never "." or ".." and has no dangling dot.
        name = name[:max_length].rstrip(" .")
        return name or fallback

    def convert_image(self, image, output_dir: str) -> dict:
        """Save one embedded image to ``output_dir`` and describe it for mammoth.

        Args:
            image: Image object supplied by mammoth; this method reads its
                ``alt_text`` and ``content_type`` attributes and calls ``open()``.
            output_dir: Directory the image file is written to (created if missing).

        Returns:
            ``{"src": <path>}`` for the written file, or ``{"src": ""}`` if the
            image could not be read or written.

        Raises:
            OSError: If ``output_dir`` cannot be created.
        """
        # Imported locally so this block does not depend on file-level imports
        # that are not visible here.
        import hashlib
        import logging
        import mimetypes

        os.makedirs(output_dir, exist_ok=True)

        try:
            with image.open() as image_bytes:
                data = image_bytes.read()

            # A content digest gives unnamed images a name that is stable across
            # runs, unlike hash(image), which is identity-based.
            digest = hashlib.sha256(data).hexdigest()[:12]
            stem = self.sanitize_filename(image.alt_text or "", fallback=f"image_{digest}")

            # Honour the real image type; fall back to .png when it is unknown.
            content_type = getattr(image, "content_type", None) or ""
            extension = mimetypes.guess_extension(content_type) or ".png"

            image_path = os.path.join(output_dir, stem + extension)
            # Two different images may share the same alt text: do not let the
            # second silently overwrite the first.
            if os.path.isfile(image_path):
                with open(image_path, "rb") as existing:
                    if existing.read() != data:
                        image_path = os.path.join(output_dir, f"{stem}_{digest}{extension}")

            with open(image_path, "wb") as img_file:
                img_file.write(data)
            return {"src": image_path}
        except Exception:
            # Best effort: one unreadable image must not abort the whole
            # document, but leave a trace instead of failing silently.
            logging.getLogger(__name__).warning(
                "Could not extract image to %s", output_dir, exc_info=True
            )
            return {"src": ""}

    def convert(self, local_path: str, **kwargs) -> Union[None, DocumentConverterResult]:
        """Convert the DOCX file at ``local_path`` to Markdown.

        Recognised keyword arguments:
            file_extension: Must be ``.docx`` (case-insensitive), otherwise the
                file is not handled.
            style_map: Optional mammoth style map.
            image_output_dir: Directory for extracted images (default ``"images"``).

        Returns:
            The conversion result, or ``None`` if the file is not a DOCX or the
            conversion failed.
        """
        # Bail if not a DOCX
        extension = kwargs.get("file_extension") or ""
        if extension.lower() != ".docx":
            return None

        style_map = kwargs.get("style_map")
        image_output_dir = kwargs.get("image_output_dir", "images")

        try:
            with open(local_path, "rb") as docx_file:
                # Use module-qualified names: the previous implementation called
                # `mammoth.convert_to_html`, so `mammoth` is the name known to be
                # in scope. `img_element` is the documented image-handler wrapper.
                mammoth_result = mammoth.convert_to_html(
                    docx_file,
                    style_map=style_map,
                    convert_image=mammoth.images.img_element(
                        lambda img: self.convert_image(img, image_output_dir)
                    ),
                )
            return self._convert(mammoth_result.value)
        except Exception:
            import logging

            # Keep the "return None on failure" contract, but record why.
            logging.getLogger(__name__).exception("Failed to convert DOCX file %s", local_path)
            return None
|
||||||
|
|
||||||
|
|
||||||
class XlsxConverter(HtmlConverter):
|
class XlsxConverter(HtmlConverter):
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue