diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py index c568acb..daba123 100644 --- a/packages/markitdown/src/markitdown/converters/_docx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py @@ -72,6 +72,8 @@ class DocxConverter(HtmlConverter): ) style_map = kwargs.get("style_map", None) + keep_data_uris = kwargs.get("keep_data_uris", False) return self._html_converter.convert_string( - mammoth.convert_to_html(file_stream, style_map=style_map).value + mammoth.convert_to_html(file_stream, style_map=style_map).value, + keep_data_uris=keep_data_uris ) diff --git a/packages/markitdown/src/markitdown/converters/_html_converter.py b/packages/markitdown/src/markitdown/converters/_html_converter.py index 8a8203d..7195772 100644 --- a/packages/markitdown/src/markitdown/converters/_html_converter.py +++ b/packages/markitdown/src/markitdown/converters/_html_converter.py @@ -55,10 +55,11 @@ class HtmlConverter(DocumentConverter): # Print only the main content body_elm = soup.find("body") webpage_text = "" + keep_data_uris = kwargs.get("keep_data_uris", False) if body_elm: - webpage_text = _CustomMarkdownify().convert_soup(body_elm) + webpage_text = _CustomMarkdownify(keep_data_uris=keep_data_uris).convert_soup(body_elm) else: - webpage_text = _CustomMarkdownify().convert_soup(soup) + webpage_text = _CustomMarkdownify(keep_data_uris=keep_data_uris).convert_soup(soup) assert isinstance(webpage_text, str) diff --git a/packages/markitdown/src/markitdown/converters/_markdownify.py b/packages/markitdown/src/markitdown/converters/_markdownify.py index ae99c0b..d98e1a3 100644 --- a/packages/markitdown/src/markitdown/converters/_markdownify.py +++ b/packages/markitdown/src/markitdown/converters/_markdownify.py @@ -17,6 +17,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter): def __init__(self, **options: Any): options["heading_style"] = options.get("heading_style", markdownify.ATX) + self.keep_data_uris = options.pop("keep_data_uris", False) # Explicitly cast options to the expected type if necessary super().__init__(**options) @@ -101,7 +102,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter): return alt # Remove dataURIs - if src.startswith("data:"): + if src.startswith("data:") and not self.keep_data_uris: src = src.split(",")[0] + "..." return "![%s](%s%s)" % (alt, src, title_part) diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py index e855382..a45a507 100644 --- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py @@ -78,6 +78,9 @@ class PptxConverter(DocumentConverter): _dependency_exc_info[2] ) + # Get the keep_data_uris parameter + keep_data_uris = kwargs.get("keep_data_uris", False) + # Perform the conversion presentation = pptx.Presentation(file_stream) md_content = "" @@ -140,9 +143,16 @@ class PptxConverter(DocumentConverter): alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text) alt_text = re.sub(r"\s+", " ", alt_text).strip() - # A placeholder name - filename = re.sub(r"\W", "", shape.name) + ".jpg" - md_content += "\n![" + alt_text + "](" + filename + ")\n" + # If keep_data_uris is True, use base64 encoding for images + if keep_data_uris: + blob = shape.image.blob + content_type = shape.image.content_type or "image/png" + b64_string = base64.b64encode(blob).decode("utf-8") + md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n" + else: + # A placeholder name + filename = re.sub(r"\W", "", shape.name) + ".jpg" + md_content += "\n![" + alt_text + "](" + filename + ")\n" # Tables if self._is_table(shape):