Optionally preserve base64 data URIs in Markdown output

Applies to _CustomMarkdownify and the PPTX converter.
This commit is contained in:
Yuzhong Zhang 2025-03-18 20:01:35 +08:00
parent a93e0567e6
commit 9f1bcf3b83
4 changed files with 21 additions and 7 deletions

View file

@ -72,6 +72,8 @@ class DocxConverter(HtmlConverter):
) )
style_map = kwargs.get("style_map", None) style_map = kwargs.get("style_map", None)
keep_data_uris = kwargs.get("keep_data_uris", False)
return self._html_converter.convert_string( return self._html_converter.convert_string(
mammoth.convert_to_html(file_stream, style_map=style_map).value mammoth.convert_to_html(file_stream, style_map=style_map).value,
keep_data_uris=keep_data_uris
) )

View file

@ -55,10 +55,11 @@ class HtmlConverter(DocumentConverter):
# Print only the main content # Print only the main content
body_elm = soup.find("body") body_elm = soup.find("body")
webpage_text = "" webpage_text = ""
keep_data_uris = kwargs.get("keep_data_uris", False)
if body_elm: if body_elm:
webpage_text = _CustomMarkdownify().convert_soup(body_elm) webpage_text = _CustomMarkdownify(keep_data_uris=keep_data_uris).convert_soup(body_elm)
else: else:
webpage_text = _CustomMarkdownify().convert_soup(soup) webpage_text = _CustomMarkdownify(keep_data_uris=keep_data_uris).convert_soup(soup)
assert isinstance(webpage_text, str) assert isinstance(webpage_text, str)

View file

@ -17,6 +17,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
def __init__(self, **options: Any): def __init__(self, **options: Any):
options["heading_style"] = options.get("heading_style", markdownify.ATX) options["heading_style"] = options.get("heading_style", markdownify.ATX)
self.keep_data_uris = options.pop("keep_data_uris", False)
# Explicitly cast options to the expected type if necessary # Explicitly cast options to the expected type if necessary
super().__init__(**options) super().__init__(**options)
@ -101,7 +102,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
return alt return alt
# Remove dataURIs # Remove dataURIs
if src.startswith("data:"): if src.startswith("data:") and not self.keep_data_uris:
src = src.split(",")[0] + "..." src = src.split(",")[0] + "..."
return "![%s](%s%s)" % (alt, src, title_part) return "![%s](%s%s)" % (alt, src, title_part)

View file

@ -78,6 +78,9 @@ class PptxConverter(DocumentConverter):
_dependency_exc_info[2] _dependency_exc_info[2]
) )
# Get the keep_data_uris parameter
keep_data_uris = kwargs.get("keep_data_uris", False)
# Perform the conversion # Perform the conversion
presentation = pptx.Presentation(file_stream) presentation = pptx.Presentation(file_stream)
md_content = "" md_content = ""
@ -140,9 +143,16 @@ class PptxConverter(DocumentConverter):
alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text) alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text)
alt_text = re.sub(r"\s+", " ", alt_text).strip() alt_text = re.sub(r"\s+", " ", alt_text).strip()
# A placeholder name # If keep_data_uris is True, use base64 encoding for images
filename = re.sub(r"\W", "", shape.name) + ".jpg" if keep_data_uris:
md_content += "\n![" + alt_text + "](" + filename + ")\n" blob = shape.image.blob
content_type = shape.image.content_type or "image/png"
b64_string = base64.b64encode(blob).decode("utf-8")
md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n"
else:
# A placeholder name
filename = re.sub(r"\W", "", shape.name) + ".jpg"
md_content += "\n![" + alt_text + "](" + filename + ")\n"
# Tables # Tables
if self._is_table(shape): if self._is_table(shape):