Optionally preserve base64 strings (data URIs) in Markdown output
Applies to _CustomMarkdownify and pptx
This commit is contained in:
parent
a93e0567e6
commit
9f1bcf3b83
4 changed files with 21 additions and 7 deletions
|
|
@ -72,6 +72,8 @@ class DocxConverter(HtmlConverter):
|
||||||
)
|
)
|
||||||
|
|
||||||
style_map = kwargs.get("style_map", None)
|
style_map = kwargs.get("style_map", None)
|
||||||
|
keep_data_uris = kwargs.get("keep_data_uris", False)
|
||||||
return self._html_converter.convert_string(
|
return self._html_converter.convert_string(
|
||||||
mammoth.convert_to_html(file_stream, style_map=style_map).value
|
mammoth.convert_to_html(file_stream, style_map=style_map).value,
|
||||||
|
keep_data_uris=keep_data_uris
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -55,10 +55,11 @@ class HtmlConverter(DocumentConverter):
|
||||||
# Print only the main content
|
# Print only the main content
|
||||||
body_elm = soup.find("body")
|
body_elm = soup.find("body")
|
||||||
webpage_text = ""
|
webpage_text = ""
|
||||||
|
keep_data_uris = kwargs.get("keep_data_uris", False)
|
||||||
if body_elm:
|
if body_elm:
|
||||||
webpage_text = _CustomMarkdownify().convert_soup(body_elm)
|
webpage_text = _CustomMarkdownify(keep_data_uris=keep_data_uris).convert_soup(body_elm)
|
||||||
else:
|
else:
|
||||||
webpage_text = _CustomMarkdownify().convert_soup(soup)
|
webpage_text = _CustomMarkdownify(keep_data_uris=keep_data_uris).convert_soup(soup)
|
||||||
|
|
||||||
assert isinstance(webpage_text, str)
|
assert isinstance(webpage_text, str)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -17,6 +17,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
||||||
|
|
||||||
def __init__(self, **options: Any):
|
def __init__(self, **options: Any):
|
||||||
options["heading_style"] = options.get("heading_style", markdownify.ATX)
|
options["heading_style"] = options.get("heading_style", markdownify.ATX)
|
||||||
|
self.keep_data_uris = options.pop("keep_data_uris", False)
|
||||||
# Explicitly cast options to the expected type if necessary
|
# Explicitly cast options to the expected type if necessary
|
||||||
super().__init__(**options)
|
super().__init__(**options)
|
||||||
|
|
||||||
|
|
@ -101,7 +102,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
||||||
return alt
|
return alt
|
||||||
|
|
||||||
# Remove dataURIs
|
# Remove dataURIs
|
||||||
if src.startswith("data:"):
|
if src.startswith("data:") and not self.keep_data_uris:
|
||||||
src = src.split(",")[0] + "..."
|
src = src.split(",")[0] + "..."
|
||||||
|
|
||||||
return "" % (alt, src, title_part)
|
return "" % (alt, src, title_part)
|
||||||
|
|
|
||||||
|
|
@ -78,6 +78,9 @@ class PptxConverter(DocumentConverter):
|
||||||
_dependency_exc_info[2]
|
_dependency_exc_info[2]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Get the keep_data_uris parameter
|
||||||
|
keep_data_uris = kwargs.get("keep_data_uris", False)
|
||||||
|
|
||||||
# Perform the conversion
|
# Perform the conversion
|
||||||
presentation = pptx.Presentation(file_stream)
|
presentation = pptx.Presentation(file_stream)
|
||||||
md_content = ""
|
md_content = ""
|
||||||
|
|
@ -140,9 +143,16 @@ class PptxConverter(DocumentConverter):
|
||||||
alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text)
|
alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text)
|
||||||
alt_text = re.sub(r"\s+", " ", alt_text).strip()
|
alt_text = re.sub(r"\s+", " ", alt_text).strip()
|
||||||
|
|
||||||
# A placeholder name
|
# If keep_data_uris is True, use base64 encoding for images
|
||||||
filename = re.sub(r"\W", "", shape.name) + ".jpg"
|
if keep_data_uris:
|
||||||
md_content += "\n\n"
|
blob = shape.image.blob
|
||||||
|
content_type = shape.image.content_type or "image/png"
|
||||||
|
b64_string = base64.b64encode(blob).decode("utf-8")
|
||||||
|
md_content += f"\n\n"
|
||||||
|
else:
|
||||||
|
# A placeholder name
|
||||||
|
filename = re.sub(r"\W", "", shape.name) + ".jpg"
|
||||||
|
md_content += "\n\n"
|
||||||
|
|
||||||
# Tables
|
# Tables
|
||||||
if self._is_table(shape):
|
if self._is_table(shape):
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue