Add option to keep data URIs

This commit is contained in:
VoidIsVoid 2025-01-09 18:40:50 +08:00 committed by GitHub
parent f58a864951
commit 42fb33a32e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -72,6 +72,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
def __init__(self, **options: Any): def __init__(self, **options: Any):
options["heading_style"] = options.get("heading_style", markdownify.ATX) options["heading_style"] = options.get("heading_style", markdownify.ATX)
self.keep_data_uris = options.pop("keep_data_uris", False)
# Explicitly cast options to the expected type if necessary # Explicitly cast options to the expected type if necessary
super().__init__(**options) super().__init__(**options)
@ -133,10 +134,10 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
return alt return alt
# Remove dataURIs # Remove dataURIs
if src.startswith("data:"): if not self.keep_data_uris and src.startswith("data:"):
src = src.split(",")[0] + "..." src = src.split(",")[0] + "..."
return "![%s](%s%s)" % (alt, src, title_part) return "![%s%s](%s)" % (alt, title_part, src)
def convert_soup(self, soup: Any) -> str: def convert_soup(self, soup: Any) -> str:
return super().convert_soup(soup) # type: ignore return super().convert_soup(soup) # type: ignore
@ -189,6 +190,10 @@ class PlainTextConverter(DocumentConverter):
class HtmlConverter(DocumentConverter): class HtmlConverter(DocumentConverter):
"""Anything with content type text/html""" """Anything with content type text/html"""
def __init__(self, keep_data_uris: Optional[bool] = False):
self.keep_data_uris = keep_data_uris
super().__init__()
def convert( def convert(
self, local_path: str, **kwargs: Any self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]: ) -> Union[None, DocumentConverterResult]:
@ -217,9 +222,13 @@ class HtmlConverter(DocumentConverter):
body_elm = soup.find("body") body_elm = soup.find("body")
webpage_text = "" webpage_text = ""
if body_elm: if body_elm:
webpage_text = _CustomMarkdownify().convert_soup(body_elm) webpage_text = _CustomMarkdownify(
keep_data_uris=self.keep_data_uris
).convert_soup(body_elm)
else: else:
webpage_text = _CustomMarkdownify().convert_soup(soup) webpage_text = _CustomMarkdownify(
keep_data_uris=self.keep_data_uris
).convert_soup(soup)
assert isinstance(webpage_text, str) assert isinstance(webpage_text, str)
@ -232,6 +241,10 @@ class HtmlConverter(DocumentConverter):
class RSSConverter(DocumentConverter): class RSSConverter(DocumentConverter):
"""Convert RSS / Atom type to markdown""" """Convert RSS / Atom type to markdown"""
def __init__(self, keep_data_uris: Optional[bool] = False):
self.keep_data_uris = keep_data_uris
super().__init__()
def convert( def convert(
self, local_path: str, **kwargs self, local_path: str, **kwargs
) -> Union[None, DocumentConverterResult]: ) -> Union[None, DocumentConverterResult]:
@ -347,7 +360,9 @@ class RSSConverter(DocumentConverter):
try: try:
# using bs4 because many RSS feeds have HTML-styled content # using bs4 because many RSS feeds have HTML-styled content
soup = BeautifulSoup(content, "html.parser") soup = BeautifulSoup(content, "html.parser")
return _CustomMarkdownify().convert_soup(soup) return _CustomMarkdownify(keep_data_uris=self.keep_data_uris).convert_soup(
soup
)
except BaseException as _: except BaseException as _:
return content return content
@ -369,6 +384,10 @@ class RSSConverter(DocumentConverter):
class WikipediaConverter(DocumentConverter): class WikipediaConverter(DocumentConverter):
"""Handle Wikipedia pages separately, focusing only on the main document content.""" """Handle Wikipedia pages separately, focusing only on the main document content."""
def __init__(self, keep_data_uris: Optional[bool] = False):
self.keep_data_uris = keep_data_uris
super().__init__()
def convert( def convert(
self, local_path: str, **kwargs: Any self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]: ) -> Union[None, DocumentConverterResult]:
@ -403,11 +422,13 @@ class WikipediaConverter(DocumentConverter):
assert isinstance(main_title, str) assert isinstance(main_title, str)
# Convert the page # Convert the page
webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup( webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify(
body_elm keep_data_uris=self.keep_data_uris
) ).convert_soup(body_elm)
else: else:
webpage_text = _CustomMarkdownify().convert_soup(soup) webpage_text = _CustomMarkdownify(
keep_data_uris=self.keep_data_uris
).convert_soup(soup)
return DocumentConverterResult( return DocumentConverterResult(
title=main_title, title=main_title,
@ -609,6 +630,10 @@ class IpynbConverter(DocumentConverter):
class BingSerpConverter(DocumentConverter): class BingSerpConverter(DocumentConverter):
def __init__(self, keep_data_uris: Optional[bool] = False):
self.keep_data_uris = keep_data_uris
super().__init__()
""" """
Handle Bing results pages (only the organic search results). Handle Bing results pages (only the organic search results).
NOTE: It is better to use the Bing API NOTE: It is better to use the Bing API
@ -640,7 +665,7 @@ class BingSerpConverter(DocumentConverter):
slug.extract() slug.extract()
# Parse the algorithmic results # Parse the algorithmic results
_markdownify = _CustomMarkdownify() _markdownify = _CustomMarkdownify(keep_data_uris=self.keep_data_uris)
results = list() results = list()
for result in soup.find_all(class_="b_algo"): for result in soup.find_all(class_="b_algo"):
# Rewrite redirect urls # Rewrite redirect urls
@ -701,6 +726,9 @@ class DocxConverter(HtmlConverter):
Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible. Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
""" """
def __init__(self, keep_data_uris: Optional[bool] = False):
super().__init__(keep_data_uris=keep_data_uris)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a DOCX # Bail if not a DOCX
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
@ -1337,6 +1365,7 @@ class MarkItDown:
llm_model: Optional[str] = None, llm_model: Optional[str] = None,
style_map: Optional[str] = None, style_map: Optional[str] = None,
exiftool_path: Optional[str] = None, exiftool_path: Optional[str] = None,
keep_data_uris: Optional[bool] = False,
# Deprecated # Deprecated
mlm_client: Optional[Any] = None, mlm_client: Optional[Any] = None,
mlm_model: Optional[str] = None, mlm_model: Optional[str] = None,
@ -1389,12 +1418,12 @@ class MarkItDown:
# Later registrations are tried first / take higher priority than earlier registrations # Later registrations are tried first / take higher priority than earlier registrations
# To this end, the most specific converters should appear below the most generic converters # To this end, the most specific converters should appear below the most generic converters
self.register_page_converter(PlainTextConverter()) self.register_page_converter(PlainTextConverter())
self.register_page_converter(HtmlConverter()) self.register_page_converter(HtmlConverter(keep_data_uris=keep_data_uris))
self.register_page_converter(RSSConverter()) self.register_page_converter(RSSConverter(keep_data_uris=keep_data_uris))
self.register_page_converter(WikipediaConverter()) self.register_page_converter(WikipediaConverter(keep_data_uris=keep_data_uris))
self.register_page_converter(YouTubeConverter()) self.register_page_converter(YouTubeConverter())
self.register_page_converter(BingSerpConverter()) self.register_page_converter(BingSerpConverter(keep_data_uris=keep_data_uris))
self.register_page_converter(DocxConverter()) self.register_page_converter(DocxConverter(keep_data_uris=keep_data_uris))
self.register_page_converter(XlsxConverter()) self.register_page_converter(XlsxConverter())
self.register_page_converter(XlsConverter()) self.register_page_converter(XlsConverter())
self.register_page_converter(PptxConverter()) self.register_page_converter(PptxConverter())