add options to keep data uris
This commit is contained in:
parent
f58a864951
commit
42fb33a32e
1 changed files with 44 additions and 15 deletions
|
|
@ -72,6 +72,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
||||||
|
|
||||||
def __init__(self, **options: Any):
|
def __init__(self, **options: Any):
|
||||||
options["heading_style"] = options.get("heading_style", markdownify.ATX)
|
options["heading_style"] = options.get("heading_style", markdownify.ATX)
|
||||||
|
self.keep_data_uris = options.pop("keep_data_uris", False)
|
||||||
# Explicitly cast options to the expected type if necessary
|
# Explicitly cast options to the expected type if necessary
|
||||||
super().__init__(**options)
|
super().__init__(**options)
|
||||||
|
|
||||||
|
|
@ -133,10 +134,10 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
||||||
return alt
|
return alt
|
||||||
|
|
||||||
# Remove dataURIs
|
# Remove dataURIs
|
||||||
if src.startswith("data:"):
|
if not self.keep_data_uris and src.startswith("data:"):
|
||||||
src = src.split(",")[0] + "..."
|
src = src.split(",")[0] + "..."
|
||||||
|
|
||||||
return "![%s](%s%s)" % (alt, src, title_part)
|
return "![%s](%s%s)" % (alt, src, title_part)
|
||||||
|
|
||||||
def convert_soup(self, soup: Any) -> str:
|
def convert_soup(self, soup: Any) -> str:
|
||||||
return super().convert_soup(soup) # type: ignore
|
return super().convert_soup(soup) # type: ignore
|
||||||
|
|
@ -189,6 +190,10 @@ class PlainTextConverter(DocumentConverter):
|
||||||
class HtmlConverter(DocumentConverter):
|
class HtmlConverter(DocumentConverter):
|
||||||
"""Anything with content type text/html"""
|
"""Anything with content type text/html"""
|
||||||
|
|
||||||
|
def __init__(self, keep_data_uris: Optional[bool] = False):
|
||||||
|
self.keep_data_uris = keep_data_uris
|
||||||
|
super().__init__()
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, local_path: str, **kwargs: Any
|
self, local_path: str, **kwargs: Any
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
|
|
@ -217,9 +222,13 @@ class HtmlConverter(DocumentConverter):
|
||||||
body_elm = soup.find("body")
|
body_elm = soup.find("body")
|
||||||
webpage_text = ""
|
webpage_text = ""
|
||||||
if body_elm:
|
if body_elm:
|
||||||
webpage_text = _CustomMarkdownify().convert_soup(body_elm)
|
webpage_text = _CustomMarkdownify(
|
||||||
|
keep_data_uris=self.keep_data_uris
|
||||||
|
).convert_soup(body_elm)
|
||||||
else:
|
else:
|
||||||
webpage_text = _CustomMarkdownify().convert_soup(soup)
|
webpage_text = _CustomMarkdownify(
|
||||||
|
keep_data_uris=self.keep_data_uris
|
||||||
|
).convert_soup(soup)
|
||||||
|
|
||||||
assert isinstance(webpage_text, str)
|
assert isinstance(webpage_text, str)
|
||||||
|
|
||||||
|
|
@ -232,6 +241,10 @@ class HtmlConverter(DocumentConverter):
|
||||||
class RSSConverter(DocumentConverter):
|
class RSSConverter(DocumentConverter):
|
||||||
"""Convert RSS / Atom type to markdown"""
|
"""Convert RSS / Atom type to markdown"""
|
||||||
|
|
||||||
|
def __init__(self, keep_data_uris: Optional[bool] = False):
|
||||||
|
self.keep_data_uris = keep_data_uris
|
||||||
|
super().__init__()
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, local_path: str, **kwargs
|
self, local_path: str, **kwargs
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
|
|
@ -347,7 +360,9 @@ class RSSConverter(DocumentConverter):
|
||||||
try:
|
try:
|
||||||
# using bs4 because many RSS feeds have HTML-styled content
|
# using bs4 because many RSS feeds have HTML-styled content
|
||||||
soup = BeautifulSoup(content, "html.parser")
|
soup = BeautifulSoup(content, "html.parser")
|
||||||
return _CustomMarkdownify().convert_soup(soup)
|
return _CustomMarkdownify(keep_data_uris=self.keep_data_uris).convert_soup(
|
||||||
|
soup
|
||||||
|
)
|
||||||
except BaseException as _:
|
except BaseException as _:
|
||||||
return content
|
return content
|
||||||
|
|
||||||
|
|
@ -369,6 +384,10 @@ class RSSConverter(DocumentConverter):
|
||||||
class WikipediaConverter(DocumentConverter):
|
class WikipediaConverter(DocumentConverter):
|
||||||
"""Handle Wikipedia pages separately, focusing only on the main document content."""
|
"""Handle Wikipedia pages separately, focusing only on the main document content."""
|
||||||
|
|
||||||
|
def __init__(self, keep_data_uris: Optional[bool] = False):
|
||||||
|
self.keep_data_uris = keep_data_uris
|
||||||
|
super().__init__()
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, local_path: str, **kwargs: Any
|
self, local_path: str, **kwargs: Any
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
|
|
@ -403,11 +422,13 @@ class WikipediaConverter(DocumentConverter):
|
||||||
assert isinstance(main_title, str)
|
assert isinstance(main_title, str)
|
||||||
|
|
||||||
# Convert the page
|
# Convert the page
|
||||||
webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup(
|
webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify(
|
||||||
body_elm
|
keep_data_uris=self.keep_data_uris
|
||||||
)
|
).convert_soup(body_elm)
|
||||||
else:
|
else:
|
||||||
webpage_text = _CustomMarkdownify().convert_soup(soup)
|
webpage_text = _CustomMarkdownify(
|
||||||
|
keep_data_uris=self.keep_data_uris
|
||||||
|
).convert_soup(soup)
|
||||||
|
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
title=main_title,
|
title=main_title,
|
||||||
|
|
@ -609,6 +630,10 @@ class IpynbConverter(DocumentConverter):
|
||||||
|
|
||||||
|
|
||||||
class BingSerpConverter(DocumentConverter):
|
class BingSerpConverter(DocumentConverter):
|
||||||
|
def __init__(self, keep_data_uris: Optional[bool] = False):
|
||||||
|
self.keep_data_uris = keep_data_uris
|
||||||
|
super().__init__()
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Handle Bing results pages (only the organic search results).
|
Handle Bing results pages (only the organic search results).
|
||||||
NOTE: It is better to use the Bing API
|
NOTE: It is better to use the Bing API
|
||||||
|
|
@ -640,7 +665,7 @@ class BingSerpConverter(DocumentConverter):
|
||||||
slug.extract()
|
slug.extract()
|
||||||
|
|
||||||
# Parse the algorithmic results
|
# Parse the algorithmic results
|
||||||
_markdownify = _CustomMarkdownify()
|
_markdownify = _CustomMarkdownify(keep_data_uris=self.keep_data_uris)
|
||||||
results = list()
|
results = list()
|
||||||
for result in soup.find_all(class_="b_algo"):
|
for result in soup.find_all(class_="b_algo"):
|
||||||
# Rewrite redirect urls
|
# Rewrite redirect urls
|
||||||
|
|
@ -701,6 +726,9 @@ class DocxConverter(HtmlConverter):
|
||||||
Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
|
Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def __init__(self, keep_data_uris: Optional[bool] = False):
|
||||||
|
super().__init__(keep_data_uris=keep_data_uris)
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not a DOCX
|
# Bail if not a DOCX
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
|
|
@ -1337,6 +1365,7 @@ class MarkItDown:
|
||||||
llm_model: Optional[str] = None,
|
llm_model: Optional[str] = None,
|
||||||
style_map: Optional[str] = None,
|
style_map: Optional[str] = None,
|
||||||
exiftool_path: Optional[str] = None,
|
exiftool_path: Optional[str] = None,
|
||||||
|
keep_data_uris: Optional[bool] = False,
|
||||||
# Deprecated
|
# Deprecated
|
||||||
mlm_client: Optional[Any] = None,
|
mlm_client: Optional[Any] = None,
|
||||||
mlm_model: Optional[str] = None,
|
mlm_model: Optional[str] = None,
|
||||||
|
|
@ -1389,12 +1418,12 @@ class MarkItDown:
|
||||||
# Later registrations are tried first / take higher priority than earlier registrations
|
# Later registrations are tried first / take higher priority than earlier registrations
|
||||||
# To this end, the most specific converters should appear below the most generic converters
|
# To this end, the most specific converters should appear below the most generic converters
|
||||||
self.register_page_converter(PlainTextConverter())
|
self.register_page_converter(PlainTextConverter())
|
||||||
self.register_page_converter(HtmlConverter())
|
self.register_page_converter(HtmlConverter(keep_data_uris=keep_data_uris))
|
||||||
self.register_page_converter(RSSConverter())
|
self.register_page_converter(RSSConverter(keep_data_uris=keep_data_uris))
|
||||||
self.register_page_converter(WikipediaConverter())
|
self.register_page_converter(WikipediaConverter(keep_data_uris=keep_data_uris))
|
||||||
self.register_page_converter(YouTubeConverter())
|
self.register_page_converter(YouTubeConverter())
|
||||||
self.register_page_converter(BingSerpConverter())
|
self.register_page_converter(BingSerpConverter(keep_data_uris=keep_data_uris))
|
||||||
self.register_page_converter(DocxConverter())
|
self.register_page_converter(DocxConverter(keep_data_uris=keep_data_uris))
|
||||||
self.register_page_converter(XlsxConverter())
|
self.register_page_converter(XlsxConverter())
|
||||||
self.register_page_converter(XlsConverter())
|
self.register_page_converter(XlsConverter())
|
||||||
self.register_page_converter(PptxConverter())
|
self.register_page_converter(PptxConverter())
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue