Add keep_data_uris parameter support to the other converters (Bing SERP, DOCX, RSS, Wikipedia)

This commit is contained in:
Yuzhong Zhang 2025-03-18 20:14:46 +08:00
parent 9f1bcf3b83
commit 41cd9b5e2a
4 changed files with 10 additions and 6 deletions

View file

@ -79,7 +79,7 @@ class BingSerpConverter(DocumentConverter):
slug.extract()
# Parse the algorithmic results
_markdownify = _CustomMarkdownify()
_markdownify = _CustomMarkdownify(keep_data_uris=kwargs.get("keep_data_uris", False))
results = list()
for result in soup.find_all(class_="b_algo"):
if not hasattr(result, "find_all"):

View file

@ -72,8 +72,7 @@ class DocxConverter(HtmlConverter):
)
style_map = kwargs.get("style_map", None)
keep_data_uris = kwargs.get("keep_data_uris", False)
return self._html_converter.convert_string(
mammoth.convert_to_html(file_stream, style_map=style_map).value,
keep_data_uris=keep_data_uris
keep_data_uris=kwargs.get("keep_data_uris", False)
)

View file

@ -28,6 +28,10 @@ CANDIDATE_FILE_EXTENSIONS = [
class RssConverter(DocumentConverter):
"""Convert RSS / Atom type to markdown"""
def __init__(self):
super().__init__()
self._kwargs = {}
def accepts(
self,
file_stream: BinaryIO,
@ -82,6 +86,7 @@ class RssConverter(DocumentConverter):
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
self._kwargs = kwargs
doc = minidom.parse(file_stream)
feed_type = self._feed_type(doc)
@ -166,7 +171,7 @@ class RssConverter(DocumentConverter):
try:
# using bs4 because many RSS feeds have HTML-styled content
soup = BeautifulSoup(content, "html.parser")
return _CustomMarkdownify().convert_soup(soup)
return _CustomMarkdownify(keep_data_uris=self._kwargs.get("keep_data_uris", False)).convert_soup(soup)
except BaseException as _:
return content

View file

@ -76,11 +76,11 @@ class WikipediaConverter(DocumentConverter):
main_title = title_elm.string
# Convert the page
webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup(
webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify(keep_data_uris=kwargs.get("keep_data_uris", False)).convert_soup(
body_elm
)
else:
webpage_text = _CustomMarkdownify().convert_soup(soup)
webpage_text = _CustomMarkdownify(keep_data_uris=kwargs.get("keep_data_uris", False)).convert_soup(soup)
return DocumentConverterResult(
markdown=webpage_text,