diff --git a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py index 3527d28..284170e 100644 --- a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py +++ b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py @@ -79,7 +79,7 @@ class BingSerpConverter(DocumentConverter): slug.extract() # Parse the algorithmic results - _markdownify = _CustomMarkdownify() + _markdownify = _CustomMarkdownify(keep_data_uris=kwargs.get("keep_data_uris", False)) results = list() for result in soup.find_all(class_="b_algo"): if not hasattr(result, "find_all"): diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py index daba123..07ca62a 100644 --- a/packages/markitdown/src/markitdown/converters/_docx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py @@ -72,8 +72,7 @@ class DocxConverter(HtmlConverter): ) style_map = kwargs.get("style_map", None) - keep_data_uris = kwargs.get("keep_data_uris", False) return self._html_converter.convert_string( mammoth.convert_to_html(file_stream, style_map=style_map).value, - keep_data_uris=keep_data_uris + keep_data_uris=kwargs.get("keep_data_uris", False) ) diff --git a/packages/markitdown/src/markitdown/converters/_rss_converter.py b/packages/markitdown/src/markitdown/converters/_rss_converter.py index 7c80d01..c77e84b 100644 --- a/packages/markitdown/src/markitdown/converters/_rss_converter.py +++ b/packages/markitdown/src/markitdown/converters/_rss_converter.py @@ -28,6 +28,10 @@ CANDIDATE_FILE_EXTENSIONS = [ class RssConverter(DocumentConverter): """Convert RSS / Atom type to markdown""" + def __init__(self): + super().__init__() + self._kwargs = {} + def accepts( self, file_stream: BinaryIO, @@ -82,6 +86,7 @@ class RssConverter(DocumentConverter): stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: + self._kwargs = kwargs doc = minidom.parse(file_stream) feed_type = self._feed_type(doc) @@ -166,7 +171,7 @@ class RssConverter(DocumentConverter): try: # using bs4 because many RSS feeds have HTML-styled content soup = BeautifulSoup(content, "html.parser") - return _CustomMarkdownify().convert_soup(soup) + return _CustomMarkdownify(keep_data_uris=self._kwargs.get("keep_data_uris", False)).convert_soup(soup) except BaseException as _: return content diff --git a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py index 39466c0..fa1dd37 100644 --- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py +++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py @@ -76,11 +76,11 @@ class WikipediaConverter(DocumentConverter): main_title = title_elm.string # Convert the page - webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup( + webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify(keep_data_uris=kwargs.get("keep_data_uris", False)).convert_soup( body_elm ) else: - webpage_text = _CustomMarkdownify().convert_soup(soup) + webpage_text = _CustomMarkdownify(keep_data_uris=kwargs.get("keep_data_uris", False)).convert_soup(soup) return DocumentConverterResult( markdown=webpage_text,