From 41cd9b5e2a5e9fc6d11e61419d73e37914c8832b Mon Sep 17 00:00:00 2001 From: Yuzhong Zhang <141388234+BetterAndBetterII@users.noreply.github.com> Date: Tue, 18 Mar 2025 20:14:46 +0800 Subject: [PATCH] Add keep_data_uris parameter support to the other converters --- .../src/markitdown/converters/_bing_serp_converter.py | 2 +- .../src/markitdown/converters/_docx_converter.py | 3 +-- .../markitdown/src/markitdown/converters/_rss_converter.py | 7 ++++++- .../src/markitdown/converters/_wikipedia_converter.py | 4 ++-- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py index 3527d28..284170e 100644 --- a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py +++ b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py @@ -79,7 +79,7 @@ class BingSerpConverter(DocumentConverter): slug.extract() # Parse the algorithmic results - _markdownify = _CustomMarkdownify() + _markdownify = _CustomMarkdownify(keep_data_uris=kwargs.get("keep_data_uris", False)) results = list() for result in soup.find_all(class_="b_algo"): if not hasattr(result, "find_all"): diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py index daba123..07ca62a 100644 --- a/packages/markitdown/src/markitdown/converters/_docx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py @@ -72,8 +72,7 @@ class DocxConverter(HtmlConverter): ) style_map = kwargs.get("style_map", None) - keep_data_uris = kwargs.get("keep_data_uris", False) return self._html_converter.convert_string( mammoth.convert_to_html(file_stream, style_map=style_map).value, - keep_data_uris=keep_data_uris + keep_data_uris=kwargs.get("keep_data_uris", False) ) diff --git a/packages/markitdown/src/markitdown/converters/_rss_converter.py 
b/packages/markitdown/src/markitdown/converters/_rss_converter.py index 7c80d01..c77e84b 100644 --- a/packages/markitdown/src/markitdown/converters/_rss_converter.py +++ b/packages/markitdown/src/markitdown/converters/_rss_converter.py @@ -28,6 +28,10 @@ CANDIDATE_FILE_EXTENSIONS = [ class RssConverter(DocumentConverter): """Convert RSS / Atom type to markdown""" + def __init__(self): + super().__init__() + self._kwargs = {} + def accepts( self, file_stream: BinaryIO, @@ -82,6 +86,7 @@ class RssConverter(DocumentConverter): stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: + self._kwargs = kwargs doc = minidom.parse(file_stream) feed_type = self._feed_type(doc) @@ -166,7 +171,7 @@ class RssConverter(DocumentConverter): try: # using bs4 because many RSS feeds have HTML-styled content soup = BeautifulSoup(content, "html.parser") - return _CustomMarkdownify().convert_soup(soup) + return _CustomMarkdownify(keep_data_uris=self._kwargs.get("keep_data_uris", False)).convert_soup(soup) except BaseException as _: return content diff --git a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py index 39466c0..fa1dd37 100644 --- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py +++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py @@ -76,11 +76,11 @@ class WikipediaConverter(DocumentConverter): main_title = title_elm.string # Convert the page - webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup( + webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify(keep_data_uris=kwargs.get("keep_data_uris", False)).convert_soup( body_elm ) else: - webpage_text = _CustomMarkdownify().convert_soup(soup) + webpage_text = _CustomMarkdownify(keep_data_uris=kwargs.get("keep_data_uris", False)).convert_soup(soup) return DocumentConverterResult( markdown=webpage_text,