From e687320b8a379277dc8c7801234994ae498a54f2 Mon Sep 17 00:00:00 2001 From: AnupamKumar-1 Date: Thu, 1 May 2025 03:19:32 +0530 Subject: [PATCH] Add CSS selector flag to HTML converter - Introduce -s/--selector in CLI - Pass selector through to HtmlConverter - Scope BeautifulSoup parsing to selected nodes only - Raise ValueError on no matches --- .../markitdown/src/markitdown/__main__.py | 24 ++++-- .../markitdown/converters/_html_converter.py | 15 +++- .../markitdown/tests/test_html_selector.py | 76 +++++++++++++++++++ 3 files changed, 108 insertions(+), 7 deletions(-) create mode 100644 packages/markitdown/tests/test_html_selector.py diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index cfb1c6e..2474558 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -42,8 +42,8 @@ def main(): OR markitdown example.pdf > example.md - """ - ).strip(), + """.strip(), + ), ) parser.add_argument( @@ -78,6 +78,15 @@ def main(): help="Provide a hint about the file's charset (e.g, UTF-8).", ) + # New CSS selector flag + parser.add_argument( + "-s", + "--selector", + metavar="CSS", + help="Only convert HTML nodes matching this CSS selector (e.g. 
'article.main').", + default=None, + ) + parser.add_argument( "-d", "--use-docintel", @@ -187,22 +196,27 @@ def main(): else: markitdown = MarkItDown(enable_plugins=args.use_plugins) + # Pass selector through to conversion if args.filename is None: result = markitdown.convert_stream( sys.stdin.buffer, stream_info=stream_info, keep_data_uris=args.keep_data_uris, + selector=args.selector, ) else: result = markitdown.convert( - args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris + args.filename, + stream_info=stream_info, + keep_data_uris=args.keep_data_uris, + selector=args.selector, ) _handle_output(args, result) + def _handle_output(args, result: DocumentConverterResult): + """Handle output to stdout or file""" -def _handle_output(args, result: DocumentConverterResult): - """Handle output to stdout or file""" if args.output: with open(args.output, "w", encoding="utf-8") as f: f.write(result.markdown) diff --git a/packages/markitdown/src/markitdown/converters/_html_converter.py b/packages/markitdown/src/markitdown/converters/_html_converter.py index dabb0d7..007f8f8 100644 --- a/packages/markitdown/src/markitdown/converters/_html_converter.py +++ b/packages/markitdown/src/markitdown/converters/_html_converter.py @@ -48,13 +48,24 @@ class HtmlConverter(DocumentConverter): encoding = "utf-8" if stream_info.charset is None else stream_info.charset soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding) + # ——— CSS selector scoping ——— + selector = kwargs.pop("selector", None) + if selector: + nodes = soup.select(selector) + if not nodes: + raise ValueError(f"No elements match selector: {selector}") + scoped = BeautifulSoup("", "html.parser") + for node in nodes: + scoped.append(node) + soup = scoped + # —————————————————————————— + # Remove javascript and style blocks for script in soup(["script", "style"]): script.extract() # Print only the main content body_elm = soup.find("body") - webpage_text = "" if body_elm: webpage_text = 
_CustomMarkdownify(**kwargs).convert_soup(body_elm) else: @@ -71,7 +82,7 @@ class HtmlConverter(DocumentConverter): ) def convert_string( - self, html_content: str, *, url: Optional[str] = None, **kwargs + self, html_content: str, *, url: Optional[str] = None, **kwargs: Any ) -> DocumentConverterResult: """ Non-standard convenience method to convert a string to markdown. diff --git a/packages/markitdown/tests/test_html_selector.py b/packages/markitdown/tests/test_html_selector.py new file mode 100644 index 0000000..4027148 --- /dev/null +++ b/packages/markitdown/tests/test_html_selector.py @@ -0,0 +1,76 @@ +import io +import pytest +from markitdown.converters._html_converter import HtmlConverter +from markitdown._stream_info import StreamInfo + +# Sample HTML to test selector scoping +SAMPLE_HTML = """ + + +
Skip Me
+
+

Title

+

Body text.

+
+ + + +""" + + +def test_selector_extracts_only_matching_nodes(): + converter = HtmlConverter() + # Use the convenience method to convert a string with selector + result = converter.convert_string(SAMPLE_HTML, selector="article.entry") + md = result.markdown + # Print the markdown for inspection + print("\n--- Extracted Markdown (test_selector_extracts_only_matching_nodes) ---\n") + print(md) + # Only the article content should appear + assert "Title" in md + assert "Body text." in md + assert "Skip Me" not in md + assert "Also Skip" not in md + + +def test_selector_no_match_raises(): + converter = HtmlConverter() + # Non-existing selector should raise a ValueError + with pytest.raises(ValueError): + converter.convert_string(SAMPLE_HTML, selector=".does-not-exist") + + +def test_no_selector_returns_full_content(): + converter = HtmlConverter() + # Without selector, header and footer should remain + result = converter.convert_string(SAMPLE_HTML) + md = result.markdown + # Print the markdown for inspection + print("\n--- Extracted Markdown (test_no_selector_returns_full_content) ---\n") + print(md) + assert "Skip Me" in md + assert "Title" in md + assert "Body text." in md + assert "Also Skip" in md + + +def test_convert_method_with_stream_and_selector(): + converter = HtmlConverter() + html_bytes = SAMPLE_HTML.encode("utf-8") + stream = io.BytesIO(html_bytes) + stream_info = StreamInfo( + mimetype="text/html", + extension=".html", + charset="utf-8", + url=None, + ) + # Directly call convert(), passing selector + result = converter.convert(stream, stream_info, selector="article.entry") + md = result.markdown + # Print the markdown for inspection + print("\n--- Extracted Markdown (test_convert_method_with_stream_and_selector) ---\n") + print(md) + assert "Title" in md + assert "Body text." in md + assert "Skip Me" not in md + assert "Also Skip" not in md