Add CSS selector flag to HTML converter

- Introduce `-s`/`--selector` flag in CLI
- Pass selector through to HtmlConverter
- Scope BeautifulSoup parsing to selected nodes only
- Raise ValueError on no matches
This commit is contained in:
AnupamKumar-1 2025-05-01 03:19:32 +05:30
parent 041be54471
commit e687320b8a
3 changed files with 108 additions and 7 deletions

View file

@ -42,8 +42,8 @@ def main():
OR
markitdown example.pdf > example.md
"""
).strip(),
""".strip(),
),
)
parser.add_argument(
@ -78,6 +78,15 @@ def main():
help="Provide a hint about the file's charset (e.g, UTF-8).",
)
# New CSS selector flag
parser.add_argument(
"-s",
"--selector",
metavar="CSS",
help="Only convert HTML nodes matching this CSS selector (e.g. 'article.main').",
default=None,
)
parser.add_argument(
"-d",
"--use-docintel",
@ -187,22 +196,27 @@ def main():
else:
markitdown = MarkItDown(enable_plugins=args.use_plugins)
# Pass selector through to conversion
if args.filename is None:
result = markitdown.convert_stream(
sys.stdin.buffer,
stream_info=stream_info,
keep_data_uris=args.keep_data_uris,
selector=args.selector,
)
else:
result = markitdown.convert(
args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris
args.filename,
stream_info=stream_info,
keep_data_uris=args.keep_data_uris,
selector=args.selector,
)
_handle_output(args, result)
def _handle_output(args, result: DocumentConverterResult):
"""Handle output to stdout or file"""
def _handle_output(args, result: DocumentConverterResult):
"""Handle output to stdout or file"""
if args.output:
with open(args.output, "w", encoding="utf-8") as f:
f.write(result.markdown)

View file

@ -48,13 +48,24 @@ class HtmlConverter(DocumentConverter):
encoding = "utf-8" if stream_info.charset is None else stream_info.charset
soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
# ——— CSS selector scoping ———
selector = kwargs.pop("selector", None)
if selector:
nodes = soup.select(selector)
if not nodes:
raise ValueError(f"No elements match selector: {selector}")
scoped = BeautifulSoup("", "html.parser")
for node in nodes:
scoped.append(node)
soup = scoped
# ——————————————————————————
# Remove javascript and style blocks
for script in soup(["script", "style"]):
script.extract()
# Print only the main content
body_elm = soup.find("body")
webpage_text = ""
if body_elm:
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
else:
@ -71,7 +82,7 @@ class HtmlConverter(DocumentConverter):
)
def convert_string(
self, html_content: str, *, url: Optional[str] = None, **kwargs
self, html_content: str, *, url: Optional[str] = None, **kwargs: Any
) -> DocumentConverterResult:
"""
Non-standard convenience method to convert a string to markdown.

View file

@ -0,0 +1,76 @@
import io
import pytest
from markitdown.converters._html_converter import HtmlConverter
from markitdown._stream_info import StreamInfo
# Sample HTML fixture shared by the selector tests below.
# The <article class="entry"> element is the node the tests select; the
# <header> and <footer> siblings carry marker text ("Skip Me" / "Also Skip")
# that the tests check for to tell scoped output from full-document output.
SAMPLE_HTML = """
<html>
<body>
<header>Skip Me</header>
<article class="entry">
<h1>Title</h1>
<p>Body text.</p>
</article>
<footer>Also Skip</footer>
</body>
</html>
"""
def test_selector_extracts_only_matching_nodes():
    """Converting with ``selector="article.entry"`` keeps only the article text."""
    markdown = (
        HtmlConverter()
        .convert_string(SAMPLE_HTML, selector="article.entry")
        .markdown
    )

    # Emit the converted markdown so it can be inspected manually.
    print("\n--- Extracted Markdown (test_selector_extracts_only_matching_nodes) ---\n")
    print(markdown)

    # Text inside the selected <article> must survive the conversion ...
    for kept in ("Title", "Body text."):
        assert kept in markdown
    # ... while text from the sibling <header>/<footer> must be absent.
    for dropped in ("Skip Me", "Also Skip"):
        assert dropped not in markdown
def test_selector_no_match_raises():
    """A selector that matches no element must raise the selector ValueError.

    The ``match`` pattern pins the error to the "No elements match selector"
    message so that an unrelated ValueError raised elsewhere during
    conversion cannot make this test pass by accident.
    """
    converter = HtmlConverter()
    with pytest.raises(ValueError, match="No elements match selector"):
        converter.convert_string(SAMPLE_HTML, selector=".does-not-exist")
def test_no_selector_returns_full_content():
    """Without a selector, header, article and footer text all appear."""
    markdown = HtmlConverter().convert_string(SAMPLE_HTML).markdown

    # Emit the converted markdown so it can be inspected manually.
    print("\n--- Extracted Markdown (test_no_selector_returns_full_content) ---\n")
    print(markdown)

    # Every piece of visible text in the fixture should be present.
    for fragment in ("Skip Me", "Title", "Body text.", "Also Skip"):
        assert fragment in markdown
def test_convert_method_with_stream_and_selector():
    """``convert()`` on a byte stream honours the ``selector`` keyword."""
    # Feed the fixture as UTF-8 bytes through an in-memory binary stream.
    source = io.BytesIO(SAMPLE_HTML.encode("utf-8"))
    info = StreamInfo(
        mimetype="text/html",
        extension=".html",
        charset="utf-8",
        url=None,
    )

    # Call convert() directly (not the string convenience wrapper).
    markdown = HtmlConverter().convert(source, info, selector="article.entry").markdown

    # Emit the converted markdown so it can be inspected manually.
    print("\n--- Extracted Markdown (test_convert_method_with_stream_and_selector) ---\n")
    print(markdown)

    # Only the selected <article> content should be present.
    for kept in ("Title", "Body text."):
        assert kept in markdown
    for dropped in ("Skip Me", "Also Skip"):
        assert dropped not in markdown