Add CSS selector flag to HTML converter
- Introduce `-s`/`--selector` flag in CLI - Pass selector through to HtmlConverter - Scope HTML conversion to the nodes matching the selector - Raise ValueError when no nodes match
This commit is contained in:
parent
041be54471
commit
e687320b8a
3 changed files with 108 additions and 7 deletions
|
|
@ -42,8 +42,8 @@ def main():
|
||||||
OR
|
OR
|
||||||
|
|
||||||
markitdown example.pdf > example.md
|
markitdown example.pdf > example.md
|
||||||
"""
|
""".strip(),
|
||||||
).strip(),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
|
|
@ -78,6 +78,15 @@ def main():
|
||||||
help="Provide a hint about the file's charset (e.g, UTF-8).",
|
help="Provide a hint about the file's charset (e.g, UTF-8).",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# New CSS selector flag
|
||||||
|
parser.add_argument(
|
||||||
|
"-s",
|
||||||
|
"--selector",
|
||||||
|
metavar="CSS",
|
||||||
|
help="Only convert HTML nodes matching this CSS selector (e.g. 'article.main').",
|
||||||
|
default=None,
|
||||||
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-d",
|
"-d",
|
||||||
"--use-docintel",
|
"--use-docintel",
|
||||||
|
|
@ -187,22 +196,27 @@ def main():
|
||||||
else:
|
else:
|
||||||
markitdown = MarkItDown(enable_plugins=args.use_plugins)
|
markitdown = MarkItDown(enable_plugins=args.use_plugins)
|
||||||
|
|
||||||
|
# Pass selector through to conversion
|
||||||
if args.filename is None:
|
if args.filename is None:
|
||||||
result = markitdown.convert_stream(
|
result = markitdown.convert_stream(
|
||||||
sys.stdin.buffer,
|
sys.stdin.buffer,
|
||||||
stream_info=stream_info,
|
stream_info=stream_info,
|
||||||
keep_data_uris=args.keep_data_uris,
|
keep_data_uris=args.keep_data_uris,
|
||||||
|
selector=args.selector,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
result = markitdown.convert(
|
result = markitdown.convert(
|
||||||
args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris
|
args.filename,
|
||||||
|
stream_info=stream_info,
|
||||||
|
keep_data_uris=args.keep_data_uris,
|
||||||
|
selector=args.selector,
|
||||||
)
|
)
|
||||||
|
|
||||||
_handle_output(args, result)
|
_handle_output(args, result)
|
||||||
|
|
||||||
|
def _handle_output(args, result: DocumentConverterResult):
|
||||||
def _handle_output(args, result: DocumentConverterResult):
|
|
||||||
"""Handle output to stdout or file"""
|
"""Handle output to stdout or file"""
|
||||||
|
|
||||||
if args.output:
|
if args.output:
|
||||||
with open(args.output, "w", encoding="utf-8") as f:
|
with open(args.output, "w", encoding="utf-8") as f:
|
||||||
f.write(result.markdown)
|
f.write(result.markdown)
|
||||||
|
|
|
||||||
|
|
@ -48,13 +48,24 @@ class HtmlConverter(DocumentConverter):
|
||||||
encoding = "utf-8" if stream_info.charset is None else stream_info.charset
|
encoding = "utf-8" if stream_info.charset is None else stream_info.charset
|
||||||
soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
|
soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
|
||||||
|
|
||||||
|
# ——— CSS selector scoping ———
|
||||||
|
selector = kwargs.pop("selector", None)
|
||||||
|
if selector:
|
||||||
|
nodes = soup.select(selector)
|
||||||
|
if not nodes:
|
||||||
|
raise ValueError(f"No elements match selector: {selector}")
|
||||||
|
scoped = BeautifulSoup("", "html.parser")
|
||||||
|
for node in nodes:
|
||||||
|
scoped.append(node)
|
||||||
|
soup = scoped
|
||||||
|
# ——————————————————————————
|
||||||
|
|
||||||
# Remove javascript and style blocks
|
# Remove javascript and style blocks
|
||||||
for script in soup(["script", "style"]):
|
for script in soup(["script", "style"]):
|
||||||
script.extract()
|
script.extract()
|
||||||
|
|
||||||
# Print only the main content
|
# Print only the main content
|
||||||
body_elm = soup.find("body")
|
body_elm = soup.find("body")
|
||||||
webpage_text = ""
|
|
||||||
if body_elm:
|
if body_elm:
|
||||||
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
|
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
|
||||||
else:
|
else:
|
||||||
|
|
@ -71,7 +82,7 @@ class HtmlConverter(DocumentConverter):
|
||||||
)
|
)
|
||||||
|
|
||||||
def convert_string(
|
def convert_string(
|
||||||
self, html_content: str, *, url: Optional[str] = None, **kwargs
|
self, html_content: str, *, url: Optional[str] = None, **kwargs: Any
|
||||||
) -> DocumentConverterResult:
|
) -> DocumentConverterResult:
|
||||||
"""
|
"""
|
||||||
Non-standard convenience method to convert a string to markdown.
|
Non-standard convenience method to convert a string to markdown.
|
||||||
|
|
|
||||||
76
packages/markitdown/tests/test_html_selector.py
Normal file
76
packages/markitdown/tests/test_html_selector.py
Normal file
|
|
@ -0,0 +1,76 @@
|
||||||
|
import io
|
||||||
|
import pytest
|
||||||
|
from markitdown.converters._html_converter import HtmlConverter
|
||||||
|
from markitdown._stream_info import StreamInfo
|
||||||
|
|
||||||
|
# Sample HTML to test selector scoping
|
||||||
|
SAMPLE_HTML = """
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<header>Skip Me</header>
|
||||||
|
<article class="entry">
|
||||||
|
<h1>Title</h1>
|
||||||
|
<p>Body text.</p>
|
||||||
|
</article>
|
||||||
|
<footer>Also Skip</footer>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def test_selector_extracts_only_matching_nodes():
|
||||||
|
converter = HtmlConverter()
|
||||||
|
# Use the convenience method to convert a string with selector
|
||||||
|
result = converter.convert_string(SAMPLE_HTML, selector="article.entry")
|
||||||
|
md = result.markdown
|
||||||
|
# Print the markdown for inspection
|
||||||
|
print("\n--- Extracted Markdown (test_selector_extracts_only_matching_nodes) ---\n")
|
||||||
|
print(md)
|
||||||
|
# Only the article content should appear
|
||||||
|
assert "Title" in md
|
||||||
|
assert "Body text." in md
|
||||||
|
assert "Skip Me" not in md
|
||||||
|
assert "Also Skip" not in md
|
||||||
|
|
||||||
|
|
||||||
|
def test_selector_no_match_raises():
|
||||||
|
converter = HtmlConverter()
|
||||||
|
# Non-existing selector should raise a ValueError
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
converter.convert_string(SAMPLE_HTML, selector=".does-not-exist")
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_selector_returns_full_content():
|
||||||
|
converter = HtmlConverter()
|
||||||
|
# Without selector, header and footer should remain
|
||||||
|
result = converter.convert_string(SAMPLE_HTML)
|
||||||
|
md = result.markdown
|
||||||
|
# Print the markdown for inspection
|
||||||
|
print("\n--- Extracted Markdown (test_no_selector_returns_full_content) ---\n")
|
||||||
|
print(md)
|
||||||
|
assert "Skip Me" in md
|
||||||
|
assert "Title" in md
|
||||||
|
assert "Body text." in md
|
||||||
|
assert "Also Skip" in md
|
||||||
|
|
||||||
|
|
||||||
|
def test_convert_method_with_stream_and_selector():
|
||||||
|
converter = HtmlConverter()
|
||||||
|
html_bytes = SAMPLE_HTML.encode("utf-8")
|
||||||
|
stream = io.BytesIO(html_bytes)
|
||||||
|
stream_info = StreamInfo(
|
||||||
|
mimetype="text/html",
|
||||||
|
extension=".html",
|
||||||
|
charset="utf-8",
|
||||||
|
url=None,
|
||||||
|
)
|
||||||
|
# Directly call convert(), passing selector
|
||||||
|
result = converter.convert(stream, stream_info, selector="article.entry")
|
||||||
|
md = result.markdown
|
||||||
|
# Print the markdown for inspection
|
||||||
|
print("\n--- Extracted Markdown (test_convert_method_with_stream_and_selector) ---\n")
|
||||||
|
print(md)
|
||||||
|
assert "Title" in md
|
||||||
|
assert "Body text." in md
|
||||||
|
assert "Skip Me" not in md
|
||||||
|
assert "Also Skip" not in md
|
||||||
Loading…
Reference in a new issue