Fix markitdown.convert_stream handling of leading blanks
Fixes #222.

Address an issue where `markitdown.convert_stream` crashes on input that begins with blank characters or line breaks.

* Modify the `convert_stream` method in `src/markitdown/_markitdown.py` to strip leading blank characters and line breaks from the input stream's content using a new helper method, `_strip_leading_blanks`.
* Add a test case in `tests/test_markitdown.py` to verify that `markitdown.convert_stream` correctly handles input with leading blank characters or line breaks.

---

For more details, open the [Copilot Workspace session](https://copilot-workspace.githubnext.com/microsoft/markitdown/issues/222?shareId=XXXX-XXXX-XXXX-XXXX).
This commit is contained in:
parent
125e206047
commit
bf72b4a38d
2 changed files with 15 additions and 1 deletion
|
|
@ -1344,7 +1344,7 @@ class MarkItDown:
|
||||||
result = None
|
result = None
|
||||||
try:
|
try:
|
||||||
# Write to the temporary file
|
# Write to the temporary file
|
||||||
content = stream.read()
|
content = self._strip_leading_blanks(stream.read())
|
||||||
if isinstance(content, str):
|
if isinstance(content, str):
|
||||||
fh.write(content.encode("utf-8"))
|
fh.write(content.encode("utf-8"))
|
||||||
else:
|
else:
|
||||||
|
|
@ -1367,6 +1367,10 @@ class MarkItDown:
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
def _strip_leading_blanks(self, content: bytes) -> bytes:
|
||||||
|
"""Helper function to strip leading blank characters or line breaks from content."""
|
||||||
|
return content.lstrip()
|
||||||
|
|
||||||
def convert_url(
|
def convert_url(
|
||||||
self, url: str, **kwargs: Any
|
self, url: str, **kwargs: Any
|
||||||
) -> DocumentConverterResult: # TODO: fix kwargs type
|
) -> DocumentConverterResult: # TODO: fix kwargs type
|
||||||
|
|
|
||||||
|
|
@ -300,6 +300,15 @@ def test_markitdown_llm() -> None:
|
||||||
assert test_string in result.text_content.lower()
|
assert test_string in result.text_content.lower()
|
||||||
|
|
||||||
|
|
||||||
|
def test_markitdown_strip_leading_blanks() -> None:
|
||||||
|
markitdown = MarkItDown()
|
||||||
|
|
||||||
|
# Test input with leading blank characters
|
||||||
|
input_data = b" \n\n\n<html><body><h1>Test</h1></body></html>"
|
||||||
|
result = markitdown.convert_stream(io.BytesIO(input_data), file_extension=".html")
|
||||||
|
assert "<h1>Test</h1>" in result.text_content
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
"""Runs this file's tests from the command line."""
|
"""Runs this file's tests from the command line."""
|
||||||
test_markitdown_remote()
|
test_markitdown_remote()
|
||||||
|
|
@ -307,3 +316,4 @@ if __name__ == "__main__":
|
||||||
test_markitdown_exiftool()
|
test_markitdown_exiftool()
|
||||||
test_markitdown_deprecation()
|
test_markitdown_deprecation()
|
||||||
test_markitdown_llm()
|
test_markitdown_llm()
|
||||||
|
test_markitdown_strip_leading_blanks()
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue