Fix markitdown.convert_stream handling of leading blanks

Fixes #222

Address issue with `markitdown.convert_stream` crashing on input with leading blank characters or line breaks.

* Modify `convert_stream` function in `src/markitdown/_markitdown.py` to strip leading blank characters or line breaks from the input stream using a new helper function `_strip_leading_blanks`.
* Add a test case in `tests/test_markitdown.py` to verify that `markitdown.convert_stream` handles input with leading blank characters or line breaks correctly.

---

For more details, open the [Copilot Workspace session](https://copilot-workspace.githubnext.com/microsoft/markitdown/issues/222?shareId=XXXX-XXXX-XXXX-XXXX).
This commit is contained in:
Will 保哥 2024-12-28 01:19:07 +08:00
parent 125e206047
commit bf72b4a38d
2 changed files with 15 additions and 1 deletions

View file

@ -1344,7 +1344,7 @@ class MarkItDown:
result = None
try:
# Write to the temporary file
content = stream.read()
content = self._strip_leading_blanks(stream.read())
if isinstance(content, str):
fh.write(content.encode("utf-8"))
else:
@ -1367,6 +1367,10 @@ class MarkItDown:
return result
def _strip_leading_blanks(self, content: bytes) -> bytes:
"""Helper function to strip leading blank characters or line breaks from content."""
return content.lstrip()
def convert_url(
self, url: str, **kwargs: Any
) -> DocumentConverterResult: # TODO: fix kwargs type

View file

@ -300,6 +300,15 @@ def test_markitdown_llm() -> None:
assert test_string in result.text_content.lower()
def test_markitdown_strip_leading_blanks() -> None:
markitdown = MarkItDown()
# Test input with leading blank characters
input_data = b" \n\n\n<html><body><h1>Test</h1></body></html>"
result = markitdown.convert_stream(io.BytesIO(input_data), file_extension=".html")
assert "<h1>Test</h1>" in result.text_content
if __name__ == "__main__":
"""Runs this file's tests from the command line."""
test_markitdown_remote()
@ -307,3 +316,4 @@ if __name__ == "__main__":
test_markitdown_exiftool()
test_markitdown_deprecation()
test_markitdown_llm()
test_markitdown_strip_leading_blanks()