Fix markitdown.convert_stream handling of leading blanks
Fixes #222 Address issue with `markitdown.convert_stream` crashing on input with leading blank characters or line breaks. * Modify `convert_stream` function in `src/markitdown/_markitdown.py` to strip leading blank characters or line breaks from the input stream using a new helper function `_strip_leading_blanks`. * Add a test case in `tests/test_markitdown.py` to verify that `markitdown.convert_stream` handles input with leading blank characters or line breaks correctly. --- For more details, open the [Copilot Workspace session](https://copilot-workspace.githubnext.com/microsoft/markitdown/issues/222?shareId=XXXX-XXXX-XXXX-XXXX).
This commit is contained in:
parent
125e206047
commit
bf72b4a38d
2 changed files with 15 additions and 1 deletions
|
|
@ -1344,7 +1344,7 @@ class MarkItDown:
|
|||
result = None
|
||||
try:
|
||||
# Write to the temporary file
|
||||
content = stream.read()
|
||||
content = self._strip_leading_blanks(stream.read())
|
||||
if isinstance(content, str):
|
||||
fh.write(content.encode("utf-8"))
|
||||
else:
|
||||
|
|
@ -1367,6 +1367,10 @@ class MarkItDown:
|
|||
|
||||
return result
|
||||
|
||||
def _strip_leading_blanks(self, content: bytes) -> bytes:
|
||||
"""Helper function to strip leading blank characters or line breaks from content."""
|
||||
return content.lstrip()
|
||||
|
||||
def convert_url(
|
||||
self, url: str, **kwargs: Any
|
||||
) -> DocumentConverterResult: # TODO: fix kwargs type
|
||||
|
|
|
|||
|
|
@ -300,6 +300,15 @@ def test_markitdown_llm() -> None:
|
|||
assert test_string in result.text_content.lower()
|
||||
|
||||
|
||||
def test_markitdown_strip_leading_blanks() -> None:
|
||||
markitdown = MarkItDown()
|
||||
|
||||
# Test input with leading blank characters
|
||||
input_data = b" \n\n\n<html><body><h1>Test</h1></body></html>"
|
||||
result = markitdown.convert_stream(io.BytesIO(input_data), file_extension=".html")
|
||||
assert "<h1>Test</h1>" in result.text_content
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
"""Runs this file's tests from the command line."""
|
||||
test_markitdown_remote()
|
||||
|
|
@ -307,3 +316,4 @@ if __name__ == "__main__":
|
|||
test_markitdown_exiftool()
|
||||
test_markitdown_deprecation()
|
||||
test_markitdown_llm()
|
||||
test_markitdown_strip_leading_blanks()
|
||||
|
|
|
|||
Loading…
Reference in a new issue