black formatting

This commit is contained in:
Kenny Zhang 2025-03-05 16:54:51 -05:00
parent 30e5189581
commit 8c3dd01f2f
3 changed files with 17 additions and 17 deletions

View file

@@ -20,9 +20,9 @@ class StreamInfo:
mimetype: Optional[str] = None mimetype: Optional[str] = None
extension: Optional[str] = None extension: Optional[str] = None
charset: Optional[str] = None charset: Optional[str] = None
filename: Optional[ filename: Optional[str] = (
str None # From local path, url, or Content-Disposition header
] = None # From local path, url, or Content-Disposition header )
local_path: Optional[str] = None # If read from disk local_path: Optional[str] = None # If read from disk
url: Optional[str] = None # If read from url url: Optional[str] = None # If read from url

View file

@@ -92,7 +92,7 @@ class DocumentIntelligenceConverter(DocumentConverter):
api_version=self.api_version, api_version=self.api_version,
credential=DefaultAzureCredential(), credential=DefaultAzureCredential(),
) )
def accepts( def accepts(
self, self,
file_stream: BinaryIO, file_stream: BinaryIO,

View file

@@ -399,29 +399,29 @@ def test_markitdown_local() -> None:
def test_markitdown_streams() -> None: def test_markitdown_streams() -> None:
markitdown = MarkItDown() markitdown = MarkItDown()
# Test PDF processing # Test PDF processing
with open(os.path.join(TEST_FILES_DIR, "test.pdf"), "rb") as f: with open(os.path.join(TEST_FILES_DIR, "test.pdf"), "rb") as f:
result = markitdown.convert(f, file_extension=".pdf") result = markitdown.convert(f, file_extension=".pdf")
validate_strings(result, PDF_TEST_STRINGS) validate_strings(result, PDF_TEST_STRINGS)
# Test XLSX processing # Test XLSX processing
with open(os.path.join(TEST_FILES_DIR, "test.xlsx"), "rb") as f: with open(os.path.join(TEST_FILES_DIR, "test.xlsx"), "rb") as f:
result = markitdown.convert(f, file_extension=".xlsx") result = markitdown.convert(f, file_extension=".xlsx")
validate_strings(result, XLSX_TEST_STRINGS) validate_strings(result, XLSX_TEST_STRINGS)
# Test XLS processing # Test XLS processing
with open(os.path.join(TEST_FILES_DIR, "test.xls"), "rb") as f: with open(os.path.join(TEST_FILES_DIR, "test.xls"), "rb") as f:
result = markitdown.convert(f, file_extension=".xls") result = markitdown.convert(f, file_extension=".xls")
for test_string in XLS_TEST_STRINGS: for test_string in XLS_TEST_STRINGS:
text_content = result.text_content.replace("\\", "") text_content = result.text_content.replace("\\", "")
assert test_string in text_content assert test_string in text_content
# Test DOCX processing # Test DOCX processing
with open(os.path.join(TEST_FILES_DIR, "test.docx"), "rb") as f: with open(os.path.join(TEST_FILES_DIR, "test.docx"), "rb") as f:
result = markitdown.convert(f, file_extension=".docx") result = markitdown.convert(f, file_extension=".docx")
validate_strings(result, DOCX_TEST_STRINGS) validate_strings(result, DOCX_TEST_STRINGS)
# Test DOCX processing, with comments # Test DOCX processing, with comments
with open(os.path.join(TEST_FILES_DIR, "test_with_comment.docx"), "rb") as f: with open(os.path.join(TEST_FILES_DIR, "test_with_comment.docx"), "rb") as f:
result = markitdown.convert( result = markitdown.convert(
@@ -430,47 +430,47 @@ def test_markitdown_streams() -> None:
style_map="comment-reference => ", style_map="comment-reference => ",
) )
validate_strings(result, DOCX_COMMENT_TEST_STRINGS) validate_strings(result, DOCX_COMMENT_TEST_STRINGS)
# Test DOCX processing, with comments and setting style_map on init # Test DOCX processing, with comments and setting style_map on init
markitdown_with_style_map = MarkItDown(style_map="comment-reference => ") markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
with open(os.path.join(TEST_FILES_DIR, "test_with_comment.docx"), "rb") as f: with open(os.path.join(TEST_FILES_DIR, "test_with_comment.docx"), "rb") as f:
result = markitdown_with_style_map.convert(f, file_extension=".docx") result = markitdown_with_style_map.convert(f, file_extension=".docx")
validate_strings(result, DOCX_COMMENT_TEST_STRINGS) validate_strings(result, DOCX_COMMENT_TEST_STRINGS)
# Test PPTX processing # Test PPTX processing
with open(os.path.join(TEST_FILES_DIR, "test.pptx"), "rb") as f: with open(os.path.join(TEST_FILES_DIR, "test.pptx"), "rb") as f:
result = markitdown.convert(f, file_extension=".pptx") result = markitdown.convert(f, file_extension=".pptx")
validate_strings(result, PPTX_TEST_STRINGS) validate_strings(result, PPTX_TEST_STRINGS)
# Test HTML processing # Test HTML processing
with open(os.path.join(TEST_FILES_DIR, "test_blog.html"), "rb") as f: with open(os.path.join(TEST_FILES_DIR, "test_blog.html"), "rb") as f:
result = markitdown.convert(f, file_extension=".html", url=BLOG_TEST_URL) result = markitdown.convert(f, file_extension=".html", url=BLOG_TEST_URL)
validate_strings(result, BLOG_TEST_STRINGS) validate_strings(result, BLOG_TEST_STRINGS)
# Test Wikipedia processing # Test Wikipedia processing
with open(os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), "rb") as f: with open(os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), "rb") as f:
result = markitdown.convert(f, file_extension=".html", url=WIKIPEDIA_TEST_URL) result = markitdown.convert(f, file_extension=".html", url=WIKIPEDIA_TEST_URL)
text_content = result.text_content.replace("\\", "") text_content = result.text_content.replace("\\", "")
validate_strings(result, WIKIPEDIA_TEST_STRINGS, WIKIPEDIA_TEST_EXCLUDES) validate_strings(result, WIKIPEDIA_TEST_STRINGS, WIKIPEDIA_TEST_EXCLUDES)
# Test Bing processing # Test Bing processing
with open(os.path.join(TEST_FILES_DIR, "test_serp.html"), "rb") as f: with open(os.path.join(TEST_FILES_DIR, "test_serp.html"), "rb") as f:
result = markitdown.convert(f, file_extension=".html", url=SERP_TEST_URL) result = markitdown.convert(f, file_extension=".html", url=SERP_TEST_URL)
text_content = result.text_content.replace("\\", "") text_content = result.text_content.replace("\\", "")
validate_strings(result, SERP_TEST_STRINGS, SERP_TEST_EXCLUDES) validate_strings(result, SERP_TEST_STRINGS, SERP_TEST_EXCLUDES)
# Test RSS processing # Test RSS processing
with open(os.path.join(TEST_FILES_DIR, "test_rss.xml"), "rb") as f: with open(os.path.join(TEST_FILES_DIR, "test_rss.xml"), "rb") as f:
result = markitdown.convert(f, file_extension=".xml") result = markitdown.convert(f, file_extension=".xml")
text_content = result.text_content.replace("\\", "") text_content = result.text_content.replace("\\", "")
for test_string in RSS_TEST_STRINGS: for test_string in RSS_TEST_STRINGS:
assert test_string in text_content assert test_string in text_content
# Test MSG (Outlook email) processing # Test MSG (Outlook email) processing
with open(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg"), "rb") as f: with open(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg"), "rb") as f:
result = markitdown.convert(f, file_extension=".msg") result = markitdown.convert(f, file_extension=".msg")
validate_strings(result, MSG_TEST_STRINGS) validate_strings(result, MSG_TEST_STRINGS)
# Test JSON processing # Test JSON processing
with open(os.path.join(TEST_FILES_DIR, "test.json"), "rb") as f: with open(os.path.join(TEST_FILES_DIR, "test.json"), "rb") as f:
result = markitdown.convert(f, file_extension=".json") result = markitdown.convert(f, file_extension=".json")