Experimenting with new signatures.

This commit is contained in:
Adam Fourney 2025-03-03 23:01:16 -08:00
parent e43632b048
commit 7bc6d827ee
5 changed files with 519 additions and 221 deletions

View file

@ -5,6 +5,7 @@
from .__about__ import __version__ from .__about__ import __version__
from ._markitdown import MarkItDown from ._markitdown import MarkItDown
from ._base_converter import DocumentConverterResult, BaseDocumentConverter from ._base_converter import DocumentConverterResult, BaseDocumentConverter
from ._stream_info import StreamInfo
from ._exceptions import ( from ._exceptions import (
MarkItDownException, MarkItDownException,
MissingDependencyException, MissingDependencyException,
@ -25,4 +26,5 @@ __all__ = [
"FailedConversionAttempt", "FailedConversionAttempt",
"FileConversionException", "FileConversionException",
"UnsupportedFormatException", "UnsupportedFormatException",
"StreamInfo",
] ]

View file

@ -1,3 +1,4 @@
from ._stream_info import StreamInfo
from typing import Any, Union, BinaryIO, Optional from typing import Any, Union, BinaryIO, Optional
@ -13,6 +14,9 @@ class DocumentConverterResult:
""" """
Initialize the DocumentConverterResult. Initialize the DocumentConverterResult.
The only required parameter is the converted Markdown text.
The title, and any other metadata that may be added in the future, are optional.
Parameters: Parameters:
- markdown: The converted Markdown text. - markdown: The converted Markdown text.
- title: Optional title of the document. - title: Optional title of the document.
@ -72,27 +76,25 @@ class BaseDocumentConverter:
def convert( def convert(
self, self,
file_stream, file_stream: BinaryIO,
*, stream_info: StreamInfo,
mime_type: str = "application/octet-stream", **kwargs: Any, # Options to pass to the converter
file_extension: Optional[str] = None,
charset: Optional[str] = None,
**kwargs: Any,
) -> Union[None, DocumentConverterResult]: ) -> Union[None, DocumentConverterResult]:
""" """
Convert a document to Markdown text, or return None if the converter Convert a document to Markdown text, or return None if the converter
cannot handle the document (causing the next converter to be tried). cannot handle the document (causing the next converter to be tried).
The determination of whether a converter can handle a document is primarily based on The determination of whether a converter can handle a document is primarily based on
the provided MIME type. The file extension can serve as a secondary check if the the provided `stream_info.mimetype`. The field `stream_info.extension` can serve as
MIME type is not sufficiently specific (e.g., application/octet-stream). Finally, the a secondary check if the MIME type is not sufficiently specific
charset is used to determine the encoding of the file content in cases of text/* (e.g., application/octet-stream). In the case of data retrieved via HTTP, the
`stream_info.url` might also be referenced to guide conversion (e.g., special-handling
for Wikipedia). Finally, the `stream_info.charset` is used to determine the encoding
of the file content in cases of text/*
Parameters: Parameters:
- file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods. - file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.
- mime_type: The MIME type of the file. Default is "application/octet-stream". - stream_info: The StreamInfo object containing metadata about the file (mimetype, extension, charset, set)
- file_extension: The file extension of the file. Default is None.
- charset: The character set of the file. Default is None.
- kwargs: Additional keyword arguments for the converter. - kwargs: Additional keyword arguments for the converter.
Returns: Returns:

View file

@ -6,8 +6,9 @@ import sys
import tempfile import tempfile
import warnings import warnings
import traceback import traceback
import io
from importlib.metadata import entry_points from importlib.metadata import entry_points
from typing import Any, List, Optional, Union from typing import Any, List, Optional, Union, BinaryIO
from pathlib import Path from pathlib import Path
from urllib.parse import urlparse from urllib.parse import urlparse
from warnings import warn from warnings import warn
@ -16,6 +17,8 @@ from warnings import warn
import puremagic import puremagic
import requests import requests
from ._stream_info import StreamInfo
from .converters import ( from .converters import (
DocumentConverter, DocumentConverter,
PlainTextConverter, PlainTextConverter,
@ -175,12 +178,17 @@ class MarkItDown:
warn("Plugins converters are already enabled.", RuntimeWarning) warn("Plugins converters are already enabled.", RuntimeWarning)
def convert( def convert(
self, source: Union[str, requests.Response, Path], **kwargs: Any self,
source: Union[str, requests.Response, Path, BinaryIO],
*,
stream_info: Optional[StreamInfo] = None,
**kwargs: Any,
) -> DocumentConverterResult: # TODO: deal with kwargs ) -> DocumentConverterResult: # TODO: deal with kwargs
""" """
Args: Args:
- source: can be a string representing a path either as string pathlib path object or url, or a requests.response object - source: can be a path (str or Path), url, or a requests.response object
- extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.) - stream_info: optional stream info to use for the conversion. If None, infer from source
- kwargs: additional arguments to pass to the converter
""" """
# Local path or url # Local path or url
@ -192,68 +200,112 @@ class MarkItDown:
): ):
return self.convert_url(source, **kwargs) return self.convert_url(source, **kwargs)
else: else:
return self.convert_local(source, **kwargs) return self.convert_local(source, stream_info=stream_info, **kwargs)
# Path object
elif isinstance(source, Path):
return self.convert_local(source, stream_info=stream_info, **kwargs)
# Request response # Request response
elif isinstance(source, requests.Response): elif isinstance(source, requests.Response):
return self.convert_response(source, **kwargs) return self.convert_response(source, **kwargs)
elif isinstance(source, Path): # Binary stream
return self.convert_local(source, **kwargs) elif (
hasattr(source, "read")
and callable(source.read)
and not isinstance(source, io.TextIOBase)
):
return self.convert_stream(source, **kwargs)
else:
raise TypeError(
f"Invalid source type: {type(source)}. Expected str, requests.Response, BinaryIO."
)
def convert_local( def convert_local(
self, path: Union[str, Path], **kwargs: Any self,
) -> DocumentConverterResult: # TODO: deal with kwargs path: Union[str, Path],
*,
stream_info: Optional[StreamInfo] = None,
file_extension: Optional[str] = None, # Deprecated -- use stream_info
url: Optional[str] = None, # Deprecated -- use stream_info
**kwargs: Any,
) -> DocumentConverterResult:
if isinstance(path, Path): if isinstance(path, Path):
path = str(path) path = str(path)
# Prepare a list of extensions to try (in order of priority)
ext = kwargs.get("file_extension")
extensions = [ext] if ext is not None else []
# Get extension alternatives from the path and puremagic # Build a base StreamInfo object from which to start guesses
base, ext = os.path.splitext(path) base_stream_info = StreamInfo(
self._append_ext(extensions, ext) local_path=path,
extension=os.path.splitext(path)[1],
filename=os.path.basename(path),
)
for g in self._guess_ext_magic(path): # Extend the base_stream_info with any additional info from the arguments
self._append_ext(extensions, g) if stream_info is not None:
base_stream_info = base_stream_info.copy_and_update(stream_info)
# Convert if file_extension is not None:
return self._convert(path, extensions, **kwargs) # Deprecated -- use stream_info
base_stream_info = base_stream_info.copy_and_update(
extension=file_extension
)
if url is not None:
# Deprecated -- use stream_info
base_stream_info = base_stream_info.copy_and_update(url=url)
with open(path, "rb") as fh:
# Prepare a list of configurations to try, starting with the base_stream_info
guesses: List[StreamInfo] = [base_stream_info]
for guess in StreamInfo.guess_from_stream(
file_stream=fh, filename_hint=path
):
guesses.append(base_stream_info.copy_and_update(guess))
return self._convert(file_stream=fh, stream_info_guesses=guesses, **kwargs)
# TODO what should stream's type be?
def convert_stream( def convert_stream(
self, stream: Any, **kwargs: Any self,
) -> DocumentConverterResult: # TODO: deal with kwargs stream: BinaryIO,
# Prepare a list of extensions to try (in order of priority) *,
ext = kwargs.get("file_extension") stream_info: Optional[StreamInfo] = None,
extensions = [ext] if ext is not None else [] file_extension: Optional[str] = None, # Deprecated -- use stream_info
url: Optional[str] = None, # Deprecated -- use stream_info
**kwargs: Any,
) -> DocumentConverterResult:
guesses: List[StreamInfo] = []
# Save the file locally to a temporary file. It will be deleted before this method exits # Do we have anything on which to base a guess?
handle, temp_path = tempfile.mkstemp() base_guess = None
fh = os.fdopen(handle, "wb") if stream_info is not None or file_extension is not None or url is not None:
result = None base_guess = stream_info if stream_info is not None else StreamInfo()
try: if file_extension is not None:
# Write to the temporary file # Deprecated -- use stream_info
content = stream.read() base_guess = base_guess.copy_and_update(extension=file_extension)
if isinstance(content, str): if url is not None:
fh.write(content.encode("utf-8")) # Deprecated -- use stream_info
else: base_guess = base_guess.copy_and_update(url=url)
fh.write(content)
fh.close()
# Use puremagic to check for more extension options # Append the base guess, if it's non-trivial
for g in self._guess_ext_magic(temp_path): if base_guess is not None:
self._append_ext(extensions, g) if base_guess.mimetype is not None or base_guess.extension is not None:
guesses.append(base_guess)
else:
# Create a base guess with no information
base_guess = StreamInfo()
# Convert # Create a placeholder filename to help with guessing
result = self._convert(temp_path, extensions, **kwargs) placeholder_filename = None
# Clean up if base_guess.filename is not None:
finally: placeholder_filename = base_guess.filename
try: elif base_guess.extension is not None:
fh.close() placeholder_filename = "placeholder" + base_guess.extension
except Exception:
pass
os.unlink(temp_path)
return result # Add guesses based on stream content
for guess in StreamInfo.guess_from_stream(
file_stream=stream, filename_hint=placeholder_filename
):
guesses.append(base_guess.copy_and_update(guess))
# Perform the conversion
return self._convert(file_stream=stream, stream_info_guesses=guesses, **kwargs)
def convert_url( def convert_url(
self, url: str, **kwargs: Any self, url: str, **kwargs: Any
@ -264,173 +316,197 @@ class MarkItDown:
return self.convert_response(response, **kwargs) return self.convert_response(response, **kwargs)
def convert_response( def convert_response(
self, response: requests.Response, **kwargs: Any self,
) -> DocumentConverterResult: # TODO fix kwargs type response: requests.Response,
# Prepare a list of extensions to try (in order of priority) *,
ext = kwargs.get("file_extension") stream_info: Optional[StreamInfo] = None,
extensions = [ext] if ext is not None else [] file_extension: Optional[str] = None, # Deprecated -- use stream_info
url: Optional[str] = None, # Deprecated -- use stream_info
# Guess from the mimetype **kwargs: Any,
content_type = response.headers.get("content-type", "").split(";")[0]
self._append_ext(extensions, mimetypes.guess_extension(content_type))
# Read the content disposition if there is one
content_disposition = response.headers.get("content-disposition", "")
m = re.search(r"filename=([^;]+)", content_disposition)
if m:
base, ext = os.path.splitext(m.group(1).strip("\"'"))
self._append_ext(extensions, ext)
# Read from the extension from the path
base, ext = os.path.splitext(urlparse(response.url).path)
self._append_ext(extensions, ext)
# Save the file locally to a temporary file. It will be deleted before this method exits
handle, temp_path = tempfile.mkstemp()
fh = os.fdopen(handle, "wb")
result = None
try:
# Download the file
for chunk in response.iter_content(chunk_size=512):
fh.write(chunk)
fh.close()
# Use puremagic to check for more extension options
for g in self._guess_ext_magic(temp_path):
self._append_ext(extensions, g)
# Convert
result = self._convert(temp_path, extensions, url=response.url, **kwargs)
# Clean up
finally:
try:
fh.close()
except Exception:
pass
os.unlink(temp_path)
return result
def _convert(
self, local_path: str, extensions: List[Union[str, None]], **kwargs
) -> DocumentConverterResult: ) -> DocumentConverterResult:
res: Union[None, DocumentConverterResult] = None # If there is a content-type header, get the mimetype and charset (if present)
mimetype: Optional[str] = None
charset: Optional[str] = None
# Keep track of which converters throw exceptions if "content-type" in response.headers:
failed_attempts: List[FailedConversionAttempt] = [] parts = response.headers["content-type"].split(";")
mimetype = parts.pop(0).strip()
for part in parts:
if part.strip().startswith("charset="):
_charset = part.split("=")[1].strip()
if len(_charset) > 0:
charset = _charset
# Create a copy of the page_converters list, sorted by priority. # If there is a content-disposition header, get the filename and possibly the extension
# We do this with each call to _convert because the priority of converters may change between calls. filename: Optional[str] = None
# The sort is guaranteed to be stable, so converters with the same priority will remain in the same order. extension: Optional[str] = None
sorted_converters = sorted(self._page_converters, key=lambda x: x.priority) if "content-disposition" in response.headers:
m = re.search(r"filename=([^;]+)", response.headers["content-disposition"])
if m:
filename = m.group(1).strip("\"'")
_, _extension = os.path.splitext(filename)
if len(_extension) > 0:
extension = _extension
for ext in extensions + [None]: # Try last with no extension # If there is still no filename, try to read it from the url
for converter in sorted_converters: if filename is None:
_kwargs = copy.deepcopy(kwargs) parsed_url = urlparse(response.url)
_, _extension = os.path.splitext(parsed_url.path)
if len(_extension) > 0: # Looks like this might be a file!
filename = os.path.basename(parsed_url.path)
extension = _extension
# Overwrite file_extension appropriately # Create an initial guess from all this information
if ext is None: base_guess = StreamInfo(
if "file_extension" in _kwargs: mimetype=mimetype,
del _kwargs["file_extension"] charset=charset,
else: filename=filename,
_kwargs.update({"file_extension": ext}) extension=extension,
url=response.url,
# Copy any additional global options
if "llm_client" not in _kwargs and self._llm_client is not None:
_kwargs["llm_client"] = self._llm_client
if "llm_model" not in _kwargs and self._llm_model is not None:
_kwargs["llm_model"] = self._llm_model
if "style_map" not in _kwargs and self._style_map is not None:
_kwargs["style_map"] = self._style_map
if "exiftool_path" not in _kwargs and self._exiftool_path is not None:
_kwargs["exiftool_path"] = self._exiftool_path
# Add the list of converters for nested processing
_kwargs["_parent_converters"] = self._page_converters
# If we hit an error log it and keep trying
try:
res = converter.convert(local_path, **_kwargs)
except Exception:
failed_attempts.append(
FailedConversionAttempt(
converter=converter, exc_info=sys.exc_info()
)
)
if res is not None:
# Normalize the content
res.text_content = "\n".join(
[line.rstrip() for line in re.split(r"\r?\n", res.text_content)]
)
res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
# Todo
return res
# If we got this far without success, report any exceptions
if len(failed_attempts) > 0:
raise FileConversionException(attempts=failed_attempts)
# Nothing can handle it!
raise UnsupportedFormatException(
f"Could not convert '{local_path}' to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported."
) )
def _append_ext(self, extensions, ext): # Update with any additional info from the arguments
"""Append a unique non-None, non-empty extension to a list of extensions.""" if stream_info is not None:
if ext is None: base_guess = base_guess.copy_and_update(stream_info)
return if file_extension is not None:
ext = ext.strip() # Deprecated -- use stream_info
if ext == "": base_guess = base_guess.copy_and_update(extension=file_extension)
return if url is not None:
if ext in extensions: # Deprecated -- use stream_info
return base_guess = base_guess.copy_and_update(url=url)
extensions.append(ext)
# Add the guess if it's non-trivial
guesses: List[StreamInfo] = []
if base_guess.mimetype is not None or base_guess.extension is not None:
guesses.append(base_guess)
# Read into BytesIO
buffer = io.BytesIO()
for chunk in response.iter_content(chunk_size=512):
buffer.write(chunk)
buffer.seek(0)
# Create a placeholder filename to help with guessing
placeholder_filename = None
if base_guess.filename is not None:
placeholder_filename = base_guess.filename
elif base_guess.extension is not None:
placeholder_filename = "placeholder" + base_guess.extension
# Add guesses based on stream content
for guess in StreamInfo.guess_from_stream(
file_stream=buffer, filename_hint=placeholder_filename
):
guesses.append(base_guess.copy_and_update(guess))
# Convert
return self._convert(file_stream=buffer, stream_info_guesses=guesses, **kwargs)
def _convert(
self, *, file_stream: BinaryIO, stream_info_guesses: List[StreamInfo], **kwargs
) -> DocumentConverterResult:
# Lazily create a temporary file, if needed, for backward compatibility
# This is to support a deprecated feature, and will be removed in the future
temp_file = None
def get_temp_file():
nonlocal temp_file
if temp_file is not None:
return temp_file
else:
cur_pos = file_stream.tell()
handle, temp_file = tempfile.mkstemp()
fh = os.fdopen(handle, "wb")
file_stream.seek(0)
fh.write(file_stream.read())
file_stream.seek(cur_pos)
fh.close()
return temp_file
def _guess_ext_magic(self, path):
"""Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
# Use puremagic to guess
try: try:
guesses = puremagic.magic_file(path) res: Union[None, DocumentConverterResult] = None
# Fix for: https://github.com/microsoft/markitdown/issues/222 # Keep track of which converters throw exceptions
# If there are no guesses, then try again after trimming leading ASCII whitespaces. failed_attempts: List[FailedConversionAttempt] = []
# ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f'
# (space, tab, newline, carriage return, vertical tab, form feed).
if len(guesses) == 0:
with open(path, "rb") as file:
while True:
char = file.read(1)
if not char: # End of file
break
if not char.isspace():
file.seek(file.tell() - 1)
break
try:
guesses = puremagic.magic_stream(file)
except puremagic.main.PureError:
pass
extensions = list() # Create a copy of the page_converters list, sorted by priority.
for g in guesses: # We do this with each call to _convert because the priority of converters may change between calls.
ext = g.extension.strip() # The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
if len(ext) > 0: sorted_converters = sorted(self._page_converters, key=lambda x: x.priority)
if not ext.startswith("."):
ext = "." + ext for file_info in stream_info_guesses + [None]:
if ext not in extensions: for converter in sorted_converters:
extensions.append(ext) _kwargs = copy.deepcopy(kwargs)
return extensions
except FileNotFoundError: # Copy any additional global options
pass if "llm_client" not in _kwargs and self._llm_client is not None:
except IsADirectoryError: _kwargs["llm_client"] = self._llm_client
pass
except PermissionError: if "llm_model" not in _kwargs and self._llm_model is not None:
pass _kwargs["llm_model"] = self._llm_model
return []
if "style_map" not in _kwargs and self._style_map is not None:
_kwargs["style_map"] = self._style_map
if (
"exiftool_path" not in _kwargs
and self._exiftool_path is not None
):
_kwargs["exiftool_path"] = self._exiftool_path
# Add the list of converters for nested processing
_kwargs["_parent_converters"] = self._page_converters
# Add backwards compatibility
if isinstance(converter, DocumentConverter):
if file_info is not None:
# Legacy converters need a file_extension
if file_info.extension is not None:
_kwargs["file_extension"] = file_info.extension
# And benefit from urls, when available
if file_info.url is not None:
_kwargs["url"] = file_info.url
try:
res = converter.convert(get_temp_file(), **_kwargs)
except Exception:
failed_attempts.append(
FailedConversionAttempt(
converter=converter, exc_info=sys.exc_info()
)
)
else:
raise NotImplementedError("TODO")
if res is not None:
# Normalize the content
res.text_content = "\n".join(
[
line.rstrip()
for line in re.split(r"\r?\n", res.text_content)
]
)
res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
return res
# If we got this far without success, report any exceptions
if len(failed_attempts) > 0:
raise FileConversionException(attempts=failed_attempts)
# Nothing can handle it!
raise UnsupportedFormatException(
f"Could not convert stream to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported."
)
finally:
# Clean up the temporary file
if temp_file is not None:
try:
os.unlink(temp_file)
except Exception:
pass
def register_page_converter(self, converter: DocumentConverter) -> None: def register_page_converter(self, converter: DocumentConverter) -> None:
"""DEPRECATED: Use register_converter instead.""" """DEPRECATED: Use register_converter instead."""

View file

@ -0,0 +1,105 @@
import puremagic
from dataclasses import dataclass, asdict
from typing import Optional, BinaryIO, List, TypeVar, Type
# This is a workaround for Self not being available in Python 3.10
T = TypeVar("T", bound="StreamInfo")
# Mimetype substitutions table
MIMETYPE_SUBSTITUTIONS = {
"application/excel": "application/vnd.ms-excel",
"application/mspowerpoint": "application/vnd.ms-powerpoint",
}
@dataclass(kw_only=True, frozen=True)
class StreamInfo:
"""The StreamInfo class is used to store information about a file stream.
All fields can be None, and will depend on how the stream was opened.
"""
mimetype: Optional[str] = None
extension: Optional[str] = None
charset: Optional[str] = None
filename: Optional[
str
] = None # From local path, url, or Content-Disposition header
local_path: Optional[str] = None # If read from disk
url: Optional[str] = None # If read from url
def copy_and_update(self, *args, **kwargs):
"""Copy the StreamInfo object and update it with the given StreamInfo
instance and/or other keyword arguments."""
new_info = asdict(self)
for si in args:
assert isinstance(si, StreamInfo)
new_info.update({k: v for k, v in asdict(si).items() if v is not None})
if len(kwargs) > 0:
new_info.update(kwargs)
return StreamInfo(**new_info)
@classmethod
def guess_from_stream(
cls: Type[T], file_stream: BinaryIO, *, filename_hint: Optional[str] = None
) -> List[T]:
"""
Guess StreamInfo properties (mostly mimetype and extension) from a stream.
Args:
- stream: The stream to guess the StreamInfo from.
- filename_hint [Optional]: A filename hint to help with the guessing (may be a placeholder, and not actually be the file name)
Returns a list of StreamInfo objects in order of confidence.
"""
guesses: List[StreamInfo] = []
def _puremagic(
file_stream, filename_hint
) -> puremagic.main.PureMagicWithConfidence:
"""Wrap guesses to handle exceptions."""
try:
return puremagic.magic_stream(file_stream, filename=filename_hint)
except puremagic.main.PureError as e:
return []
cur_pos = file_stream.tell()
type_guesses = _puremagic(file_stream, filename_hint=filename_hint)
if len(type_guesses) == 0:
# Fix for: https://github.com/microsoft/markitdown/issues/222
# If there are no guesses, then try again after trimming leading ASCII whitespaces.
# ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f'
# (space, tab, newline, carriage return, vertical tab, form feed).
# Eat all the leading whitespace
file_stream.seek(cur_pos)
while True:
char = file_stream.read(1)
if not char: # End of file
break
if not char.isspace():
file_stream.seek(file_stream.tell() - 1)
break
# Try again
type_guesses = _puremagic(file_stream, filename_hint=filename_hint)
file_stream.seek(cur_pos)
# Convert and return the guesses
for guess in type_guesses:
kwargs: dict[str, str] = {}
if guess.extension:
kwargs["extension"] = guess.extension
if guess.mime_type:
kwargs["mimetype"] = MIMETYPE_SUBSTITUTIONS.get(
guess.mime_type, guess.mime_type
)
if len(kwargs) > 0:
# We don't add the filename_hint, because sometimes it's just a placeholder,
# and, in any case, doesn't add new information.
guesses.append(cls(**kwargs))
# Return the guesses
return guesses

View file

@ -8,7 +8,12 @@ import requests
from warnings import catch_warnings, resetwarnings from warnings import catch_warnings, resetwarnings
from markitdown import MarkItDown, UnsupportedFormatException, FileConversionException from markitdown import (
MarkItDown,
UnsupportedFormatException,
FileConversionException,
StreamInfo,
)
skip_remote = ( skip_remote = (
True if os.environ.get("GITHUB_ACTIONS") else False True if os.environ.get("GITHUB_ACTIONS") else False
@ -162,6 +167,107 @@ def validate_strings(result, expected_strings, exclude_strings=None):
assert string not in text_content assert string not in text_content
def test_stream_info_operations() -> None:
"""Test operations performed on StreamInfo objects."""
stream_info_original = StreamInfo(
mimetype="mimetype.1",
extension="extension.1",
charset="charset.1",
filename="filename.1",
local_path="local_path.1",
url="url.1",
)
# Check updating all attributes by keyword
keywords = ["mimetype", "extension", "charset", "filename", "local_path", "url"]
for keyword in keywords:
updated_stream_info = stream_info_original.copy_and_update(
**{keyword: f"{keyword}.2"}
)
# Make sure the targeted attribute is updated
assert getattr(updated_stream_info, keyword) == f"{keyword}.2"
# Make sure the other attributes are unchanged
for k in keywords:
if k != keyword:
assert getattr(stream_info_original, k) == getattr(
updated_stream_info, k
)
# Check updating all attributes by passing a new StreamInfo object
keywords = ["mimetype", "extension", "charset", "filename", "local_path", "url"]
for keyword in keywords:
updated_stream_info = stream_info_original.copy_and_update(
StreamInfo(**{keyword: f"{keyword}.2"})
)
# Make sure the targeted attribute is updated
assert getattr(updated_stream_info, keyword) == f"{keyword}.2"
# Make sure the other attributes are unchanged
for k in keywords:
if k != keyword:
assert getattr(stream_info_original, k) == getattr(
updated_stream_info, k
)
# Check mixing and matching
updated_stream_info = stream_info_original.copy_and_update(
StreamInfo(extension="extension.2", filename="filename.2"),
mimetype="mimetype.3",
charset="charset.3",
)
assert updated_stream_info.extension == "extension.2"
assert updated_stream_info.filename == "filename.2"
assert updated_stream_info.mimetype == "mimetype.3"
assert updated_stream_info.charset == "charset.3"
assert updated_stream_info.local_path == "local_path.1"
assert updated_stream_info.url == "url.1"
# Check multiple StreamInfo objects
updated_stream_info = stream_info_original.copy_and_update(
StreamInfo(extension="extension.4", filename="filename.5"),
StreamInfo(mimetype="mimetype.6", charset="charset.7"),
)
assert updated_stream_info.extension == "extension.4"
assert updated_stream_info.filename == "filename.5"
assert updated_stream_info.mimetype == "mimetype.6"
assert updated_stream_info.charset == "charset.7"
assert updated_stream_info.local_path == "local_path.1"
assert updated_stream_info.url == "url.1"
def test_stream_info_guesses() -> None:
"""Test StreamInfo guesses based on stream content."""
test_tuples = [
(
os.path.join(TEST_FILES_DIR, "test.xlsx"),
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
),
(
os.path.join(TEST_FILES_DIR, "test.docx"),
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
),
(
os.path.join(TEST_FILES_DIR, "test.pptx"),
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
),
(os.path.join(TEST_FILES_DIR, "test.xls"), "application/vnd.ms-excel"),
]
for file_path, expected_mimetype in test_tuples:
with open(file_path, "rb") as f:
guesses = StreamInfo.guess_from_stream(
f, filename_hint=os.path.basename(file_path)
)
assert len(guesses) > 0
assert guesses[0].mimetype == expected_mimetype
assert guesses[0].extension == os.path.splitext(file_path)[1]
@pytest.mark.skipif( @pytest.mark.skipif(
skip_remote, skip_remote,
reason="do not run tests that query external urls", reason="do not run tests that query external urls",
@ -266,6 +372,11 @@ def test_markitdown_local() -> None:
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.json")) result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.json"))
validate_strings(result, JSON_TEST_STRINGS) validate_strings(result, JSON_TEST_STRINGS)
# Test input from a stream
input_data = b"<html><body><h1>Test</h1></body></html>"
result = markitdown.convert_stream(io.BytesIO(input_data))
assert "# Test" in result.text_content
# Test input with leading blank characters # Test input with leading blank characters
input_data = b" \n\n\n<html><body><h1>Test</h1></body></html>" input_data = b" \n\n\n<html><body><h1>Test</h1></body></html>"
result = markitdown.convert_stream(io.BytesIO(input_data)) result = markitdown.convert_stream(io.BytesIO(input_data))
@ -342,9 +453,11 @@ def test_markitdown_llm() -> None:
if __name__ == "__main__": if __name__ == "__main__":
"""Runs this file's tests from the command line.""" """Runs this file's tests from the command line."""
test_stream_info_operations()
test_stream_info_guesses()
test_markitdown_remote() test_markitdown_remote()
test_markitdown_local() test_markitdown_local()
test_exceptions() # test_exceptions()
test_markitdown_exiftool() # test_markitdown_exiftool()
# test_markitdown_llm() # test_markitdown_llm()
print("All tests passed!") print("All tests passed!")