Experimenting with new signatures.

This commit is contained in:
Adam Fourney 2025-03-03 23:01:16 -08:00
parent e43632b048
commit 7bc6d827ee
5 changed files with 519 additions and 221 deletions

View file

@ -5,6 +5,7 @@
from .__about__ import __version__ from .__about__ import __version__
from ._markitdown import MarkItDown from ._markitdown import MarkItDown
from ._base_converter import DocumentConverterResult, BaseDocumentConverter from ._base_converter import DocumentConverterResult, BaseDocumentConverter
from ._stream_info import StreamInfo
from ._exceptions import ( from ._exceptions import (
MarkItDownException, MarkItDownException,
MissingDependencyException, MissingDependencyException,
@ -25,4 +26,5 @@ __all__ = [
"FailedConversionAttempt", "FailedConversionAttempt",
"FileConversionException", "FileConversionException",
"UnsupportedFormatException", "UnsupportedFormatException",
"StreamInfo",
] ]

View file

@ -1,3 +1,4 @@
from ._stream_info import StreamInfo
from typing import Any, Union, BinaryIO, Optional from typing import Any, Union, BinaryIO, Optional
@ -13,6 +14,9 @@ class DocumentConverterResult:
""" """
Initialize the DocumentConverterResult. Initialize the DocumentConverterResult.
The only required parameter is the converted Markdown text.
The title, and any other metadata that may be added in the future, are optional.
Parameters: Parameters:
- markdown: The converted Markdown text. - markdown: The converted Markdown text.
- title: Optional title of the document. - title: Optional title of the document.
@ -72,27 +76,25 @@ class BaseDocumentConverter:
def convert( def convert(
self, self,
file_stream, file_stream: BinaryIO,
*, stream_info: StreamInfo,
mime_type: str = "application/octet-stream", **kwargs: Any, # Options to pass to the converter
file_extension: Optional[str] = None,
charset: Optional[str] = None,
**kwargs: Any,
) -> Union[None, DocumentConverterResult]: ) -> Union[None, DocumentConverterResult]:
""" """
Convert a document to Markdown text, or return None if the converter Convert a document to Markdown text, or return None if the converter
cannot handle the document (causing the next converter to be tried). cannot handle the document (causing the next converter to be tried).
The determination of whether a converter can handle a document is primarily based on The determination of whether a converter can handle a document is primarily based on
the provided MIME type. The file extension can serve as a secondary check if the the provided `stream_info.mimetype`. The field `stream_info.extension` can serve as
MIME type is not sufficiently specific (e.g., application/octet-stream). Finally, the a secondary check if the MIME type is not sufficiently specific
charset is used to determine the encoding of the file content in cases of text/* (e.g., application/octet-stream). In the case of data retrieved via HTTP, the
`stream_info.url` might also be referenced to guide conversion (e.g., special-handling
for Wikipedia). Finally, the `stream_info.charset` is used to determine the encoding
of the file content in cases of text/*
Parameters: Parameters:
- file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods. - file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.
- mime_type: The MIME type of the file. Default is "application/octet-stream". - stream_info: The StreamInfo object containing metadata about the file (mimetype, extension, charset, set)
- file_extension: The file extension of the file. Default is None.
- charset: The character set of the file. Default is None.
- kwargs: Additional keyword arguments for the converter. - kwargs: Additional keyword arguments for the converter.
Returns: Returns:

View file

@ -6,8 +6,9 @@ import sys
import tempfile import tempfile
import warnings import warnings
import traceback import traceback
import io
from importlib.metadata import entry_points from importlib.metadata import entry_points
from typing import Any, List, Optional, Union from typing import Any, List, Optional, Union, BinaryIO
from pathlib import Path from pathlib import Path
from urllib.parse import urlparse from urllib.parse import urlparse
from warnings import warn from warnings import warn
@ -16,6 +17,8 @@ from warnings import warn
import puremagic import puremagic
import requests import requests
from ._stream_info import StreamInfo
from .converters import ( from .converters import (
DocumentConverter, DocumentConverter,
PlainTextConverter, PlainTextConverter,
@ -175,12 +178,17 @@ class MarkItDown:
warn("Plugins converters are already enabled.", RuntimeWarning) warn("Plugins converters are already enabled.", RuntimeWarning)
def convert( def convert(
self, source: Union[str, requests.Response, Path], **kwargs: Any self,
source: Union[str, requests.Response, Path, BinaryIO],
*,
stream_info: Optional[StreamInfo] = None,
**kwargs: Any,
) -> DocumentConverterResult: # TODO: deal with kwargs ) -> DocumentConverterResult: # TODO: deal with kwargs
""" """
Args: Args:
- source: can be a string representing a path either as string pathlib path object or url, or a requests.response object - source: can be a path (str or Path), url, or a requests.response object
- extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.) - stream_info: optional stream info to use for the conversion. If None, infer from source
- kwargs: additional arguments to pass to the converter
""" """
# Local path or url # Local path or url
@ -192,68 +200,112 @@ class MarkItDown:
): ):
return self.convert_url(source, **kwargs) return self.convert_url(source, **kwargs)
else: else:
return self.convert_local(source, **kwargs) return self.convert_local(source, stream_info=stream_info, **kwargs)
# Path object
elif isinstance(source, Path):
return self.convert_local(source, stream_info=stream_info, **kwargs)
# Request response # Request response
elif isinstance(source, requests.Response): elif isinstance(source, requests.Response):
return self.convert_response(source, **kwargs) return self.convert_response(source, **kwargs)
elif isinstance(source, Path): # Binary stream
return self.convert_local(source, **kwargs) elif (
hasattr(source, "read")
and callable(source.read)
and not isinstance(source, io.TextIOBase)
):
return self.convert_stream(source, **kwargs)
else:
raise TypeError(
f"Invalid source type: {type(source)}. Expected str, requests.Response, BinaryIO."
)
def convert_local( def convert_local(
self, path: Union[str, Path], **kwargs: Any self,
) -> DocumentConverterResult: # TODO: deal with kwargs path: Union[str, Path],
*,
stream_info: Optional[StreamInfo] = None,
file_extension: Optional[str] = None, # Deprecated -- use stream_info
url: Optional[str] = None, # Deprecated -- use stream_info
**kwargs: Any,
) -> DocumentConverterResult:
if isinstance(path, Path): if isinstance(path, Path):
path = str(path) path = str(path)
# Prepare a list of extensions to try (in order of priority)
ext = kwargs.get("file_extension")
extensions = [ext] if ext is not None else []
# Get extension alternatives from the path and puremagic # Build a base StreamInfo object from which to start guesses
base, ext = os.path.splitext(path) base_stream_info = StreamInfo(
self._append_ext(extensions, ext) local_path=path,
extension=os.path.splitext(path)[1],
filename=os.path.basename(path),
)
for g in self._guess_ext_magic(path): # Extend the base_stream_info with any additional info from the arguments
self._append_ext(extensions, g) if stream_info is not None:
base_stream_info = base_stream_info.copy_and_update(stream_info)
# Convert if file_extension is not None:
return self._convert(path, extensions, **kwargs) # Deprecated -- use stream_info
base_stream_info = base_stream_info.copy_and_update(
extension=file_extension
)
if url is not None:
# Deprecated -- use stream_info
base_stream_info = base_stream_info.copy_and_update(url=url)
with open(path, "rb") as fh:
# Prepare a list of configurations to try, starting with the base_stream_info
guesses: List[StreamInfo] = [base_stream_info]
for guess in StreamInfo.guess_from_stream(
file_stream=fh, filename_hint=path
):
guesses.append(base_stream_info.copy_and_update(guess))
return self._convert(file_stream=fh, stream_info_guesses=guesses, **kwargs)
# TODO what should stream's type be?
def convert_stream( def convert_stream(
self, stream: Any, **kwargs: Any self,
) -> DocumentConverterResult: # TODO: deal with kwargs stream: BinaryIO,
# Prepare a list of extensions to try (in order of priority) *,
ext = kwargs.get("file_extension") stream_info: Optional[StreamInfo] = None,
extensions = [ext] if ext is not None else [] file_extension: Optional[str] = None, # Deprecated -- use stream_info
url: Optional[str] = None, # Deprecated -- use stream_info
**kwargs: Any,
) -> DocumentConverterResult:
guesses: List[StreamInfo] = []
# Save the file locally to a temporary file. It will be deleted before this method exits # Do we have anything on which to base a guess?
handle, temp_path = tempfile.mkstemp() base_guess = None
fh = os.fdopen(handle, "wb") if stream_info is not None or file_extension is not None or url is not None:
result = None base_guess = stream_info if stream_info is not None else StreamInfo()
try: if file_extension is not None:
# Write to the temporary file # Deprecated -- use stream_info
content = stream.read() base_guess = base_guess.copy_and_update(extension=file_extension)
if isinstance(content, str): if url is not None:
fh.write(content.encode("utf-8")) # Deprecated -- use stream_info
else: base_guess = base_guess.copy_and_update(url=url)
fh.write(content)
fh.close()
# Use puremagic to check for more extension options # Append the base guess, if it's non-trivial
for g in self._guess_ext_magic(temp_path): if base_guess is not None:
self._append_ext(extensions, g) if base_guess.mimetype is not None or base_guess.extension is not None:
guesses.append(base_guess)
else:
# Create a base guess with no information
base_guess = StreamInfo()
# Convert # Create a placeholder filename to help with guessing
result = self._convert(temp_path, extensions, **kwargs) placeholder_filename = None
# Clean up if base_guess.filename is not None:
finally: placeholder_filename = base_guess.filename
try: elif base_guess.extension is not None:
fh.close() placeholder_filename = "placeholder" + base_guess.extension
except Exception:
pass
os.unlink(temp_path)
return result # Add guesses based on stream content
for guess in StreamInfo.guess_from_stream(
file_stream=stream, filename_hint=placeholder_filename
):
guesses.append(base_guess.copy_and_update(guess))
# Perform the conversion
return self._convert(file_stream=stream, stream_info_guesses=guesses, **kwargs)
def convert_url( def convert_url(
self, url: str, **kwargs: Any self, url: str, **kwargs: Any
@ -264,173 +316,197 @@ class MarkItDown:
return self.convert_response(response, **kwargs) return self.convert_response(response, **kwargs)
def convert_response( def convert_response(
self, response: requests.Response, **kwargs: Any self,
) -> DocumentConverterResult: # TODO fix kwargs type response: requests.Response,
# Prepare a list of extensions to try (in order of priority) *,
ext = kwargs.get("file_extension") stream_info: Optional[StreamInfo] = None,
extensions = [ext] if ext is not None else [] file_extension: Optional[str] = None, # Deprecated -- use stream_info
url: Optional[str] = None, # Deprecated -- use stream_info
# Guess from the mimetype **kwargs: Any,
content_type = response.headers.get("content-type", "").split(";")[0]
self._append_ext(extensions, mimetypes.guess_extension(content_type))
# Read the content disposition if there is one
content_disposition = response.headers.get("content-disposition", "")
m = re.search(r"filename=([^;]+)", content_disposition)
if m:
base, ext = os.path.splitext(m.group(1).strip("\"'"))
self._append_ext(extensions, ext)
# Read from the extension from the path
base, ext = os.path.splitext(urlparse(response.url).path)
self._append_ext(extensions, ext)
# Save the file locally to a temporary file. It will be deleted before this method exits
handle, temp_path = tempfile.mkstemp()
fh = os.fdopen(handle, "wb")
result = None
try:
# Download the file
for chunk in response.iter_content(chunk_size=512):
fh.write(chunk)
fh.close()
# Use puremagic to check for more extension options
for g in self._guess_ext_magic(temp_path):
self._append_ext(extensions, g)
# Convert
result = self._convert(temp_path, extensions, url=response.url, **kwargs)
# Clean up
finally:
try:
fh.close()
except Exception:
pass
os.unlink(temp_path)
return result
def _convert(
self, local_path: str, extensions: List[Union[str, None]], **kwargs
) -> DocumentConverterResult: ) -> DocumentConverterResult:
res: Union[None, DocumentConverterResult] = None # If there is a content-type header, get the mimetype and charset (if present)
mimetype: Optional[str] = None
charset: Optional[str] = None
# Keep track of which converters throw exceptions if "content-type" in response.headers:
failed_attempts: List[FailedConversionAttempt] = [] parts = response.headers["content-type"].split(";")
mimetype = parts.pop(0).strip()
for part in parts:
if part.strip().startswith("charset="):
_charset = part.split("=")[1].strip()
if len(_charset) > 0:
charset = _charset
# Create a copy of the page_converters list, sorted by priority. # If there is a content-disposition header, get the filename and possibly the extension
# We do this with each call to _convert because the priority of converters may change between calls. filename: Optional[str] = None
# The sort is guaranteed to be stable, so converters with the same priority will remain in the same order. extension: Optional[str] = None
sorted_converters = sorted(self._page_converters, key=lambda x: x.priority) if "content-disposition" in response.headers:
m = re.search(r"filename=([^;]+)", response.headers["content-disposition"])
if m:
filename = m.group(1).strip("\"'")
_, _extension = os.path.splitext(filename)
if len(_extension) > 0:
extension = _extension
for ext in extensions + [None]: # Try last with no extension # If there is still no filename, try to read it from the url
for converter in sorted_converters: if filename is None:
_kwargs = copy.deepcopy(kwargs) parsed_url = urlparse(response.url)
_, _extension = os.path.splitext(parsed_url.path)
if len(_extension) > 0: # Looks like this might be a file!
filename = os.path.basename(parsed_url.path)
extension = _extension
# Overwrite file_extension appropriately # Create an initial guess from all this information
if ext is None: base_guess = StreamInfo(
if "file_extension" in _kwargs: mimetype=mimetype,
del _kwargs["file_extension"] charset=charset,
else: filename=filename,
_kwargs.update({"file_extension": ext}) extension=extension,
url=response.url,
# Copy any additional global options
if "llm_client" not in _kwargs and self._llm_client is not None:
_kwargs["llm_client"] = self._llm_client
if "llm_model" not in _kwargs and self._llm_model is not None:
_kwargs["llm_model"] = self._llm_model
if "style_map" not in _kwargs and self._style_map is not None:
_kwargs["style_map"] = self._style_map
if "exiftool_path" not in _kwargs and self._exiftool_path is not None:
_kwargs["exiftool_path"] = self._exiftool_path
# Add the list of converters for nested processing
_kwargs["_parent_converters"] = self._page_converters
# If we hit an error log it and keep trying
try:
res = converter.convert(local_path, **_kwargs)
except Exception:
failed_attempts.append(
FailedConversionAttempt(
converter=converter, exc_info=sys.exc_info()
)
)
if res is not None:
# Normalize the content
res.text_content = "\n".join(
[line.rstrip() for line in re.split(r"\r?\n", res.text_content)]
)
res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
# Todo
return res
# If we got this far without success, report any exceptions
if len(failed_attempts) > 0:
raise FileConversionException(attempts=failed_attempts)
# Nothing can handle it!
raise UnsupportedFormatException(
f"Could not convert '{local_path}' to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported."
) )
def _append_ext(self, extensions, ext): # Update with any additional info from the arguments
"""Append a unique non-None, non-empty extension to a list of extensions.""" if stream_info is not None:
if ext is None: base_guess = base_guess.copy_and_update(stream_info)
return if file_extension is not None:
ext = ext.strip() # Deprecated -- use stream_info
if ext == "": base_guess = base_guess.copy_and_update(extension=file_extension)
return if url is not None:
if ext in extensions: # Deprecated -- use stream_info
return base_guess = base_guess.copy_and_update(url=url)
extensions.append(ext)
# Add the guess if it's non-trivial
guesses: List[StreamInfo] = []
if base_guess.mimetype is not None or base_guess.extension is not None:
guesses.append(base_guess)
# Read into BytesIO
buffer = io.BytesIO()
for chunk in response.iter_content(chunk_size=512):
buffer.write(chunk)
buffer.seek(0)
# Create a placeholder filename to help with guessing
placeholder_filename = None
if base_guess.filename is not None:
placeholder_filename = base_guess.filename
elif base_guess.extension is not None:
placeholder_filename = "placeholder" + base_guess.extension
# Add guesses based on stream content
for guess in StreamInfo.guess_from_stream(
file_stream=buffer, filename_hint=placeholder_filename
):
guesses.append(base_guess.copy_and_update(guess))
# Convert
return self._convert(file_stream=buffer, stream_info_guesses=guesses, **kwargs)
def _convert(
self, *, file_stream: BinaryIO, stream_info_guesses: List[StreamInfo], **kwargs
) -> DocumentConverterResult:
# Lazily create a temporary file, if needed, for backward compatibility
# This is to support a deprecated feature, and will be removed in the future
temp_file = None
def get_temp_file():
nonlocal temp_file
if temp_file is not None:
return temp_file
else:
cur_pos = file_stream.tell()
handle, temp_file = tempfile.mkstemp()
fh = os.fdopen(handle, "wb")
file_stream.seek(0)
fh.write(file_stream.read())
file_stream.seek(cur_pos)
fh.close()
return temp_file
def _guess_ext_magic(self, path):
"""Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
# Use puremagic to guess
try: try:
guesses = puremagic.magic_file(path) res: Union[None, DocumentConverterResult] = None
# Fix for: https://github.com/microsoft/markitdown/issues/222 # Keep track of which converters throw exceptions
# If there are no guesses, then try again after trimming leading ASCII whitespaces. failed_attempts: List[FailedConversionAttempt] = []
# ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f'
# (space, tab, newline, carriage return, vertical tab, form feed).
if len(guesses) == 0:
with open(path, "rb") as file:
while True:
char = file.read(1)
if not char: # End of file
break
if not char.isspace():
file.seek(file.tell() - 1)
break
try:
guesses = puremagic.magic_stream(file)
except puremagic.main.PureError:
pass
extensions = list() # Create a copy of the page_converters list, sorted by priority.
for g in guesses: # We do this with each call to _convert because the priority of converters may change between calls.
ext = g.extension.strip() # The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
if len(ext) > 0: sorted_converters = sorted(self._page_converters, key=lambda x: x.priority)
if not ext.startswith("."):
ext = "." + ext for file_info in stream_info_guesses + [None]:
if ext not in extensions: for converter in sorted_converters:
extensions.append(ext) _kwargs = copy.deepcopy(kwargs)
return extensions
except FileNotFoundError: # Copy any additional global options
pass if "llm_client" not in _kwargs and self._llm_client is not None:
except IsADirectoryError: _kwargs["llm_client"] = self._llm_client
pass
except PermissionError: if "llm_model" not in _kwargs and self._llm_model is not None:
pass _kwargs["llm_model"] = self._llm_model
return []
if "style_map" not in _kwargs and self._style_map is not None:
_kwargs["style_map"] = self._style_map
if (
"exiftool_path" not in _kwargs
and self._exiftool_path is not None
):
_kwargs["exiftool_path"] = self._exiftool_path
# Add the list of converters for nested processing
_kwargs["_parent_converters"] = self._page_converters
# Add backwards compatibility
if isinstance(converter, DocumentConverter):
if file_info is not None:
# Legacy converters need a file_extension
if file_info.extension is not None:
_kwargs["file_extension"] = file_info.extension
# And benefit from urls, when available
if file_info.url is not None:
_kwargs["url"] = file_info.url
try:
res = converter.convert(get_temp_file(), **_kwargs)
except Exception:
failed_attempts.append(
FailedConversionAttempt(
converter=converter, exc_info=sys.exc_info()
)
)
else:
raise NotImplementedError("TODO")
if res is not None:
# Normalize the content
res.text_content = "\n".join(
[
line.rstrip()
for line in re.split(r"\r?\n", res.text_content)
]
)
res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
return res
# If we got this far without success, report any exceptions
if len(failed_attempts) > 0:
raise FileConversionException(attempts=failed_attempts)
# Nothing can handle it!
raise UnsupportedFormatException(
f"Could not convert stream to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported."
)
finally:
# Clean up the temporary file
if temp_file is not None:
try:
os.unlink(temp_file)
except Exception:
pass
def register_page_converter(self, converter: DocumentConverter) -> None: def register_page_converter(self, converter: DocumentConverter) -> None:
"""DEPRECATED: Use register_converter instead.""" """DEPRECATED: Use register_converter instead."""

View file

@ -0,0 +1,105 @@
import puremagic
from dataclasses import dataclass, asdict
from typing import Optional, BinaryIO, List, TypeVar, Type
# This is a workaround for Self not being available in Python 3.10
T = TypeVar("T", bound="StreamInfo")
# Mimetype substitutions table
MIMETYPE_SUBSTITUTIONS = {
"application/excel": "application/vnd.ms-excel",
"application/mspowerpoint": "application/vnd.ms-powerpoint",
}
@dataclass(kw_only=True, frozen=True)
class StreamInfo:
"""The StreamInfo class is used to store information about a file stream.
All fields can be None, and will depend on how the stream was opened.
"""
mimetype: Optional[str] = None
extension: Optional[str] = None
charset: Optional[str] = None
filename: Optional[
str
] = None # From local path, url, or Content-Disposition header
local_path: Optional[str] = None # If read from disk
url: Optional[str] = None # If read from url
def copy_and_update(self, *args, **kwargs):
"""Copy the StreamInfo object and update it with the given StreamInfo
instance and/or other keyword arguments."""
new_info = asdict(self)
for si in args:
assert isinstance(si, StreamInfo)
new_info.update({k: v for k, v in asdict(si).items() if v is not None})
if len(kwargs) > 0:
new_info.update(kwargs)
return StreamInfo(**new_info)
@classmethod
def guess_from_stream(
cls: Type[T], file_stream: BinaryIO, *, filename_hint: Optional[str] = None
) -> List[T]:
"""
Guess StreamInfo properties (mostly mimetype and extension) from a stream.
Args:
- stream: The stream to guess the StreamInfo from.
- filename_hint [Optional]: A filename hint to help with the guessing (may be a placeholder, and not actually be the file name)
Returns a list of StreamInfo objects in order of confidence.
"""
guesses: List[StreamInfo] = []
def _puremagic(
file_stream, filename_hint
) -> puremagic.main.PureMagicWithConfidence:
"""Wrap guesses to handle exceptions."""
try:
return puremagic.magic_stream(file_stream, filename=filename_hint)
except puremagic.main.PureError as e:
return []
cur_pos = file_stream.tell()
type_guesses = _puremagic(file_stream, filename_hint=filename_hint)
if len(type_guesses) == 0:
# Fix for: https://github.com/microsoft/markitdown/issues/222
# If there are no guesses, then try again after trimming leading ASCII whitespaces.
# ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f'
# (space, tab, newline, carriage return, vertical tab, form feed).
# Eat all the leading whitespace
file_stream.seek(cur_pos)
while True:
char = file_stream.read(1)
if not char: # End of file
break
if not char.isspace():
file_stream.seek(file_stream.tell() - 1)
break
# Try again
type_guesses = _puremagic(file_stream, filename_hint=filename_hint)
file_stream.seek(cur_pos)
# Convert and return the guesses
for guess in type_guesses:
kwargs: dict[str, str] = {}
if guess.extension:
kwargs["extension"] = guess.extension
if guess.mime_type:
kwargs["mimetype"] = MIMETYPE_SUBSTITUTIONS.get(
guess.mime_type, guess.mime_type
)
if len(kwargs) > 0:
# We don't add the filename_hint, because sometimes it's just a placeholder,
# and, in any case, doesn't add new information.
guesses.append(cls(**kwargs))
# Return the guesses
return guesses

View file

@ -8,7 +8,12 @@ import requests
from warnings import catch_warnings, resetwarnings from warnings import catch_warnings, resetwarnings
from markitdown import MarkItDown, UnsupportedFormatException, FileConversionException from markitdown import (
MarkItDown,
UnsupportedFormatException,
FileConversionException,
StreamInfo,
)
skip_remote = ( skip_remote = (
True if os.environ.get("GITHUB_ACTIONS") else False True if os.environ.get("GITHUB_ACTIONS") else False
@ -162,6 +167,107 @@ def validate_strings(result, expected_strings, exclude_strings=None):
assert string not in text_content assert string not in text_content
def test_stream_info_operations() -> None:
"""Test operations performed on StreamInfo objects."""
stream_info_original = StreamInfo(
mimetype="mimetype.1",
extension="extension.1",
charset="charset.1",
filename="filename.1",
local_path="local_path.1",
url="url.1",
)
# Check updating all attributes by keyword
keywords = ["mimetype", "extension", "charset", "filename", "local_path", "url"]
for keyword in keywords:
updated_stream_info = stream_info_original.copy_and_update(
**{keyword: f"{keyword}.2"}
)
# Make sure the targeted attribute is updated
assert getattr(updated_stream_info, keyword) == f"{keyword}.2"
# Make sure the other attributes are unchanged
for k in keywords:
if k != keyword:
assert getattr(stream_info_original, k) == getattr(
updated_stream_info, k
)
# Check updating all attributes by passing a new StreamInfo object
keywords = ["mimetype", "extension", "charset", "filename", "local_path", "url"]
for keyword in keywords:
updated_stream_info = stream_info_original.copy_and_update(
StreamInfo(**{keyword: f"{keyword}.2"})
)
# Make sure the targeted attribute is updated
assert getattr(updated_stream_info, keyword) == f"{keyword}.2"
# Make sure the other attributes are unchanged
for k in keywords:
if k != keyword:
assert getattr(stream_info_original, k) == getattr(
updated_stream_info, k
)
# Check mixing and matching
updated_stream_info = stream_info_original.copy_and_update(
StreamInfo(extension="extension.2", filename="filename.2"),
mimetype="mimetype.3",
charset="charset.3",
)
assert updated_stream_info.extension == "extension.2"
assert updated_stream_info.filename == "filename.2"
assert updated_stream_info.mimetype == "mimetype.3"
assert updated_stream_info.charset == "charset.3"
assert updated_stream_info.local_path == "local_path.1"
assert updated_stream_info.url == "url.1"
# Check multiple StreamInfo objects
updated_stream_info = stream_info_original.copy_and_update(
StreamInfo(extension="extension.4", filename="filename.5"),
StreamInfo(mimetype="mimetype.6", charset="charset.7"),
)
assert updated_stream_info.extension == "extension.4"
assert updated_stream_info.filename == "filename.5"
assert updated_stream_info.mimetype == "mimetype.6"
assert updated_stream_info.charset == "charset.7"
assert updated_stream_info.local_path == "local_path.1"
assert updated_stream_info.url == "url.1"
def test_stream_info_guesses() -> None:
"""Test StreamInfo guesses based on stream content."""
test_tuples = [
(
os.path.join(TEST_FILES_DIR, "test.xlsx"),
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
),
(
os.path.join(TEST_FILES_DIR, "test.docx"),
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
),
(
os.path.join(TEST_FILES_DIR, "test.pptx"),
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
),
(os.path.join(TEST_FILES_DIR, "test.xls"), "application/vnd.ms-excel"),
]
for file_path, expected_mimetype in test_tuples:
with open(file_path, "rb") as f:
guesses = StreamInfo.guess_from_stream(
f, filename_hint=os.path.basename(file_path)
)
assert len(guesses) > 0
assert guesses[0].mimetype == expected_mimetype
assert guesses[0].extension == os.path.splitext(file_path)[1]
@pytest.mark.skipif( @pytest.mark.skipif(
skip_remote, skip_remote,
reason="do not run tests that query external urls", reason="do not run tests that query external urls",
@ -266,6 +372,11 @@ def test_markitdown_local() -> None:
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.json")) result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.json"))
validate_strings(result, JSON_TEST_STRINGS) validate_strings(result, JSON_TEST_STRINGS)
# Test input from a stream
input_data = b"<html><body><h1>Test</h1></body></html>"
result = markitdown.convert_stream(io.BytesIO(input_data))
assert "# Test" in result.text_content
# Test input with leading blank characters # Test input with leading blank characters
input_data = b" \n\n\n<html><body><h1>Test</h1></body></html>" input_data = b" \n\n\n<html><body><h1>Test</h1></body></html>"
result = markitdown.convert_stream(io.BytesIO(input_data)) result = markitdown.convert_stream(io.BytesIO(input_data))
@ -342,9 +453,11 @@ def test_markitdown_llm() -> None:
if __name__ == "__main__": if __name__ == "__main__":
"""Runs this file's tests from the command line.""" """Runs this file's tests from the command line."""
test_stream_info_operations()
test_stream_info_guesses()
test_markitdown_remote() test_markitdown_remote()
test_markitdown_local() test_markitdown_local()
test_exceptions() # test_exceptions()
test_markitdown_exiftool() # test_markitdown_exiftool()
# test_markitdown_llm() # test_markitdown_llm()
print("All tests passed!") print("All tests passed!")