From 84f8198d8a5871cf692ce22c98a646a1085ea622 Mon Sep 17 00:00:00 2001
From: Adam Fourney <adamfo@microsoft.com>
Date: Wed, 5 Mar 2025 16:41:15 -0800
Subject: [PATCH] Fixed many mypy errors.

---
 .../markitdown/src/markitdown/_markitdown.py  | 24 +++++---
 .../markitdown/src/markitdown/_stream_info.py |  2 +-
 .../converters/_doc_intel_converter.py        |  6 +-
 .../markitdown/converters/_docx_converter.py  |  6 +-
 .../src/markitdown/converters/_exiftool.py    |  4 +-
 .../markitdown/converters/_image_converter.py | 24 ++++----
 .../markitdown/converters/_ipynb_converter.py | 59 ++++++++++++++-----
 .../src/markitdown/converters/_llm_caption.py |  2 +-
 .../converters/_outlook_msg_converter.py      |  6 +-
 .../markitdown/converters/_pdf_converter.py   |  9 ++-
 .../markitdown/converters/_pptx_converter.py  |  6 +-
 .../converters/_transcribe_audio.py           |  6 +-
 .../markitdown/converters/_xlsx_converter.py  | 12 ++--
 .../markitdown/converters/_zip_converter.py   |  2 +-
 packages/markitdown/tests/test_cli.py         |  2 +-
 15 files changed, 115 insertions(+), 55 deletions(-)

diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
index 104db11..a51f227 100644
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -50,10 +50,10 @@ from ._exceptions import (
 # Override mimetype for csv to fix issue on windows
 mimetypes.add_type("text/csv", ".csv")
 
-_plugins: Union[None | List[Any]] = None
+_plugins: Union[None, List[Any]] = None  # None means plugins have not been loaded yet
 
 
-def _load_plugins() -> Union[None | List[Any]]:
+def _load_plugins() -> List[Any]:
     """Lazy load plugins, exiting early if already loaded."""
     global _plugins
 
@@ -94,10 +94,10 @@ class MarkItDown:
             self._requests_session = requests_session
 
         # TODO - remove these (see enable_builtins)
-        self._llm_client = None
-        self._llm_model = None
-        self._exiftool_path = None
-        self._style_map = None
+        self._llm_client: Any = None
+        self._llm_model: Union[str | None] = None
+        self._exiftool_path: Union[str | None] = None
+        self._style_map: Union[str | None] = None
 
         # Register the converters
         self._converters: List[DocumentConverter] = []
@@ -272,12 +272,20 @@ class MarkItDown:
         # Do we have anything on which to base a guess?
         base_guess = None
         if stream_info is not None or file_extension is not None or url is not None:
-            base_guess = stream_info if stream_info is not None else StreamInfo()
+            # Start with a non-None base guess
+            if stream_info is None:
+                base_guess = StreamInfo()
+            else:
+                base_guess = stream_info
+
             if file_extension is not None:
                 # Deprecated -- use stream_info
+                assert base_guess is not None  # for mypy
                 base_guess = base_guess.copy_and_update(extension=file_extension)
+
             if url is not None:
                 # Deprecated -- use stream_info
+                assert base_guess is not None  # for mypy
                 base_guess = base_guess.copy_and_update(url=url)
 
         # Append the base guess, if it's non-trivial
@@ -498,6 +506,6 @@ class MarkItDown:
         )
         self.register_converter(converter)
 
-    def register_converter(self, converter: Union[DocumentConverter]) -> None:
+    def register_converter(self, converter: DocumentConverter) -> None:
         """Register a page text converter."""
         self._converters.insert(0, converter)
diff --git a/packages/markitdown/src/markitdown/_stream_info.py b/packages/markitdown/src/markitdown/_stream_info.py
index 32a51ef..1eaa4d2 100644
--- a/packages/markitdown/src/markitdown/_stream_info.py
+++ b/packages/markitdown/src/markitdown/_stream_info.py
@@ -76,7 +76,7 @@ def _guess_stream_info_from_stream(
 
     def _puremagic(
         file_stream, filename_hint
-    ) -> puremagic.main.PureMagicWithConfidence:
+    ) -> List[puremagic.main.PureMagicWithConfidence]:
         """Wrap guesses to handle exceptions."""
         try:
             return puremagic.magic_stream(file_stream, filename=filename_hint)
diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
index a71ceae..00ab0fc 100644
--- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
@@ -81,9 +81,11 @@ class DocumentIntelligenceConverter(DocumentConverter):
         if _dependency_exc_info is not None:
             raise MissingDependencyException(
                 "DocumentIntelligenceConverter requires the optional dependency [az-doc-intel] (or [all]) to be installed. E.g., `pip install markitdown[az-doc-intel]`"
-            ) from _dependency_exc_info[1].with_traceback(
+            ) from _dependency_exc_info[
+                1
+            ].with_traceback(  # type: ignore[union-attr]
                 _dependency_exc_info[2]
-            )  # Restore the original traceback
+            )  # Restore the original traceback
 
         self.endpoint = endpoint
         self.api_version = api_version
diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py
index c2c643b..a5090ac 100644
--- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@@ -67,9 +67,11 @@ class DocxConverter(HtmlConverter):
                     extension=".docx",
                     feature="docx",
                 )
-            ) from _dependency_exc_info[1].with_traceback(
+            ) from _dependency_exc_info[
+                1
+            ].with_traceback(  # type: ignore[union-attr]
                 _dependency_exc_info[2]
-            )  # Restore the original traceback
+            )  # Restore the original traceback
 
         style_map = kwargs.get("style_map", None)
         return self._html_converter.convert_string(
diff --git a/packages/markitdown/src/markitdown/converters/_exiftool.py b/packages/markitdown/src/markitdown/converters/_exiftool.py
index b492801..5a316f0 100644
--- a/packages/markitdown/src/markitdown/converters/_exiftool.py
+++ b/packages/markitdown/src/markitdown/converters/_exiftool.py
@@ -5,12 +5,12 @@ import sys
 import shutil
 import os
 import warnings
-from typing import BinaryIO, Literal, Optional
+from typing import BinaryIO, Optional, Any
 
 
 def exiftool_metadata(
     file_stream: BinaryIO, *, exiftool_path: Optional[str] = None
-) -> dict[str, Literal]:
+) -> Any:  # TODO: use a more precise type for JSON data
     # Check if we have a valid pointer to exiftool
     if not exiftool_path:
         which_exiftool = shutil.which("exiftool")
diff --git a/packages/markitdown/src/markitdown/converters/_image_converter.py b/packages/markitdown/src/markitdown/converters/_image_converter.py
index d0d7e66..e03dfe8 100644
--- a/packages/markitdown/src/markitdown/converters/_image_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_image_converter.py
@@ -4,7 +4,6 @@ import mimetypes
 from ._exiftool import exiftool_metadata
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
-from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 
 ACCEPTED_MIME_TYPE_PREFIXES = [
     "image/jpeg",
@@ -75,18 +74,17 @@ class ImageConverter(DocumentConverter):
         llm_client = kwargs.get("llm_client")
         llm_model = kwargs.get("llm_model")
         if llm_client is not None and llm_model is not None:
-            md_content += (
-                "\n# Description:\n"
-                + self._get_llm_description(
-                    file_stream,
-                    stream_info,
-                    client=llm_client,
-                    model=llm_model,
-                    prompt=kwargs.get("llm_prompt"),
-                ).strip()
-                + "\n"
+            llm_description = self._get_llm_description(
+                file_stream,
+                stream_info,
+                client=llm_client,
+                model=llm_model,
+                prompt=kwargs.get("llm_prompt"),
             )
 
+            if llm_description is not None:
+                md_content += "\n# Description:\n" + llm_description.strip() + "\n"
+
         return DocumentConverterResult(
             markdown=md_content,
         )
@@ -106,7 +104,9 @@ class ImageConverter(DocumentConverter):
         # Get the content type
         content_type = stream_info.mimetype
         if not content_type:
-            content_type, _ = mimetypes.guess_type("_dummy" + stream_info.extension)
+            content_type, _ = mimetypes.guess_type(
+                "_dummy" + (stream_info.extension or "")
+            )
         if not content_type:
             content_type = "application/octet-stream"
 
diff --git a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py
index 2c5cb3f..490e4e1 100644
--- a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py
@@ -1,9 +1,15 @@
+from typing import BinaryIO, Any
 import json
-from typing import Any, Union
 
 from .._base_converter import DocumentConverter, DocumentConverterResult
-
 from .._exceptions import FileConversionException
+from .._stream_info import StreamInfo
+
+CANDIDATE_MIME_TYPE_PREFIXES = [
+    "application/json",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [".ipynb"]
 
 
 class IpynbConverter(DocumentConverter):
@@ -14,23 +20,48 @@ class IpynbConverter(DocumentConverter):
     ):
         super().__init__(priority=priority)
 
-    def convert(
-        self, local_path: str, **kwargs: Any
-    ) -> Union[None, DocumentConverterResult]:
-        # Bail if not ipynb
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() != ".ipynb":
-            return None
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
 
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        for prefix in CANDIDATE_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                # Read further to see if it's a notebook
+                cur_pos = file_stream.tell()
+                try:
+                    encoding = stream_info.charset or "utf-8"
+                    notebook_content = file_stream.read().decode(encoding)
+                    return (
+                        "nbformat" in notebook_content
+                        and "nbformat_minor" in notebook_content
+                    )
+                finally:
+                    file_stream.seek(cur_pos)
+
+        return False
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
         # Parse and convert the notebook
         result = None
-        with open(local_path, "rt", encoding="utf-8") as fh:
-            notebook_content = json.load(fh)
-            result = self._convert(notebook_content)
 
-        return result
+        encoding = stream_info.charset or "utf-8"
+        notebook_content = file_stream.read().decode(encoding=encoding)
+        return self._convert(json.loads(notebook_content))
 
-    def _convert(self, notebook_content: dict) -> Union[None, DocumentConverterResult]:
+    def _convert(self, notebook_content: dict) -> DocumentConverterResult:
         """Helper function that converts notebook JSON content to Markdown."""
         try:
             md_output = []
diff --git a/packages/markitdown/src/markitdown/converters/_llm_caption.py b/packages/markitdown/src/markitdown/converters/_llm_caption.py
index 44756e0..b851dc8 100644
--- a/packages/markitdown/src/markitdown/converters/_llm_caption.py
+++ b/packages/markitdown/src/markitdown/converters/_llm_caption.py
@@ -13,7 +13,7 @@ def llm_caption(
     # Get the content type
     content_type = stream_info.mimetype
     if not content_type:
-        content_type, _ = mimetypes.guess_type("_dummy" + stream_info.extension)
+        content_type, _ = mimetypes.guess_type("_dummy" + (stream_info.extension or ""))
     if not content_type:
         content_type = "application/octet-stream"
 
diff --git a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py
index 3da5fbd..cef3dc7 100644
--- a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py
@@ -87,9 +87,11 @@ class OutlookMsgConverter(DocumentConverter):
                     extension=".msg",
                     feature="outlook",
                 )
-            ) from _dependency_exc_info[1].with_traceback(
+            ) from _dependency_exc_info[
+                1
+            ].with_traceback(  # type: ignore[union-attr]
                 _dependency_exc_info[2]
-            )  # Restore the original traceback
+            )  # Restore the original traceback
 
         msg = olefile.OleFileIO(file_stream)
         # Extract email metadata
diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
index 48fc499..445dba3 100644
--- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@@ -1,7 +1,9 @@
 import sys
+import io
 
 from typing import BinaryIO, Any
 
+
 from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
@@ -69,10 +71,13 @@ class PdfConverter(DocumentConverter):
                     extension=".pdf",
                     feature="pdf",
                 )
-            ) from _dependency_exc_info[1].with_traceback(
+            ) from _dependency_exc_info[
+                1
+            ].with_traceback(  # type: ignore[union-attr]
                 _dependency_exc_info[2]
-            )  # Restore the original traceback
+            )  # Restore the original traceback
 
+        assert isinstance(file_stream, io.IOBase)  # for mypy; NOTE(review): rejects non-io.IOBase streams
         return DocumentConverterResult(
             markdown=pdfminer.high_level.extract_text(file_stream),
         )
diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
index 2cbd158..e51739e 100644
--- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
@@ -73,9 +73,11 @@ class PptxConverter(DocumentConverter):
                     extension=".pptx",
                     feature="pptx",
                 )
-            ) from _dependency_exc_info[1].with_traceback(
+            ) from _dependency_exc_info[
+                1
+            ].with_traceback(  # type: ignore[union-attr]
                 _dependency_exc_info[2]
-            )  # Restore the original traceback
+            )  # Restore the original traceback
 
         # Perform the conversion
         presentation = pptx.Presentation(file_stream)
diff --git a/packages/markitdown/src/markitdown/converters/_transcribe_audio.py b/packages/markitdown/src/markitdown/converters/_transcribe_audio.py
index cd212ba..3d02173 100644
--- a/packages/markitdown/src/markitdown/converters/_transcribe_audio.py
+++ b/packages/markitdown/src/markitdown/converters/_transcribe_audio.py
@@ -19,7 +19,11 @@ def transcribe_audio(file_stream: BinaryIO, *, audio_format: str = "wav") -> str
     if _dependency_exc_info is not None:
         raise MissingDependencyException(
             "Speech transcription requires installing MarkItdown with the [audio-transcription] optional dependencies. E.g., `pip install markitdown[audio-transcription]` or `pip install markitdown[all]`"
-        ) from _dependency_exc_info[1].with_traceback(_dependency_exc_info[2])
+        ) from _dependency_exc_info[
+            1
+        ].with_traceback(  # type: ignore[union-attr]
+            _dependency_exc_info[2]
+        )
 
     if audio_format in ["wav", "aiff", "flac"]:
         audio_source = file_stream
diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
index e306b48..f11af31 100644
--- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@@ -76,9 +76,11 @@ class XlsxConverter(DocumentConverter):
                     extension=".xlsx",
                     feature="xlsx",
                 )
-            ) from _xlsx_dependency_exc_info[1].with_traceback(
+            ) from _xlsx_dependency_exc_info[
+                1
+            ].with_traceback(  # type: ignore[union-attr]
                 _xlsx_dependency_exc_info[2]
-            )  # Restore the original traceback
+            )  # Restore the original traceback
 
         sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
         md_content = ""
@@ -136,9 +138,11 @@ class XlsConverter(DocumentConverter):
                     extension=".xls",
                     feature="xls",
                 )
-            ) from _xls_dependency_exc_info[1].with_traceback(
+            ) from _xls_dependency_exc_info[
+                1
+            ].with_traceback(  # type: ignore[union-attr]
                 _xls_dependency_exc_info[2]
-            )  # Restore the original traceback
+            )  # Restore the original traceback
 
         sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd")
         md_content = ""
diff --git a/packages/markitdown/src/markitdown/converters/_zip_converter.py b/packages/markitdown/src/markitdown/converters/_zip_converter.py
index 7afe7ba..c60d94a 100644
--- a/packages/markitdown/src/markitdown/converters/_zip_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_zip_converter.py
@@ -92,7 +92,7 @@ class ZipConverter(DocumentConverter):
         stream_info: StreamInfo,
         **kwargs: Any,  # Options to pass to the converter
     ) -> DocumentConverterResult:
-        file_path = stream_info.url or stream_info.local_path or stream_info.file_name
+        file_path = stream_info.url or stream_info.local_path or stream_info.filename
         md_content = f"Content from the zip file `{file_path}`:\n\n"
 
         with zipfile.ZipFile(file_stream, "r") as zipObj:
diff --git a/packages/markitdown/tests/test_cli.py b/packages/markitdown/tests/test_cli.py
index 1e2b095..7c8afc2 100644
--- a/packages/markitdown/tests/test_cli.py
+++ b/packages/markitdown/tests/test_cli.py
@@ -7,7 +7,7 @@ from markitdown import __version__
 try:
     from .test_markitdown import TEST_FILES_DIR, DOCX_TEST_STRINGS
 except ImportError:
-    from test_markitdown import TEST_FILES_DIR, DOCX_TEST_STRINGS
+    from test_markitdown import TEST_FILES_DIR, DOCX_TEST_STRINGS  # type: ignore
 
 
 @pytest.fixture(scope="session")