Fixed many mypy errors.

2025-03-05 16:41:15 -08:00 · 2025-03-05 16:41:15 -08:00 · 84f8198d8a
commit 84f8198d8a
parent aa94bce6d9
15 changed files with 115 additions and 55 deletions
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -50,10 +50,10 @@ from ._exceptions import (
 # Override mimetype for csv to fix issue on windows
 mimetypes.add_type("text/csv", ".csv")

-_plugins: Union[None | List[Any]] = None
+_plugins: List[Any] = []


-def _load_plugins() -> Union[None | List[Any]]:
+def _load_plugins() -> List[Any]:
    """Lazy load plugins, exiting early if already loaded."""
    global _plugins

@@ -94,10 +94,10 @@ class MarkItDown:
            self._requests_session = requests_session

        # TODO - remove these (see enable_builtins)
-        self._llm_client = None
-        self._llm_model = None
-        self._exiftool_path = None
-        self._style_map = None
+        self._llm_client: Any = None
+        self._llm_model: Union[str | None] = None
+        self._exiftool_path: Union[str | None] = None
+        self._style_map: Union[str | None] = None

        # Register the converters
        self._converters: List[DocumentConverter] = []
@@ -272,12 +272,20 @@ class MarkItDown:
        # Do we have anything on which to base a guess?
        base_guess = None
        if stream_info is not None or file_extension is not None or url is not None:
-            base_guess = stream_info if stream_info is not None else StreamInfo()
+            # Start with a non-Null base guess
+            if stream_info is None:
+                base_guess = StreamInfo()
+            else:
+                base_guess = stream_info
+
            if file_extension is not None:
                # Deprecated -- use stream_info
+                assert base_guess is not None  # for mypy
                base_guess = base_guess.copy_and_update(extension=file_extension)
+
            if url is not None:
                # Deprecated -- use stream_info
+                assert base_guess is not None  # for mypy
                base_guess = base_guess.copy_and_update(url=url)

        # Append the base guess, if it's non-trivial
@@ -498,6 +506,6 @@ class MarkItDown:
        )
        self.register_converter(converter)

-    def register_converter(self, converter: Union[DocumentConverter]) -> None:
+    def register_converter(self, converter: DocumentConverter) -> None:
        """Register a page text converter."""
        self._converters.insert(0, converter)
--- a/packages/markitdown/src/markitdown/_stream_info.py
+++ b/packages/markitdown/src/markitdown/_stream_info.py
@@ -76,7 +76,7 @@ def _guess_stream_info_from_stream(

    def _puremagic(
        file_stream, filename_hint
-    ) -> puremagic.main.PureMagicWithConfidence:
+    ) -> List[puremagic.main.PureMagicWithConfidence]:
        """Wrap guesses to handle exceptions."""
        try:
            return puremagic.magic_stream(file_stream, filename=filename_hint)
--- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
@@ -81,9 +81,11 @@ class DocumentIntelligenceConverter(DocumentConverter):
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                "DocumentIntelligenceConverter requires the optional dependency [az-doc-intel] (or [all]) to be installed. E.g., `pip install markitdown[az-doc-intel]`"
-            ) from _dependency_exc_info[1].with_traceback(
+            ) from _dependency_exc_info[
+                1
+            ].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
-            )  # Restore the original traceback
+            )

        self.endpoint = endpoint
        self.api_version = api_version
--- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@@ -67,9 +67,11 @@ class DocxConverter(HtmlConverter):
                    extension=".docx",
                    feature="docx",
                )
-            ) from _dependency_exc_info[1].with_traceback(
+            ) from _dependency_exc_info[
+                1
+            ].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
-            )  # Restore the original traceback
+            )

        style_map = kwargs.get("style_map", None)
        return self._html_converter.convert_string(
--- a/packages/markitdown/src/markitdown/converters/_exiftool.py
+++ b/packages/markitdown/src/markitdown/converters/_exiftool.py
@@ -5,12 +5,12 @@ import sys
 import shutil
 import os
 import warnings
-from typing import BinaryIO, Literal, Optional
+from typing import BinaryIO, Optional, Any


 def exiftool_metadata(
    file_stream: BinaryIO, *, exiftool_path: Optional[str] = None
-) -> dict[str, Literal]:
+) -> Any:  # Need a better type for json data
    # Check if we have a valid pointer to exiftool
    if not exiftool_path:
        which_exiftool = shutil.which("exiftool")
--- a/packages/markitdown/src/markitdown/converters/_image_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_image_converter.py
@@ -4,7 +4,6 @@ import mimetypes
 from ._exiftool import exiftool_metadata
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
-from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

 ACCEPTED_MIME_TYPE_PREFIXES = [
    "image/jpeg",
@@ -75,18 +74,17 @@ class ImageConverter(DocumentConverter):
        llm_client = kwargs.get("llm_client")
        llm_model = kwargs.get("llm_model")
        if llm_client is not None and llm_model is not None:
-            md_content += (
-                "\n# Description:\n"
-                + self._get_llm_description(
+            llm_description = self._get_llm_description(
                file_stream,
                stream_info,
                client=llm_client,
                model=llm_model,
                prompt=kwargs.get("llm_prompt"),
-                ).strip()
-                + "\n"
            )

+            if llm_description is not None:
+                md_content += "\n# Description:\n" + llm_description.strip() + "\n"
+
        return DocumentConverterResult(
            markdown=md_content,
        )
@@ -106,7 +104,9 @@ class ImageConverter(DocumentConverter):
        # Get the content type
        content_type = stream_info.mimetype
        if not content_type:
-            content_type, _ = mimetypes.guess_type("_dummy" + stream_info.extension)
+            content_type, _ = mimetypes.guess_type(
+                "_dummy" + (stream_info.extension or "")
+            )
        if not content_type:
            content_type = "application/octet-stream"

--- a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py
@@ -1,9 +1,15 @@
+from typing import BinaryIO, Any
 import json
-from typing import Any, Union

 from .._base_converter import DocumentConverter, DocumentConverterResult
-
 from .._exceptions import FileConversionException
+from .._stream_info import StreamInfo
+
+CANDIDATE_MIME_TYPE_PREFIXES = [
+    "application/json",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [".ipynb"]


 class IpynbConverter(DocumentConverter):
@@ -14,23 +20,48 @@ class IpynbConverter(DocumentConverter):
    ):
        super().__init__(priority=priority)

-    def convert(
-        self, local_path: str, **kwargs: Any
-    ) -> Union[None, DocumentConverterResult]:
-        # Bail if not ipynb
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() != ".ipynb":
-            return None
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()

+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        for prefix in CANDIDATE_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                # Read further to see if it's a notebook
+                cur_pos = file_stream.tell()
+                try:
+                    encoding = stream_info.charset or "utf-8"
+                    notebook_content = file_stream.read().decode(encoding)
+                    return (
+                        "nbformat" in notebook_content
+                        and "nbformat_minor" in notebook_content
+                    )
+                finally:
+                    file_stream.seek(cur_pos)
+
+        return False
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
        # Parse and convert the notebook
        result = None
-        with open(local_path, "rt", encoding="utf-8") as fh:
-            notebook_content = json.load(fh)
-            result = self._convert(notebook_content)

-        return result
+        encoding = stream_info.charset or "utf-8"
+        notebook_content = file_stream.read().decode(encoding=encoding)
+        return self._convert(json.loads(notebook_content))

-    def _convert(self, notebook_content: dict) -> Union[None, DocumentConverterResult]:
+    def _convert(self, notebook_content: dict) -> DocumentConverterResult:
        """Helper function that converts notebook JSON content to Markdown."""
        try:
            md_output = []
--- a/packages/markitdown/src/markitdown/converters/_llm_caption.py
+++ b/packages/markitdown/src/markitdown/converters/_llm_caption.py
@@ -13,7 +13,7 @@ def llm_caption(
    # Get the content type
    content_type = stream_info.mimetype
    if not content_type:
-        content_type, _ = mimetypes.guess_type("_dummy" + stream_info.extension)
+        content_type, _ = mimetypes.guess_type("_dummy" + (stream_info.extension or ""))
    if not content_type:
        content_type = "application/octet-stream"

--- a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py
@@ -87,9 +87,11 @@ class OutlookMsgConverter(DocumentConverter):
                    extension=".msg",
                    feature="outlook",
                )
-            ) from _dependency_exc_info[1].with_traceback(
+            ) from _dependency_exc_info[
+                1
+            ].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
-            )  # Restore the original traceback
+            )

        msg = olefile.OleFileIO(file_stream)
        # Extract email metadata
--- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@@ -1,7 +1,9 @@
 import sys
+import io

 from typing import BinaryIO, Any

+
 from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
@@ -69,10 +71,13 @@ class PdfConverter(DocumentConverter):
                    extension=".pdf",
                    feature="pdf",
                )
-            ) from _dependency_exc_info[1].with_traceback(
+            ) from _dependency_exc_info[
+                1
+            ].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
-            )  # Restore the original traceback
+            )

+        assert isinstance(file_stream, io.IOBase)  # for mypy
        return DocumentConverterResult(
            markdown=pdfminer.high_level.extract_text(file_stream),
        )
--- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
@@ -73,9 +73,11 @@ class PptxConverter(DocumentConverter):
                    extension=".pptx",
                    feature="pptx",
                )
-            ) from _dependency_exc_info[1].with_traceback(
+            ) from _dependency_exc_info[
+                1
+            ].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
-            )  # Restore the original traceback
+            )

        # Perform the conversion
        presentation = pptx.Presentation(file_stream)
--- a/packages/markitdown/src/markitdown/converters/_transcribe_audio.py
+++ b/packages/markitdown/src/markitdown/converters/_transcribe_audio.py
@@ -19,7 +19,11 @@ def transcribe_audio(file_stream: BinaryIO, *, audio_format: str = "wav") -> str
    if _dependency_exc_info is not None:
        raise MissingDependencyException(
            "Speech transcription requires installing MarkItdown with the [audio-transcription] optional dependencies. E.g., `pip install markitdown[audio-transcription]` or `pip install markitdown[all]`"
-        ) from _dependency_exc_info[1].with_traceback(_dependency_exc_info[2])
+        ) from _dependency_exc_info[
+            1
+        ].with_traceback(  # type: ignore[union-attr]
+            _dependency_exc_info[2]
+        )

    if audio_format in ["wav", "aiff", "flac"]:
        audio_source = file_stream
--- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@@ -76,9 +76,11 @@ class XlsxConverter(DocumentConverter):
                    extension=".xlsx",
                    feature="xlsx",
                )
-            ) from _xlsx_dependency_exc_info[1].with_traceback(
+            ) from _xlsx_dependency_exc_info[
+                1
+            ].with_traceback(  # type: ignore[union-attr]
                _xlsx_dependency_exc_info[2]
-            )  # Restore the original traceback
+            )

        sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
        md_content = ""
@@ -136,9 +138,11 @@ class XlsConverter(DocumentConverter):
                    extension=".xls",
                    feature="xls",
                )
-            ) from _xls_dependency_exc_info[1].with_traceback(
+            ) from _xls_dependency_exc_info[
+                1
+            ].with_traceback(  # type: ignore[union-attr]
                _xls_dependency_exc_info[2]
-            )  # Restore the original traceback
+            )

        sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd")
        md_content = ""
--- a/packages/markitdown/src/markitdown/converters/_zip_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_zip_converter.py
@@ -92,7 +92,7 @@ class ZipConverter(DocumentConverter):
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
-        file_path = stream_info.url or stream_info.local_path or stream_info.file_name
+        file_path = stream_info.url or stream_info.local_path or stream_info.filename
        md_content = f"Content from the zip file `{file_path}`:\n\n"

        with zipfile.ZipFile(file_stream, "r") as zipObj:
--- a/packages/markitdown/tests/test_cli.py
+++ b/packages/markitdown/tests/test_cli.py
@@ -7,7 +7,7 @@ from markitdown import __version__
 try:
    from .test_markitdown import TEST_FILES_DIR, DOCX_TEST_STRINGS
 except ImportError:
-    from test_markitdown import TEST_FILES_DIR, DOCX_TEST_STRINGS
+    from test_markitdown import TEST_FILES_DIR, DOCX_TEST_STRINGS  # type: ignore


@pytest.fixture(scope="session")