From 84f8198d8a5871cf692ce22c98a646a1085ea622 Mon Sep 17 00:00:00 2001 From: Adam Fourney Date: Wed, 5 Mar 2025 16:41:15 -0800 Subject: [PATCH] Fixed many mypy errors. --- .../markitdown/src/markitdown/_markitdown.py | 24 +++++--- .../markitdown/src/markitdown/_stream_info.py | 2 +- .../converters/_doc_intel_converter.py | 6 +- .../markitdown/converters/_docx_converter.py | 6 +- .../src/markitdown/converters/_exiftool.py | 4 +- .../markitdown/converters/_image_converter.py | 24 ++++---- .../markitdown/converters/_ipynb_converter.py | 59 ++++++++++++++----- .../src/markitdown/converters/_llm_caption.py | 2 +- .../converters/_outlook_msg_converter.py | 6 +- .../markitdown/converters/_pdf_converter.py | 9 ++- .../markitdown/converters/_pptx_converter.py | 6 +- .../converters/_transcribe_audio.py | 6 +- .../markitdown/converters/_xlsx_converter.py | 12 ++-- .../markitdown/converters/_zip_converter.py | 2 +- packages/markitdown/tests/test_cli.py | 2 +- 15 files changed, 115 insertions(+), 55 deletions(-) diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index 104db11..a51f227 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -50,10 +50,10 @@ from ._exceptions import ( # Override mimetype for csv to fix issue on windows mimetypes.add_type("text/csv", ".csv") -_plugins: Union[None | List[Any]] = None +_plugins: List[Any] = [] -def _load_plugins() -> Union[None | List[Any]]: +def _load_plugins() -> List[Any]: """Lazy load plugins, exiting early if already loaded.""" global _plugins @@ -94,10 +94,10 @@ class MarkItDown: self._requests_session = requests_session # TODO - remove these (see enable_builtins) - self._llm_client = None - self._llm_model = None - self._exiftool_path = None - self._style_map = None + self._llm_client: Any = None + self._llm_model: Union[str | None] = None + self._exiftool_path: Union[str | None] = None + self._style_map: Union[str | None] = None # Register the converters self._converters: List[DocumentConverter] = [] @@ -272,12 +272,20 @@ class MarkItDown: # Do we have anything on which to base a guess? base_guess = None if stream_info is not None or file_extension is not None or url is not None: - base_guess = stream_info if stream_info is not None else StreamInfo() + # Start with a non-Null base guess + if stream_info is None: + base_guess = StreamInfo() + else: + base_guess = stream_info + if file_extension is not None: # Deprecated -- use stream_info + assert base_guess is not None # for mypy base_guess = base_guess.copy_and_update(extension=file_extension) + if url is not None: # Deprecated -- use stream_info + assert base_guess is not None # for mypy base_guess = base_guess.copy_and_update(url=url) # Append the base guess, if it's non-trivial @@ -498,6 +506,6 @@ class MarkItDown: ) self.register_converter(converter) - def register_converter(self, converter: Union[DocumentConverter]) -> None: + def register_converter(self, converter: DocumentConverter) -> None: """Register a page text converter.""" self._converters.insert(0, converter) diff --git a/packages/markitdown/src/markitdown/_stream_info.py b/packages/markitdown/src/markitdown/_stream_info.py index 32a51ef..1eaa4d2 100644 --- a/packages/markitdown/src/markitdown/_stream_info.py +++ b/packages/markitdown/src/markitdown/_stream_info.py @@ -76,7 +76,7 @@ def _guess_stream_info_from_stream( def _puremagic( file_stream, filename_hint - ) -> puremagic.main.PureMagicWithConfidence: + ) -> List[puremagic.main.PureMagicWithConfidence]: """Wrap guesses to handle exceptions.""" try: return puremagic.magic_stream(file_stream, filename=filename_hint) diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py index a71ceae..00ab0fc 100644 --- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py +++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py @@ -81,9 +81,11 @@ class DocumentIntelligenceConverter(DocumentConverter): if _dependency_exc_info is not None: raise MissingDependencyException( "DocumentIntelligenceConverter requires the optional dependency [az-doc-intel] (or [all]) to be installed. E.g., `pip install markitdown[az-doc-intel]`" - ) from _dependency_exc_info[1].with_traceback( + ) from _dependency_exc_info[ + 1 + ].with_traceback( # type: ignore[union-attr] _dependency_exc_info[2] - ) # Restore the original traceback + ) self.endpoint = endpoint self.api_version = api_version diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py index c2c643b..a5090ac 100644 --- a/packages/markitdown/src/markitdown/converters/_docx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py @@ -67,9 +67,11 @@ class DocxConverter(HtmlConverter): extension=".docx", feature="docx", ) - ) from _dependency_exc_info[1].with_traceback( + ) from _dependency_exc_info[ + 1 + ].with_traceback( # type: ignore[union-attr] _dependency_exc_info[2] - ) # Restore the original traceback + ) style_map = kwargs.get("style_map", None) return self._html_converter.convert_string( diff --git a/packages/markitdown/src/markitdown/converters/_exiftool.py b/packages/markitdown/src/markitdown/converters/_exiftool.py index b492801..5a316f0 100644 --- a/packages/markitdown/src/markitdown/converters/_exiftool.py +++ b/packages/markitdown/src/markitdown/converters/_exiftool.py @@ -5,12 +5,12 @@ import sys import shutil import os import warnings -from typing import BinaryIO, Literal, Optional +from typing import BinaryIO, Optional, Any def exiftool_metadata( file_stream: BinaryIO, *, exiftool_path: Optional[str] = None -) -> dict[str, Literal]: +) -> Any: # Need a better type for json data # Check if we have a valid pointer to exiftool if not exiftool_path: which_exiftool = shutil.which("exiftool") diff --git a/packages/markitdown/src/markitdown/converters/_image_converter.py b/packages/markitdown/src/markitdown/converters/_image_converter.py index d0d7e66..e03dfe8 100644 --- a/packages/markitdown/src/markitdown/converters/_image_converter.py +++ b/packages/markitdown/src/markitdown/converters/_image_converter.py @@ -4,7 +4,6 @@ import mimetypes from ._exiftool import exiftool_metadata from .._base_converter import DocumentConverter, DocumentConverterResult from .._stream_info import StreamInfo -from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE ACCEPTED_MIME_TYPE_PREFIXES = [ "image/jpeg", @@ -75,18 +74,17 @@ class ImageConverter(DocumentConverter): llm_client = kwargs.get("llm_client") llm_model = kwargs.get("llm_model") if llm_client is not None and llm_model is not None: - md_content += ( - "\n# Description:\n" - + self._get_llm_description( - file_stream, - stream_info, - client=llm_client, - model=llm_model, - prompt=kwargs.get("llm_prompt"), - ).strip() - + "\n" + llm_description = self._get_llm_description( + file_stream, + stream_info, + client=llm_client, + model=llm_model, + prompt=kwargs.get("llm_prompt"), ) + if llm_description is not None: + md_content += "\n# Description:\n" + llm_description.strip() + "\n" + return DocumentConverterResult( markdown=md_content, ) @@ -106,7 +104,9 @@ class ImageConverter(DocumentConverter): # Get the content type content_type = stream_info.mimetype if not content_type: - content_type, _ = mimetypes.guess_type("_dummy" + stream_info.extension) + content_type, _ = mimetypes.guess_type( + "_dummy" + (stream_info.extension or "") + ) if not content_type: content_type = "application/octet-stream" diff --git a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py index 2c5cb3f..490e4e1 100644 --- a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py +++ b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py @@ -1,9 +1,15 @@ +from typing import BinaryIO, Any import json -from typing import Any, Union from .._base_converter import DocumentConverter, DocumentConverterResult - from .._exceptions import FileConversionException +from .._stream_info import StreamInfo + +CANDIDATE_MIME_TYPE_PREFIXES = [ + "application/json", +] + +ACCEPTED_FILE_EXTENSIONS = [".ipynb"] class IpynbConverter(DocumentConverter): @@ -14,23 +20,48 @@ class IpynbConverter(DocumentConverter): ): super().__init__(priority=priority) - def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - # Bail if not ipynb - extension = kwargs.get("file_extension", "") - if extension.lower() != ".ipynb": - return None + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + if extension in ACCEPTED_FILE_EXTENSIONS: + return True + + for prefix in CANDIDATE_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + # Read further to see if it's a notebook + cur_pos = file_stream.tell() + try: + encoding = stream_info.charset or "utf-8" + notebook_content = file_stream.read().decode(encoding) + return ( + "nbformat" in notebook_content + and "nbformat_minor" in notebook_content + ) + finally: + file_stream.seek(cur_pos) + + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: # Parse and convert the notebook result = None - with open(local_path, "rt", encoding="utf-8") as fh: - notebook_content = json.load(fh) - result = self._convert(notebook_content) - return result + encoding = stream_info.charset or "utf-8" + notebook_content = file_stream.read().decode(encoding=encoding) + return self._convert(json.loads(notebook_content)) - def _convert(self, notebook_content: dict) -> Union[None, DocumentConverterResult]: + def _convert(self, notebook_content: dict) -> DocumentConverterResult: """Helper function that converts notebook JSON content to Markdown.""" try: md_output = [] diff --git a/packages/markitdown/src/markitdown/converters/_llm_caption.py b/packages/markitdown/src/markitdown/converters/_llm_caption.py index 44756e0..b851dc8 100644 --- a/packages/markitdown/src/markitdown/converters/_llm_caption.py +++ b/packages/markitdown/src/markitdown/converters/_llm_caption.py @@ -13,7 +13,7 @@ def llm_caption( # Get the content type content_type = stream_info.mimetype if not content_type: - content_type, _ = mimetypes.guess_type("_dummy" + stream_info.extension) + content_type, _ = mimetypes.guess_type("_dummy" + (stream_info.extension or "")) if not content_type: content_type = "application/octet-stream" diff --git a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py index 3da5fbd..cef3dc7 100644 --- a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py +++ b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py @@ -87,9 +87,11 @@ class OutlookMsgConverter(DocumentConverter): extension=".msg", feature="outlook", ) - ) from _dependency_exc_info[1].with_traceback( + ) from _dependency_exc_info[ + 1 + ].with_traceback( # type: ignore[union-attr] _dependency_exc_info[2] - ) # Restore the original traceback + ) msg = olefile.OleFileIO(file_stream) # Extract email metadata diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py index 48fc499..445dba3 100644 --- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py @@ -1,7 +1,9 @@ import sys +import io from typing import BinaryIO, Any + from ._html_converter import HtmlConverter from .._base_converter import DocumentConverter, DocumentConverterResult from .._stream_info import StreamInfo @@ -69,10 +71,13 @@ class PdfConverter(DocumentConverter): extension=".pdf", feature="pdf", ) - ) from _dependency_exc_info[1].with_traceback( + ) from _dependency_exc_info[ + 1 + ].with_traceback( # type: ignore[union-attr] _dependency_exc_info[2] - ) # Restore the original traceback + ) + assert isinstance(file_stream, io.IOBase) # for mypy return DocumentConverterResult( markdown=pdfminer.high_level.extract_text(file_stream), ) diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py index 2cbd158..e51739e 100644 --- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py @@ -73,9 +73,11 @@ class PptxConverter(DocumentConverter): extension=".pptx", feature="pptx", ) - ) from _dependency_exc_info[1].with_traceback( + ) from _dependency_exc_info[ + 1 + ].with_traceback( # type: ignore[union-attr] _dependency_exc_info[2] - ) # Restore the original traceback + ) # Perform the conversion presentation = pptx.Presentation(file_stream) diff --git a/packages/markitdown/src/markitdown/converters/_transcribe_audio.py b/packages/markitdown/src/markitdown/converters/_transcribe_audio.py index cd212ba..3d02173 100644 --- a/packages/markitdown/src/markitdown/converters/_transcribe_audio.py +++ b/packages/markitdown/src/markitdown/converters/_transcribe_audio.py @@ -19,7 +19,11 @@ def transcribe_audio(file_stream: BinaryIO, *, audio_format: str = "wav") -> str if _dependency_exc_info is not None: raise MissingDependencyException( "Speech transcription requires installing MarkItdown with the [audio-transcription] optional dependencies. E.g., `pip install markitdown[audio-transcription]` or `pip install markitdown[all]`" - ) from _dependency_exc_info[1].with_traceback(_dependency_exc_info[2]) + ) from _dependency_exc_info[ + 1 + ].with_traceback( # type: ignore[union-attr] + _dependency_exc_info[2] + ) if audio_format in ["wav", "aiff", "flac"]: audio_source = file_stream diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py index e306b48..f11af31 100644 --- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py @@ -76,9 +76,11 @@ class XlsxConverter(DocumentConverter): extension=".xlsx", feature="xlsx", ) - ) from _xlsx_dependency_exc_info[1].with_traceback( + ) from _xlsx_dependency_exc_info[ + 1 + ].with_traceback( # type: ignore[union-attr] _xlsx_dependency_exc_info[2] - ) # Restore the original traceback + ) sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl") md_content = "" @@ -136,9 +138,11 @@ class XlsConverter(DocumentConverter): extension=".xls", feature="xls", ) - ) from _xls_dependency_exc_info[1].with_traceback( + ) from _xls_dependency_exc_info[ + 1 + ].with_traceback( # type: ignore[union-attr] _xls_dependency_exc_info[2] - ) # Restore the original traceback + ) sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd") md_content = "" diff --git a/packages/markitdown/src/markitdown/converters/_zip_converter.py b/packages/markitdown/src/markitdown/converters/_zip_converter.py index 7afe7ba..c60d94a 100644 --- a/packages/markitdown/src/markitdown/converters/_zip_converter.py +++ b/packages/markitdown/src/markitdown/converters/_zip_converter.py @@ -92,7 +92,7 @@ class ZipConverter(DocumentConverter): stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: - file_path = stream_info.url or stream_info.local_path or stream_info.file_name + file_path = stream_info.url or stream_info.local_path or stream_info.filename md_content = f"Content from the zip file `{file_path}`:\n\n" with zipfile.ZipFile(file_stream, "r") as zipObj: diff --git a/packages/markitdown/tests/test_cli.py b/packages/markitdown/tests/test_cli.py index 1e2b095..7c8afc2 100644 --- a/packages/markitdown/tests/test_cli.py +++ b/packages/markitdown/tests/test_cli.py @@ -7,7 +7,7 @@ from markitdown import __version__ try: from .test_markitdown import TEST_FILES_DIR, DOCX_TEST_STRINGS except ImportError: - from test_markitdown import TEST_FILES_DIR, DOCX_TEST_STRINGS + from test_markitdown import TEST_FILES_DIR, DOCX_TEST_STRINGS # type: ignore @pytest.fixture(scope="session")