Fixed many mypy errors.

This commit is contained in:
Adam Fourney 2025-03-05 16:41:15 -08:00
parent aa94bce6d9
commit 84f8198d8a
15 changed files with 115 additions and 55 deletions

View file

@ -50,10 +50,10 @@ from ._exceptions import (
# Override mimetype for csv to fix issue on windows
mimetypes.add_type("text/csv", ".csv")
_plugins: Union[None | List[Any]] = None
_plugins: List[Any] = []
def _load_plugins() -> Union[None | List[Any]]:
def _load_plugins() -> List[Any]:
"""Lazy load plugins, exiting early if already loaded."""
global _plugins
@ -94,10 +94,10 @@ class MarkItDown:
self._requests_session = requests_session
# TODO - remove these (see enable_builtins)
self._llm_client = None
self._llm_model = None
self._exiftool_path = None
self._style_map = None
self._llm_client: Any = None
self._llm_model: Union[str | None] = None
self._exiftool_path: Union[str | None] = None
self._style_map: Union[str | None] = None
# Register the converters
self._converters: List[DocumentConverter] = []
@ -272,12 +272,20 @@ class MarkItDown:
# Do we have anything on which to base a guess?
base_guess = None
if stream_info is not None or file_extension is not None or url is not None:
base_guess = stream_info if stream_info is not None else StreamInfo()
# Start with a non-Null base guess
if stream_info is None:
base_guess = StreamInfo()
else:
base_guess = stream_info
if file_extension is not None:
# Deprecated -- use stream_info
assert base_guess is not None # for mypy
base_guess = base_guess.copy_and_update(extension=file_extension)
if url is not None:
# Deprecated -- use stream_info
assert base_guess is not None # for mypy
base_guess = base_guess.copy_and_update(url=url)
# Append the base guess, if it's non-trivial
@ -498,6 +506,6 @@ class MarkItDown:
)
self.register_converter(converter)
def register_converter(self, converter: Union[DocumentConverter]) -> None:
def register_converter(self, converter: DocumentConverter) -> None:
"""Register a page text converter."""
self._converters.insert(0, converter)

View file

@ -76,7 +76,7 @@ def _guess_stream_info_from_stream(
def _puremagic(
file_stream, filename_hint
) -> puremagic.main.PureMagicWithConfidence:
) -> List[puremagic.main.PureMagicWithConfidence]:
"""Wrap guesses to handle exceptions."""
try:
return puremagic.magic_stream(file_stream, filename=filename_hint)

View file

@ -81,9 +81,11 @@ class DocumentIntelligenceConverter(DocumentConverter):
if _dependency_exc_info is not None:
raise MissingDependencyException(
"DocumentIntelligenceConverter requires the optional dependency [az-doc-intel] (or [all]) to be installed. E.g., `pip install markitdown[az-doc-intel]`"
) from _dependency_exc_info[1].with_traceback(
) from _dependency_exc_info[
1
].with_traceback( # type: ignore[union-attr]
_dependency_exc_info[2]
) # Restore the original traceback
)
self.endpoint = endpoint
self.api_version = api_version

View file

@ -67,9 +67,11 @@ class DocxConverter(HtmlConverter):
extension=".docx",
feature="docx",
)
) from _dependency_exc_info[1].with_traceback(
) from _dependency_exc_info[
1
].with_traceback( # type: ignore[union-attr]
_dependency_exc_info[2]
) # Restore the original traceback
)
style_map = kwargs.get("style_map", None)
return self._html_converter.convert_string(

View file

@ -5,12 +5,12 @@ import sys
import shutil
import os
import warnings
from typing import BinaryIO, Literal, Optional
from typing import BinaryIO, Optional, Any
def exiftool_metadata(
file_stream: BinaryIO, *, exiftool_path: Optional[str] = None
) -> dict[str, Literal]:
) -> Any: # Need a better type for json data
# Check if we have a valid pointer to exiftool
if not exiftool_path:
which_exiftool = shutil.which("exiftool")

View file

@ -4,7 +4,6 @@ import mimetypes
from ._exiftool import exiftool_metadata
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
ACCEPTED_MIME_TYPE_PREFIXES = [
"image/jpeg",
@ -75,18 +74,17 @@ class ImageConverter(DocumentConverter):
llm_client = kwargs.get("llm_client")
llm_model = kwargs.get("llm_model")
if llm_client is not None and llm_model is not None:
md_content += (
"\n# Description:\n"
+ self._get_llm_description(
file_stream,
stream_info,
client=llm_client,
model=llm_model,
prompt=kwargs.get("llm_prompt"),
).strip()
+ "\n"
llm_description = self._get_llm_description(
file_stream,
stream_info,
client=llm_client,
model=llm_model,
prompt=kwargs.get("llm_prompt"),
)
if llm_description is not None:
md_content += "\n# Description:\n" + llm_description.strip() + "\n"
return DocumentConverterResult(
markdown=md_content,
)
@ -106,7 +104,9 @@ class ImageConverter(DocumentConverter):
# Get the content type
content_type = stream_info.mimetype
if not content_type:
content_type, _ = mimetypes.guess_type("_dummy" + stream_info.extension)
content_type, _ = mimetypes.guess_type(
"_dummy" + (stream_info.extension or "")
)
if not content_type:
content_type = "application/octet-stream"

View file

@ -1,9 +1,15 @@
from typing import BinaryIO, Any
import json
from typing import Any, Union
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._exceptions import FileConversionException
from .._stream_info import StreamInfo
CANDIDATE_MIME_TYPE_PREFIXES = [
"application/json",
]
ACCEPTED_FILE_EXTENSIONS = [".ipynb"]
class IpynbConverter(DocumentConverter):
@ -14,23 +20,48 @@ class IpynbConverter(DocumentConverter):
):
super().__init__(priority=priority)
def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
# Bail if not ipynb
extension = kwargs.get("file_extension", "")
if extension.lower() != ".ipynb":
return None
def accepts(
    self,
    file_stream: BinaryIO,
    stream_info: StreamInfo,
    **kwargs: Any,  # Options to pass to the converter
) -> bool:
    """Return True if the stream looks like a Jupyter notebook.

    A stream is accepted outright when its extension is listed in
    ACCEPTED_FILE_EXTENSIONS. Otherwise, if its mimetype starts with one of
    CANDIDATE_MIME_TYPE_PREFIXES, the stream content is read and accepted
    only if it mentions both "nbformat" and "nbformat_minor".

    The stream position is always restored before returning.

    Args:
        file_stream: Binary stream to inspect.
        stream_info: Metadata for the stream; `mimetype`, `extension` and
            `charset` are consulted (each may be None).
        **kwargs: Options to pass to the converter (unused here).

    Returns:
        True if the converter should handle this stream, False otherwise.
    """
    mimetype = (stream_info.mimetype or "").lower()
    extension = (stream_info.extension or "").lower()

    if extension in ACCEPTED_FILE_EXTENSIONS:
        return True

    # str.startswith accepts a tuple, so one call covers every candidate prefix.
    if not mimetype.startswith(tuple(CANDIDATE_MIME_TYPE_PREFIXES)):
        return False

    # Read further to see if it's a notebook
    cur_pos = file_stream.tell()
    try:
        encoding = stream_info.charset or "utf-8"
        notebook_content = file_stream.read().decode(encoding)
    except (UnicodeDecodeError, LookupError):
        # Content that cannot be decoded (bad bytes or unknown charset name)
        # cannot be confirmed as a notebook; decline instead of raising from
        # an accept check.
        return False
    finally:
        # Leave the stream where we found it.
        file_stream.seek(cur_pos)

    return "nbformat" in notebook_content and "nbformat_minor" in notebook_content
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
# Parse and convert the notebook
result = None
with open(local_path, "rt", encoding="utf-8") as fh:
notebook_content = json.load(fh)
result = self._convert(notebook_content)
return result
encoding = stream_info.charset or "utf-8"
notebook_content = file_stream.read().decode(encoding=encoding)
return self._convert(json.loads(notebook_content))
def _convert(self, notebook_content: dict) -> Union[None, DocumentConverterResult]:
def _convert(self, notebook_content: dict) -> DocumentConverterResult:
"""Helper function that converts notebook JSON content to Markdown."""
try:
md_output = []

View file

@ -13,7 +13,7 @@ def llm_caption(
# Get the content type
content_type = stream_info.mimetype
if not content_type:
content_type, _ = mimetypes.guess_type("_dummy" + stream_info.extension)
content_type, _ = mimetypes.guess_type("_dummy" + (stream_info.extension or ""))
if not content_type:
content_type = "application/octet-stream"

View file

@ -87,9 +87,11 @@ class OutlookMsgConverter(DocumentConverter):
extension=".msg",
feature="outlook",
)
) from _dependency_exc_info[1].with_traceback(
) from _dependency_exc_info[
1
].with_traceback( # type: ignore[union-attr]
_dependency_exc_info[2]
) # Restore the original traceback
)
msg = olefile.OleFileIO(file_stream)
# Extract email metadata

View file

@ -1,7 +1,9 @@
import sys
import io
from typing import BinaryIO, Any
from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
@ -69,10 +71,13 @@ class PdfConverter(DocumentConverter):
extension=".pdf",
feature="pdf",
)
) from _dependency_exc_info[1].with_traceback(
) from _dependency_exc_info[
1
].with_traceback( # type: ignore[union-attr]
_dependency_exc_info[2]
) # Restore the original traceback
)
assert isinstance(file_stream, io.IOBase) # for mypy
return DocumentConverterResult(
markdown=pdfminer.high_level.extract_text(file_stream),
)

View file

@ -73,9 +73,11 @@ class PptxConverter(DocumentConverter):
extension=".pptx",
feature="pptx",
)
) from _dependency_exc_info[1].with_traceback(
) from _dependency_exc_info[
1
].with_traceback( # type: ignore[union-attr]
_dependency_exc_info[2]
) # Restore the original traceback
)
# Perform the conversion
presentation = pptx.Presentation(file_stream)

View file

@ -19,7 +19,11 @@ def transcribe_audio(file_stream: BinaryIO, *, audio_format: str = "wav") -> str
if _dependency_exc_info is not None:
raise MissingDependencyException(
"Speech transcription requires installing MarkItdown with the [audio-transcription] optional dependencies. E.g., `pip install markitdown[audio-transcription]` or `pip install markitdown[all]`"
) from _dependency_exc_info[1].with_traceback(_dependency_exc_info[2])
) from _dependency_exc_info[
1
].with_traceback( # type: ignore[union-attr]
_dependency_exc_info[2]
)
if audio_format in ["wav", "aiff", "flac"]:
audio_source = file_stream

View file

@ -76,9 +76,11 @@ class XlsxConverter(DocumentConverter):
extension=".xlsx",
feature="xlsx",
)
) from _xlsx_dependency_exc_info[1].with_traceback(
) from _xlsx_dependency_exc_info[
1
].with_traceback( # type: ignore[union-attr]
_xlsx_dependency_exc_info[2]
) # Restore the original traceback
)
sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
md_content = ""
@ -136,9 +138,11 @@ class XlsConverter(DocumentConverter):
extension=".xls",
feature="xls",
)
) from _xls_dependency_exc_info[1].with_traceback(
) from _xls_dependency_exc_info[
1
].with_traceback( # type: ignore[union-attr]
_xls_dependency_exc_info[2]
) # Restore the original traceback
)
sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd")
md_content = ""

View file

@ -92,7 +92,7 @@ class ZipConverter(DocumentConverter):
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
file_path = stream_info.url or stream_info.local_path or stream_info.file_name
file_path = stream_info.url or stream_info.local_path or stream_info.filename
md_content = f"Content from the zip file `{file_path}`:\n\n"
with zipfile.ZipFile(file_stream, "r") as zipObj:

View file

@ -7,7 +7,7 @@ from markitdown import __version__
try:
from .test_markitdown import TEST_FILES_DIR, DOCX_TEST_STRINGS
except ImportError:
from test_markitdown import TEST_FILES_DIR, DOCX_TEST_STRINGS
from test_markitdown import TEST_FILES_DIR, DOCX_TEST_STRINGS # type: ignore
@pytest.fixture(scope="session")