Fixed many mypy errors.
This commit is contained in:
parent
aa94bce6d9
commit
84f8198d8a
15 changed files with 115 additions and 55 deletions
|
|
@ -50,10 +50,10 @@ from ._exceptions import (
|
|||
# Override mimetype for csv to fix issue on windows
|
||||
mimetypes.add_type("text/csv", ".csv")
|
||||
|
||||
_plugins: Union[None | List[Any]] = None
|
||||
_plugins: List[Any] = []
|
||||
|
||||
|
||||
def _load_plugins() -> Union[None | List[Any]]:
|
||||
def _load_plugins() -> List[Any]:
|
||||
"""Lazy load plugins, exiting early if already loaded."""
|
||||
global _plugins
|
||||
|
||||
|
|
@ -94,10 +94,10 @@ class MarkItDown:
|
|||
self._requests_session = requests_session
|
||||
|
||||
# TODO - remove these (see enable_builtins)
|
||||
self._llm_client = None
|
||||
self._llm_model = None
|
||||
self._exiftool_path = None
|
||||
self._style_map = None
|
||||
self._llm_client: Any = None
|
||||
self._llm_model: Union[str | None] = None
|
||||
self._exiftool_path: Union[str | None] = None
|
||||
self._style_map: Union[str | None] = None
|
||||
|
||||
# Register the converters
|
||||
self._converters: List[DocumentConverter] = []
|
||||
|
|
@ -272,12 +272,20 @@ class MarkItDown:
|
|||
# Do we have anything on which to base a guess?
|
||||
base_guess = None
|
||||
if stream_info is not None or file_extension is not None or url is not None:
|
||||
base_guess = stream_info if stream_info is not None else StreamInfo()
|
||||
# Start with a non-None base guess
|
||||
if stream_info is None:
|
||||
base_guess = StreamInfo()
|
||||
else:
|
||||
base_guess = stream_info
|
||||
|
||||
if file_extension is not None:
|
||||
# Deprecated -- use stream_info
|
||||
assert base_guess is not None # for mypy
|
||||
base_guess = base_guess.copy_and_update(extension=file_extension)
|
||||
|
||||
if url is not None:
|
||||
# Deprecated -- use stream_info
|
||||
assert base_guess is not None # for mypy
|
||||
base_guess = base_guess.copy_and_update(url=url)
|
||||
|
||||
# Append the base guess, if it's non-trivial
|
||||
|
|
@ -498,6 +506,6 @@ class MarkItDown:
|
|||
)
|
||||
self.register_converter(converter)
|
||||
|
||||
def register_converter(self, converter: Union[DocumentConverter]) -> None:
|
||||
def register_converter(self, converter: DocumentConverter) -> None:
|
||||
"""Register a page text converter."""
|
||||
self._converters.insert(0, converter)
|
||||
|
|
|
|||
|
|
@ -76,7 +76,7 @@ def _guess_stream_info_from_stream(
|
|||
|
||||
def _puremagic(
|
||||
file_stream, filename_hint
|
||||
) -> puremagic.main.PureMagicWithConfidence:
|
||||
) -> List[puremagic.main.PureMagicWithConfidence]:
|
||||
"""Wrap guesses to handle exceptions."""
|
||||
try:
|
||||
return puremagic.magic_stream(file_stream, filename=filename_hint)
|
||||
|
|
|
|||
|
|
@ -81,9 +81,11 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
|||
if _dependency_exc_info is not None:
|
||||
raise MissingDependencyException(
|
||||
"DocumentIntelligenceConverter requires the optional dependency [az-doc-intel] (or [all]) to be installed. E.g., `pip install markitdown[az-doc-intel]`"
|
||||
) from _dependency_exc_info[1].with_traceback(
|
||||
) from _dependency_exc_info[
|
||||
1
|
||||
].with_traceback( # type: ignore[union-attr]
|
||||
_dependency_exc_info[2]
|
||||
) # Restore the original traceback
|
||||
)
|
||||
|
||||
self.endpoint = endpoint
|
||||
self.api_version = api_version
|
||||
|
|
|
|||
|
|
@ -67,9 +67,11 @@ class DocxConverter(HtmlConverter):
|
|||
extension=".docx",
|
||||
feature="docx",
|
||||
)
|
||||
) from _dependency_exc_info[1].with_traceback(
|
||||
) from _dependency_exc_info[
|
||||
1
|
||||
].with_traceback( # type: ignore[union-attr]
|
||||
_dependency_exc_info[2]
|
||||
) # Restore the original traceback
|
||||
)
|
||||
|
||||
style_map = kwargs.get("style_map", None)
|
||||
return self._html_converter.convert_string(
|
||||
|
|
|
|||
|
|
@ -5,12 +5,12 @@ import sys
|
|||
import shutil
|
||||
import os
|
||||
import warnings
|
||||
from typing import BinaryIO, Literal, Optional
|
||||
from typing import BinaryIO, Optional, Any
|
||||
|
||||
|
||||
def exiftool_metadata(
|
||||
file_stream: BinaryIO, *, exiftool_path: Optional[str] = None
|
||||
) -> dict[str, Literal]:
|
||||
) -> Any: # Need a better type for json data
|
||||
# Check if we have a valid pointer to exiftool
|
||||
if not exiftool_path:
|
||||
which_exiftool = shutil.which("exiftool")
|
||||
|
|
|
|||
|
|
@ -4,7 +4,6 @@ import mimetypes
|
|||
from ._exiftool import exiftool_metadata
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
|
||||
ACCEPTED_MIME_TYPE_PREFIXES = [
|
||||
"image/jpeg",
|
||||
|
|
@ -75,18 +74,17 @@ class ImageConverter(DocumentConverter):
|
|||
llm_client = kwargs.get("llm_client")
|
||||
llm_model = kwargs.get("llm_model")
|
||||
if llm_client is not None and llm_model is not None:
|
||||
md_content += (
|
||||
"\n# Description:\n"
|
||||
+ self._get_llm_description(
|
||||
llm_description = self._get_llm_description(
|
||||
file_stream,
|
||||
stream_info,
|
||||
client=llm_client,
|
||||
model=llm_model,
|
||||
prompt=kwargs.get("llm_prompt"),
|
||||
).strip()
|
||||
+ "\n"
|
||||
)
|
||||
|
||||
if llm_description is not None:
|
||||
md_content += "\n# Description:\n" + llm_description.strip() + "\n"
|
||||
|
||||
return DocumentConverterResult(
|
||||
markdown=md_content,
|
||||
)
|
||||
|
|
@ -106,7 +104,9 @@ class ImageConverter(DocumentConverter):
|
|||
# Get the content type
|
||||
content_type = stream_info.mimetype
|
||||
if not content_type:
|
||||
content_type, _ = mimetypes.guess_type("_dummy" + stream_info.extension)
|
||||
content_type, _ = mimetypes.guess_type(
|
||||
"_dummy" + (stream_info.extension or "")
|
||||
)
|
||||
if not content_type:
|
||||
content_type = "application/octet-stream"
|
||||
|
||||
|
|
|
|||
|
|
@ -1,9 +1,15 @@
|
|||
from typing import BinaryIO, Any
|
||||
import json
|
||||
from typing import Any, Union
|
||||
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
|
||||
from .._exceptions import FileConversionException
|
||||
from .._stream_info import StreamInfo
|
||||
|
||||
CANDIDATE_MIME_TYPE_PREFIXES = [
|
||||
"application/json",
|
||||
]
|
||||
|
||||
ACCEPTED_FILE_EXTENSIONS = [".ipynb"]
|
||||
|
||||
|
||||
class IpynbConverter(DocumentConverter):
|
||||
|
|
@ -14,23 +20,48 @@ class IpynbConverter(DocumentConverter):
|
|||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def convert(
|
||||
self, local_path: str, **kwargs: Any
|
||||
) -> Union[None, DocumentConverterResult]:
|
||||
# Bail if not ipynb
|
||||
extension = kwargs.get("file_extension", "")
|
||||
if extension.lower() != ".ipynb":
|
||||
return None
|
||||
def accepts(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
stream_info: StreamInfo,
|
||||
**kwargs: Any, # Options to pass to the converter
|
||||
) -> bool:
|
||||
mimetype = (stream_info.mimetype or "").lower()
|
||||
extension = (stream_info.extension or "").lower()
|
||||
|
||||
if extension in ACCEPTED_FILE_EXTENSIONS:
|
||||
return True
|
||||
|
||||
for prefix in CANDIDATE_MIME_TYPE_PREFIXES:
|
||||
if mimetype.startswith(prefix):
|
||||
# Read further to see if it's a notebook
|
||||
cur_pos = file_stream.tell()
|
||||
try:
|
||||
encoding = stream_info.charset or "utf-8"
|
||||
notebook_content = file_stream.read().decode(encoding)
|
||||
return (
|
||||
"nbformat" in notebook_content
|
||||
and "nbformat_minor" in notebook_content
|
||||
)
|
||||
finally:
|
||||
file_stream.seek(cur_pos)
|
||||
|
||||
return False
|
||||
|
||||
def convert(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
stream_info: StreamInfo,
|
||||
**kwargs: Any, # Options to pass to the converter
|
||||
) -> DocumentConverterResult:
|
||||
# Parse and convert the notebook
|
||||
result = None
|
||||
with open(local_path, "rt", encoding="utf-8") as fh:
|
||||
notebook_content = json.load(fh)
|
||||
result = self._convert(notebook_content)
|
||||
|
||||
return result
|
||||
encoding = stream_info.charset or "utf-8"
|
||||
notebook_content = file_stream.read().decode(encoding=encoding)
|
||||
return self._convert(json.loads(notebook_content))
|
||||
|
||||
def _convert(self, notebook_content: dict) -> Union[None, DocumentConverterResult]:
|
||||
def _convert(self, notebook_content: dict) -> DocumentConverterResult:
|
||||
"""Helper function that converts notebook JSON content to Markdown."""
|
||||
try:
|
||||
md_output = []
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@ def llm_caption(
|
|||
# Get the content type
|
||||
content_type = stream_info.mimetype
|
||||
if not content_type:
|
||||
content_type, _ = mimetypes.guess_type("_dummy" + stream_info.extension)
|
||||
content_type, _ = mimetypes.guess_type("_dummy" + (stream_info.extension or ""))
|
||||
if not content_type:
|
||||
content_type = "application/octet-stream"
|
||||
|
||||
|
|
|
|||
|
|
@ -87,9 +87,11 @@ class OutlookMsgConverter(DocumentConverter):
|
|||
extension=".msg",
|
||||
feature="outlook",
|
||||
)
|
||||
) from _dependency_exc_info[1].with_traceback(
|
||||
) from _dependency_exc_info[
|
||||
1
|
||||
].with_traceback( # type: ignore[union-attr]
|
||||
_dependency_exc_info[2]
|
||||
) # Restore the original traceback
|
||||
)
|
||||
|
||||
msg = olefile.OleFileIO(file_stream)
|
||||
# Extract email metadata
|
||||
|
|
|
|||
|
|
@ -1,7 +1,9 @@
|
|||
import sys
|
||||
import io
|
||||
|
||||
from typing import BinaryIO, Any
|
||||
|
||||
|
||||
from ._html_converter import HtmlConverter
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
|
|
@ -69,10 +71,13 @@ class PdfConverter(DocumentConverter):
|
|||
extension=".pdf",
|
||||
feature="pdf",
|
||||
)
|
||||
) from _dependency_exc_info[1].with_traceback(
|
||||
) from _dependency_exc_info[
|
||||
1
|
||||
].with_traceback( # type: ignore[union-attr]
|
||||
_dependency_exc_info[2]
|
||||
) # Restore the original traceback
|
||||
)
|
||||
|
||||
assert isinstance(file_stream, io.IOBase) # for mypy
|
||||
return DocumentConverterResult(
|
||||
markdown=pdfminer.high_level.extract_text(file_stream),
|
||||
)
|
||||
|
|
|
|||
|
|
@ -73,9 +73,11 @@ class PptxConverter(DocumentConverter):
|
|||
extension=".pptx",
|
||||
feature="pptx",
|
||||
)
|
||||
) from _dependency_exc_info[1].with_traceback(
|
||||
) from _dependency_exc_info[
|
||||
1
|
||||
].with_traceback( # type: ignore[union-attr]
|
||||
_dependency_exc_info[2]
|
||||
) # Restore the original traceback
|
||||
)
|
||||
|
||||
# Perform the conversion
|
||||
presentation = pptx.Presentation(file_stream)
|
||||
|
|
|
|||
|
|
@ -19,7 +19,11 @@ def transcribe_audio(file_stream: BinaryIO, *, audio_format: str = "wav") -> str
|
|||
if _dependency_exc_info is not None:
|
||||
raise MissingDependencyException(
|
||||
"Speech transcription requires installing MarkItdown with the [audio-transcription] optional dependencies. E.g., `pip install markitdown[audio-transcription]` or `pip install markitdown[all]`"
|
||||
) from _dependency_exc_info[1].with_traceback(_dependency_exc_info[2])
|
||||
) from _dependency_exc_info[
|
||||
1
|
||||
].with_traceback( # type: ignore[union-attr]
|
||||
_dependency_exc_info[2]
|
||||
)
|
||||
|
||||
if audio_format in ["wav", "aiff", "flac"]:
|
||||
audio_source = file_stream
|
||||
|
|
|
|||
|
|
@ -76,9 +76,11 @@ class XlsxConverter(DocumentConverter):
|
|||
extension=".xlsx",
|
||||
feature="xlsx",
|
||||
)
|
||||
) from _xlsx_dependency_exc_info[1].with_traceback(
|
||||
) from _xlsx_dependency_exc_info[
|
||||
1
|
||||
].with_traceback( # type: ignore[union-attr]
|
||||
_xlsx_dependency_exc_info[2]
|
||||
) # Restore the original traceback
|
||||
)
|
||||
|
||||
sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
|
||||
md_content = ""
|
||||
|
|
@ -136,9 +138,11 @@ class XlsConverter(DocumentConverter):
|
|||
extension=".xls",
|
||||
feature="xls",
|
||||
)
|
||||
) from _xls_dependency_exc_info[1].with_traceback(
|
||||
) from _xls_dependency_exc_info[
|
||||
1
|
||||
].with_traceback( # type: ignore[union-attr]
|
||||
_xls_dependency_exc_info[2]
|
||||
) # Restore the original traceback
|
||||
)
|
||||
|
||||
sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd")
|
||||
md_content = ""
|
||||
|
|
|
|||
|
|
@ -92,7 +92,7 @@ class ZipConverter(DocumentConverter):
|
|||
stream_info: StreamInfo,
|
||||
**kwargs: Any, # Options to pass to the converter
|
||||
) -> DocumentConverterResult:
|
||||
file_path = stream_info.url or stream_info.local_path or stream_info.file_name
|
||||
file_path = stream_info.url or stream_info.local_path or stream_info.filename
|
||||
md_content = f"Content from the zip file `{file_path}`:\n\n"
|
||||
|
||||
with zipfile.ZipFile(file_stream, "r") as zipObj:
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ from markitdown import __version__
|
|||
try:
|
||||
from .test_markitdown import TEST_FILES_DIR, DOCX_TEST_STRINGS
|
||||
except ImportError:
|
||||
from test_markitdown import TEST_FILES_DIR, DOCX_TEST_STRINGS
|
||||
from test_markitdown import TEST_FILES_DIR, DOCX_TEST_STRINGS # type: ignore
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
|
|
|
|||
Loading…
Reference in a new issue