Fixed many mypy errors.
This commit is contained in:
parent
aa94bce6d9
commit
84f8198d8a
15 changed files with 115 additions and 55 deletions
|
|
@ -50,10 +50,10 @@ from ._exceptions import (
|
||||||
# Override mimetype for csv to fix issue on windows
|
# Override mimetype for csv to fix issue on windows
|
||||||
mimetypes.add_type("text/csv", ".csv")
|
mimetypes.add_type("text/csv", ".csv")
|
||||||
|
|
||||||
_plugins: Union[None | List[Any]] = None
|
_plugins: List[Any] = []
|
||||||
|
|
||||||
|
|
||||||
def _load_plugins() -> Union[None | List[Any]]:
|
def _load_plugins() -> List[Any]:
|
||||||
"""Lazy load plugins, exiting early if already loaded."""
|
"""Lazy load plugins, exiting early if already loaded."""
|
||||||
global _plugins
|
global _plugins
|
||||||
|
|
||||||
|
|
@ -94,10 +94,10 @@ class MarkItDown:
|
||||||
self._requests_session = requests_session
|
self._requests_session = requests_session
|
||||||
|
|
||||||
# TODO - remove these (see enable_builtins)
|
# TODO - remove these (see enable_builtins)
|
||||||
self._llm_client = None
|
self._llm_client: Any = None
|
||||||
self._llm_model = None
|
self._llm_model: Union[str | None] = None
|
||||||
self._exiftool_path = None
|
self._exiftool_path: Union[str | None] = None
|
||||||
self._style_map = None
|
self._style_map: Union[str | None] = None
|
||||||
|
|
||||||
# Register the converters
|
# Register the converters
|
||||||
self._converters: List[DocumentConverter] = []
|
self._converters: List[DocumentConverter] = []
|
||||||
|
|
@ -272,12 +272,20 @@ class MarkItDown:
|
||||||
# Do we have anything on which to base a guess?
|
# Do we have anything on which to base a guess?
|
||||||
base_guess = None
|
base_guess = None
|
||||||
if stream_info is not None or file_extension is not None or url is not None:
|
if stream_info is not None or file_extension is not None or url is not None:
|
||||||
base_guess = stream_info if stream_info is not None else StreamInfo()
|
# Start with a non-Null base guess
|
||||||
|
if stream_info is None:
|
||||||
|
base_guess = StreamInfo()
|
||||||
|
else:
|
||||||
|
base_guess = stream_info
|
||||||
|
|
||||||
if file_extension is not None:
|
if file_extension is not None:
|
||||||
# Deprecated -- use stream_info
|
# Deprecated -- use stream_info
|
||||||
|
assert base_guess is not None # for mypy
|
||||||
base_guess = base_guess.copy_and_update(extension=file_extension)
|
base_guess = base_guess.copy_and_update(extension=file_extension)
|
||||||
|
|
||||||
if url is not None:
|
if url is not None:
|
||||||
# Deprecated -- use stream_info
|
# Deprecated -- use stream_info
|
||||||
|
assert base_guess is not None # for mypy
|
||||||
base_guess = base_guess.copy_and_update(url=url)
|
base_guess = base_guess.copy_and_update(url=url)
|
||||||
|
|
||||||
# Append the base guess, if it's non-trivial
|
# Append the base guess, if it's non-trivial
|
||||||
|
|
@ -498,6 +506,6 @@ class MarkItDown:
|
||||||
)
|
)
|
||||||
self.register_converter(converter)
|
self.register_converter(converter)
|
||||||
|
|
||||||
def register_converter(self, converter: Union[DocumentConverter]) -> None:
|
def register_converter(self, converter: DocumentConverter) -> None:
|
||||||
"""Register a page text converter."""
|
"""Register a page text converter."""
|
||||||
self._converters.insert(0, converter)
|
self._converters.insert(0, converter)
|
||||||
|
|
|
||||||
|
|
@ -76,7 +76,7 @@ def _guess_stream_info_from_stream(
|
||||||
|
|
||||||
def _puremagic(
|
def _puremagic(
|
||||||
file_stream, filename_hint
|
file_stream, filename_hint
|
||||||
) -> puremagic.main.PureMagicWithConfidence:
|
) -> List[puremagic.main.PureMagicWithConfidence]:
|
||||||
"""Wrap guesses to handle exceptions."""
|
"""Wrap guesses to handle exceptions."""
|
||||||
try:
|
try:
|
||||||
return puremagic.magic_stream(file_stream, filename=filename_hint)
|
return puremagic.magic_stream(file_stream, filename=filename_hint)
|
||||||
|
|
|
||||||
|
|
@ -81,9 +81,11 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
||||||
if _dependency_exc_info is not None:
|
if _dependency_exc_info is not None:
|
||||||
raise MissingDependencyException(
|
raise MissingDependencyException(
|
||||||
"DocumentIntelligenceConverter requires the optional dependency [az-doc-intel] (or [all]) to be installed. E.g., `pip install markitdown[az-doc-intel]`"
|
"DocumentIntelligenceConverter requires the optional dependency [az-doc-intel] (or [all]) to be installed. E.g., `pip install markitdown[az-doc-intel]`"
|
||||||
) from _dependency_exc_info[1].with_traceback(
|
) from _dependency_exc_info[
|
||||||
|
1
|
||||||
|
].with_traceback( # type: ignore[union-attr]
|
||||||
_dependency_exc_info[2]
|
_dependency_exc_info[2]
|
||||||
) # Restore the original traceback
|
)
|
||||||
|
|
||||||
self.endpoint = endpoint
|
self.endpoint = endpoint
|
||||||
self.api_version = api_version
|
self.api_version = api_version
|
||||||
|
|
|
||||||
|
|
@ -67,9 +67,11 @@ class DocxConverter(HtmlConverter):
|
||||||
extension=".docx",
|
extension=".docx",
|
||||||
feature="docx",
|
feature="docx",
|
||||||
)
|
)
|
||||||
) from _dependency_exc_info[1].with_traceback(
|
) from _dependency_exc_info[
|
||||||
|
1
|
||||||
|
].with_traceback( # type: ignore[union-attr]
|
||||||
_dependency_exc_info[2]
|
_dependency_exc_info[2]
|
||||||
) # Restore the original traceback
|
)
|
||||||
|
|
||||||
style_map = kwargs.get("style_map", None)
|
style_map = kwargs.get("style_map", None)
|
||||||
return self._html_converter.convert_string(
|
return self._html_converter.convert_string(
|
||||||
|
|
|
||||||
|
|
@ -5,12 +5,12 @@ import sys
|
||||||
import shutil
|
import shutil
|
||||||
import os
|
import os
|
||||||
import warnings
|
import warnings
|
||||||
from typing import BinaryIO, Literal, Optional
|
from typing import BinaryIO, Optional, Any
|
||||||
|
|
||||||
|
|
||||||
def exiftool_metadata(
|
def exiftool_metadata(
|
||||||
file_stream: BinaryIO, *, exiftool_path: Optional[str] = None
|
file_stream: BinaryIO, *, exiftool_path: Optional[str] = None
|
||||||
) -> dict[str, Literal]:
|
) -> Any: # Need a better type for json data
|
||||||
# Check if we have a valid pointer to exiftool
|
# Check if we have a valid pointer to exiftool
|
||||||
if not exiftool_path:
|
if not exiftool_path:
|
||||||
which_exiftool = shutil.which("exiftool")
|
which_exiftool = shutil.which("exiftool")
|
||||||
|
|
|
||||||
|
|
@ -4,7 +4,6 @@ import mimetypes
|
||||||
from ._exiftool import exiftool_metadata
|
from ._exiftool import exiftool_metadata
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._stream_info import StreamInfo
|
from .._stream_info import StreamInfo
|
||||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
|
||||||
|
|
||||||
ACCEPTED_MIME_TYPE_PREFIXES = [
|
ACCEPTED_MIME_TYPE_PREFIXES = [
|
||||||
"image/jpeg",
|
"image/jpeg",
|
||||||
|
|
@ -75,18 +74,17 @@ class ImageConverter(DocumentConverter):
|
||||||
llm_client = kwargs.get("llm_client")
|
llm_client = kwargs.get("llm_client")
|
||||||
llm_model = kwargs.get("llm_model")
|
llm_model = kwargs.get("llm_model")
|
||||||
if llm_client is not None and llm_model is not None:
|
if llm_client is not None and llm_model is not None:
|
||||||
md_content += (
|
llm_description = self._get_llm_description(
|
||||||
"\n# Description:\n"
|
|
||||||
+ self._get_llm_description(
|
|
||||||
file_stream,
|
file_stream,
|
||||||
stream_info,
|
stream_info,
|
||||||
client=llm_client,
|
client=llm_client,
|
||||||
model=llm_model,
|
model=llm_model,
|
||||||
prompt=kwargs.get("llm_prompt"),
|
prompt=kwargs.get("llm_prompt"),
|
||||||
).strip()
|
|
||||||
+ "\n"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if llm_description is not None:
|
||||||
|
md_content += "\n# Description:\n" + llm_description.strip() + "\n"
|
||||||
|
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
markdown=md_content,
|
markdown=md_content,
|
||||||
)
|
)
|
||||||
|
|
@ -106,7 +104,9 @@ class ImageConverter(DocumentConverter):
|
||||||
# Get the content type
|
# Get the content type
|
||||||
content_type = stream_info.mimetype
|
content_type = stream_info.mimetype
|
||||||
if not content_type:
|
if not content_type:
|
||||||
content_type, _ = mimetypes.guess_type("_dummy" + stream_info.extension)
|
content_type, _ = mimetypes.guess_type(
|
||||||
|
"_dummy" + (stream_info.extension or "")
|
||||||
|
)
|
||||||
if not content_type:
|
if not content_type:
|
||||||
content_type = "application/octet-stream"
|
content_type = "application/octet-stream"
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,15 @@
|
||||||
|
from typing import BinaryIO, Any
|
||||||
import json
|
import json
|
||||||
from typing import Any, Union
|
|
||||||
|
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
|
|
||||||
from .._exceptions import FileConversionException
|
from .._exceptions import FileConversionException
|
||||||
|
from .._stream_info import StreamInfo
|
||||||
|
|
||||||
|
CANDIDATE_MIME_TYPE_PREFIXES = [
|
||||||
|
"application/json",
|
||||||
|
]
|
||||||
|
|
||||||
|
ACCEPTED_FILE_EXTENSIONS = [".ipynb"]
|
||||||
|
|
||||||
|
|
||||||
class IpynbConverter(DocumentConverter):
|
class IpynbConverter(DocumentConverter):
|
||||||
|
|
@ -14,23 +20,48 @@ class IpynbConverter(DocumentConverter):
|
||||||
):
|
):
|
||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(
|
def accepts(
|
||||||
self, local_path: str, **kwargs: Any
|
self,
|
||||||
) -> Union[None, DocumentConverterResult]:
|
file_stream: BinaryIO,
|
||||||
# Bail if not ipynb
|
stream_info: StreamInfo,
|
||||||
extension = kwargs.get("file_extension", "")
|
**kwargs: Any, # Options to pass to the converter
|
||||||
if extension.lower() != ".ipynb":
|
) -> bool:
|
||||||
return None
|
mimetype = (stream_info.mimetype or "").lower()
|
||||||
|
extension = (stream_info.extension or "").lower()
|
||||||
|
|
||||||
|
if extension in ACCEPTED_FILE_EXTENSIONS:
|
||||||
|
return True
|
||||||
|
|
||||||
|
for prefix in CANDIDATE_MIME_TYPE_PREFIXES:
|
||||||
|
if mimetype.startswith(prefix):
|
||||||
|
# Read further to see if it's a notebook
|
||||||
|
cur_pos = file_stream.tell()
|
||||||
|
try:
|
||||||
|
encoding = stream_info.charset or "utf-8"
|
||||||
|
notebook_content = file_stream.read().decode(encoding)
|
||||||
|
return (
|
||||||
|
"nbformat" in notebook_content
|
||||||
|
and "nbformat_minor" in notebook_content
|
||||||
|
)
|
||||||
|
finally:
|
||||||
|
file_stream.seek(cur_pos)
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
def convert(
|
||||||
|
self,
|
||||||
|
file_stream: BinaryIO,
|
||||||
|
stream_info: StreamInfo,
|
||||||
|
**kwargs: Any, # Options to pass to the converter
|
||||||
|
) -> DocumentConverterResult:
|
||||||
# Parse and convert the notebook
|
# Parse and convert the notebook
|
||||||
result = None
|
result = None
|
||||||
with open(local_path, "rt", encoding="utf-8") as fh:
|
|
||||||
notebook_content = json.load(fh)
|
|
||||||
result = self._convert(notebook_content)
|
|
||||||
|
|
||||||
return result
|
encoding = stream_info.charset or "utf-8"
|
||||||
|
notebook_content = file_stream.read().decode(encoding=encoding)
|
||||||
|
return self._convert(json.loads(notebook_content))
|
||||||
|
|
||||||
def _convert(self, notebook_content: dict) -> Union[None, DocumentConverterResult]:
|
def _convert(self, notebook_content: dict) -> DocumentConverterResult:
|
||||||
"""Helper function that converts notebook JSON content to Markdown."""
|
"""Helper function that converts notebook JSON content to Markdown."""
|
||||||
try:
|
try:
|
||||||
md_output = []
|
md_output = []
|
||||||
|
|
|
||||||
|
|
@ -13,7 +13,7 @@ def llm_caption(
|
||||||
# Get the content type
|
# Get the content type
|
||||||
content_type = stream_info.mimetype
|
content_type = stream_info.mimetype
|
||||||
if not content_type:
|
if not content_type:
|
||||||
content_type, _ = mimetypes.guess_type("_dummy" + stream_info.extension)
|
content_type, _ = mimetypes.guess_type("_dummy" + (stream_info.extension or ""))
|
||||||
if not content_type:
|
if not content_type:
|
||||||
content_type = "application/octet-stream"
|
content_type = "application/octet-stream"
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -87,9 +87,11 @@ class OutlookMsgConverter(DocumentConverter):
|
||||||
extension=".msg",
|
extension=".msg",
|
||||||
feature="outlook",
|
feature="outlook",
|
||||||
)
|
)
|
||||||
) from _dependency_exc_info[1].with_traceback(
|
) from _dependency_exc_info[
|
||||||
|
1
|
||||||
|
].with_traceback( # type: ignore[union-attr]
|
||||||
_dependency_exc_info[2]
|
_dependency_exc_info[2]
|
||||||
) # Restore the original traceback
|
)
|
||||||
|
|
||||||
msg = olefile.OleFileIO(file_stream)
|
msg = olefile.OleFileIO(file_stream)
|
||||||
# Extract email metadata
|
# Extract email metadata
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,9 @@
|
||||||
import sys
|
import sys
|
||||||
|
import io
|
||||||
|
|
||||||
from typing import BinaryIO, Any
|
from typing import BinaryIO, Any
|
||||||
|
|
||||||
|
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._stream_info import StreamInfo
|
from .._stream_info import StreamInfo
|
||||||
|
|
@ -69,10 +71,13 @@ class PdfConverter(DocumentConverter):
|
||||||
extension=".pdf",
|
extension=".pdf",
|
||||||
feature="pdf",
|
feature="pdf",
|
||||||
)
|
)
|
||||||
) from _dependency_exc_info[1].with_traceback(
|
) from _dependency_exc_info[
|
||||||
|
1
|
||||||
|
].with_traceback( # type: ignore[union-attr]
|
||||||
_dependency_exc_info[2]
|
_dependency_exc_info[2]
|
||||||
) # Restore the original traceback
|
)
|
||||||
|
|
||||||
|
assert isinstance(file_stream, io.IOBase) # for mypy
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
markdown=pdfminer.high_level.extract_text(file_stream),
|
markdown=pdfminer.high_level.extract_text(file_stream),
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -73,9 +73,11 @@ class PptxConverter(DocumentConverter):
|
||||||
extension=".pptx",
|
extension=".pptx",
|
||||||
feature="pptx",
|
feature="pptx",
|
||||||
)
|
)
|
||||||
) from _dependency_exc_info[1].with_traceback(
|
) from _dependency_exc_info[
|
||||||
|
1
|
||||||
|
].with_traceback( # type: ignore[union-attr]
|
||||||
_dependency_exc_info[2]
|
_dependency_exc_info[2]
|
||||||
) # Restore the original traceback
|
)
|
||||||
|
|
||||||
# Perform the conversion
|
# Perform the conversion
|
||||||
presentation = pptx.Presentation(file_stream)
|
presentation = pptx.Presentation(file_stream)
|
||||||
|
|
|
||||||
|
|
@ -19,7 +19,11 @@ def transcribe_audio(file_stream: BinaryIO, *, audio_format: str = "wav") -> str
|
||||||
if _dependency_exc_info is not None:
|
if _dependency_exc_info is not None:
|
||||||
raise MissingDependencyException(
|
raise MissingDependencyException(
|
||||||
"Speech transcription requires installing MarkItdown with the [audio-transcription] optional dependencies. E.g., `pip install markitdown[audio-transcription]` or `pip install markitdown[all]`"
|
"Speech transcription requires installing MarkItdown with the [audio-transcription] optional dependencies. E.g., `pip install markitdown[audio-transcription]` or `pip install markitdown[all]`"
|
||||||
) from _dependency_exc_info[1].with_traceback(_dependency_exc_info[2])
|
) from _dependency_exc_info[
|
||||||
|
1
|
||||||
|
].with_traceback( # type: ignore[union-attr]
|
||||||
|
_dependency_exc_info[2]
|
||||||
|
)
|
||||||
|
|
||||||
if audio_format in ["wav", "aiff", "flac"]:
|
if audio_format in ["wav", "aiff", "flac"]:
|
||||||
audio_source = file_stream
|
audio_source = file_stream
|
||||||
|
|
|
||||||
|
|
@ -76,9 +76,11 @@ class XlsxConverter(DocumentConverter):
|
||||||
extension=".xlsx",
|
extension=".xlsx",
|
||||||
feature="xlsx",
|
feature="xlsx",
|
||||||
)
|
)
|
||||||
) from _xlsx_dependency_exc_info[1].with_traceback(
|
) from _xlsx_dependency_exc_info[
|
||||||
|
1
|
||||||
|
].with_traceback( # type: ignore[union-attr]
|
||||||
_xlsx_dependency_exc_info[2]
|
_xlsx_dependency_exc_info[2]
|
||||||
) # Restore the original traceback
|
)
|
||||||
|
|
||||||
sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
|
sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
|
||||||
md_content = ""
|
md_content = ""
|
||||||
|
|
@ -136,9 +138,11 @@ class XlsConverter(DocumentConverter):
|
||||||
extension=".xls",
|
extension=".xls",
|
||||||
feature="xls",
|
feature="xls",
|
||||||
)
|
)
|
||||||
) from _xls_dependency_exc_info[1].with_traceback(
|
) from _xls_dependency_exc_info[
|
||||||
|
1
|
||||||
|
].with_traceback( # type: ignore[union-attr]
|
||||||
_xls_dependency_exc_info[2]
|
_xls_dependency_exc_info[2]
|
||||||
) # Restore the original traceback
|
)
|
||||||
|
|
||||||
sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd")
|
sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd")
|
||||||
md_content = ""
|
md_content = ""
|
||||||
|
|
|
||||||
|
|
@ -92,7 +92,7 @@ class ZipConverter(DocumentConverter):
|
||||||
stream_info: StreamInfo,
|
stream_info: StreamInfo,
|
||||||
**kwargs: Any, # Options to pass to the converter
|
**kwargs: Any, # Options to pass to the converter
|
||||||
) -> DocumentConverterResult:
|
) -> DocumentConverterResult:
|
||||||
file_path = stream_info.url or stream_info.local_path or stream_info.file_name
|
file_path = stream_info.url or stream_info.local_path or stream_info.filename
|
||||||
md_content = f"Content from the zip file `{file_path}`:\n\n"
|
md_content = f"Content from the zip file `{file_path}`:\n\n"
|
||||||
|
|
||||||
with zipfile.ZipFile(file_stream, "r") as zipObj:
|
with zipfile.ZipFile(file_stream, "r") as zipObj:
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,7 @@ from markitdown import __version__
|
||||||
try:
|
try:
|
||||||
from .test_markitdown import TEST_FILES_DIR, DOCX_TEST_STRINGS
|
from .test_markitdown import TEST_FILES_DIR, DOCX_TEST_STRINGS
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from test_markitdown import TEST_FILES_DIR, DOCX_TEST_STRINGS
|
from test_markitdown import TEST_FILES_DIR, DOCX_TEST_STRINGS # type: ignore
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue