Move priority outside of DocumentConverter, allowing converters to be reprioritized and keeping the DocumentConverter interface simple.
This commit is contained in:
parent
84f8198d8a
commit
a7ae7c53d8
20 changed files with 82 additions and 129 deletions
|
|
@ -3,7 +3,11 @@
|
||||||
# SPDX-License-Identifier: MIT
|
# SPDX-License-Identifier: MIT
|
||||||
|
|
||||||
from .__about__ import __version__
|
from .__about__ import __version__
|
||||||
from ._markitdown import MarkItDown
|
from ._markitdown import (
|
||||||
|
MarkItDown,
|
||||||
|
PRIORITY_SPECIFIC_FILE_FORMAT,
|
||||||
|
PRIORITY_GENERIC_FILE_FORMAT,
|
||||||
|
)
|
||||||
from ._base_converter import DocumentConverterResult, DocumentConverter
|
from ._base_converter import DocumentConverterResult, DocumentConverter
|
||||||
from ._stream_info import StreamInfo
|
from ._stream_info import StreamInfo
|
||||||
from ._exceptions import (
|
from ._exceptions import (
|
||||||
|
|
@ -25,4 +29,6 @@ __all__ = [
|
||||||
"FileConversionException",
|
"FileConversionException",
|
||||||
"UnsupportedFormatException",
|
"UnsupportedFormatException",
|
||||||
"StreamInfo",
|
"StreamInfo",
|
||||||
|
"PRIORITY_SPECIFIC_FILE_FORMAT",
|
||||||
|
"PRIORITY_GENERIC_FILE_FORMAT",
|
||||||
]
|
]
|
||||||
|
|
|
||||||
|
|
@ -45,38 +45,6 @@ class DocumentConverterResult:
|
||||||
class DocumentConverter:
|
class DocumentConverter:
|
||||||
"""Abstract superclass of all DocumentConverters."""
|
"""Abstract superclass of all DocumentConverters."""
|
||||||
|
|
||||||
# Lower priority values are tried first.
|
|
||||||
PRIORITY_SPECIFIC_FILE_FORMAT = (
|
|
||||||
0.0 # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia
|
|
||||||
)
|
|
||||||
PRIORITY_GENERIC_FILE_FORMAT = (
|
|
||||||
10.0 # Near catch-all converters for mimetypes like text/*, etc.
|
|
||||||
)
|
|
||||||
|
|
||||||
def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT):
|
|
||||||
"""
|
|
||||||
Initialize the DocumentConverter with a given priority.
|
|
||||||
|
|
||||||
Priorities work as follows: By default, most converters get priority
|
|
||||||
DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception
|
|
||||||
is the PlainTextConverter, which gets priority PRIORITY_SPECIFIC_FILE_FORMAT (== 10),
|
|
||||||
with lower values being tried first (i.e., higher priority).
|
|
||||||
|
|
||||||
Just prior to conversion, the converters are sorted by priority, using
|
|
||||||
a stable sort. This means that converters with the same priority will
|
|
||||||
remain in the same order, with the most recently registered converters
|
|
||||||
appearing first.
|
|
||||||
|
|
||||||
We have tight control over the order of built-in converters, but
|
|
||||||
plugins can register converters in any order. A converter's priority
|
|
||||||
field reasserts some control over the order of converters.
|
|
||||||
|
|
||||||
Plugins can register converters with any priority, to appear before or
|
|
||||||
after the built-ins. For example, a plugin with priority 9 will run
|
|
||||||
before the PlainTextConverter, but after the built-in converters.
|
|
||||||
"""
|
|
||||||
self._priority = priority
|
|
||||||
|
|
||||||
def accepts(
|
def accepts(
|
||||||
self,
|
self,
|
||||||
file_stream: BinaryIO,
|
file_stream: BinaryIO,
|
||||||
|
|
@ -138,12 +106,3 @@ class DocumentConverter:
|
||||||
- MissingDependencyException: If the converter requires a dependency that is not installed.
|
- MissingDependencyException: If the converter requires a dependency that is not installed.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError("Subclasses must implement this method")
|
raise NotImplementedError("Subclasses must implement this method")
|
||||||
|
|
||||||
@property
|
|
||||||
def priority(self) -> float:
|
|
||||||
"""Priority of the converter in markitdown's converter list. Higher priority values are tried first."""
|
|
||||||
return self._priority
|
|
||||||
|
|
||||||
@priority.setter
|
|
||||||
def priority(self, value: float):
|
|
||||||
self._priority = value
|
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,7 @@ import tempfile
|
||||||
import warnings
|
import warnings
|
||||||
import traceback
|
import traceback
|
||||||
import io
|
import io
|
||||||
|
from dataclasses import dataclass
|
||||||
from importlib.metadata import entry_points
|
from importlib.metadata import entry_points
|
||||||
from typing import Any, List, Optional, Union, BinaryIO
|
from typing import Any, List, Optional, Union, BinaryIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
@ -47,8 +48,15 @@ from ._exceptions import (
|
||||||
FailedConversionAttempt,
|
FailedConversionAttempt,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Override mimetype for csv to fix issue on windows
|
|
||||||
mimetypes.add_type("text/csv", ".csv")
|
# Lower priority values are tried first.
|
||||||
|
PRIORITY_SPECIFIC_FILE_FORMAT = (
|
||||||
|
0.0 # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia
|
||||||
|
)
|
||||||
|
PRIORITY_GENERIC_FILE_FORMAT = (
|
||||||
|
10.0 # Near catch-all converters for mimetypes like text/*, etc.
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
_plugins: List[Any] = []
|
_plugins: List[Any] = []
|
||||||
|
|
||||||
|
|
@ -73,6 +81,14 @@ def _load_plugins() -> List[Any]:
|
||||||
return _plugins
|
return _plugins
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(kw_only=True, frozen=True)
|
||||||
|
class ConverterRegistration:
|
||||||
|
"""A registration of a converter with its priority and other metadata."""
|
||||||
|
|
||||||
|
converter: DocumentConverter
|
||||||
|
priority: float
|
||||||
|
|
||||||
|
|
||||||
class MarkItDown:
|
class MarkItDown:
|
||||||
"""(In preview) An extremely simple text-based document reader, suitable for LLM use.
|
"""(In preview) An extremely simple text-based document reader, suitable for LLM use.
|
||||||
This reader will convert common file-types or webpages to Markdown."""
|
This reader will convert common file-types or webpages to Markdown."""
|
||||||
|
|
@ -100,7 +116,7 @@ class MarkItDown:
|
||||||
self._style_map: Union[str | None] = None
|
self._style_map: Union[str | None] = None
|
||||||
|
|
||||||
# Register the converters
|
# Register the converters
|
||||||
self._converters: List[DocumentConverter] = []
|
self._converters: List[ConverterRegistration] = []
|
||||||
|
|
||||||
if (
|
if (
|
||||||
enable_builtins is None or enable_builtins
|
enable_builtins is None or enable_builtins
|
||||||
|
|
@ -128,9 +144,15 @@ class MarkItDown:
|
||||||
# Register converters for successful browsing operations
|
# Register converters for successful browsing operations
|
||||||
# Later registrations are tried first / take higher priority than earlier registrations
|
# Later registrations are tried first / take higher priority than earlier registrations
|
||||||
# To this end, the most specific converters should appear below the most generic converters
|
# To this end, the most specific converters should appear below the most generic converters
|
||||||
self.register_converter(PlainTextConverter())
|
self.register_converter(
|
||||||
self.register_converter(ZipConverter(markitdown=self))
|
PlainTextConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT
|
||||||
self.register_converter(HtmlConverter())
|
)
|
||||||
|
self.register_converter(
|
||||||
|
ZipConverter(markitdown=self), priority=PRIORITY_GENERIC_FILE_FORMAT
|
||||||
|
)
|
||||||
|
self.register_converter(
|
||||||
|
HtmlConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT
|
||||||
|
)
|
||||||
self.register_converter(RssConverter())
|
self.register_converter(RssConverter())
|
||||||
self.register_converter(WikipediaConverter())
|
self.register_converter(WikipediaConverter())
|
||||||
self.register_converter(YouTubeConverter())
|
self.register_converter(YouTubeConverter())
|
||||||
|
|
@ -418,13 +440,14 @@ class MarkItDown:
|
||||||
# Create a copy of the page_converters list, sorted by priority.
|
# Create a copy of the page_converters list, sorted by priority.
|
||||||
# We do this with each call to _convert because the priority of converters may change between calls.
|
# We do this with each call to _convert because the priority of converters may change between calls.
|
||||||
# The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
|
# The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
|
||||||
sorted_converters = sorted(self._converters, key=lambda x: x.priority)
|
sorted_registrations = sorted(self._converters, key=lambda x: x.priority)
|
||||||
|
|
||||||
# Remember the initial stream position so that we can return to it
|
# Remember the initial stream position so that we can return to it
|
||||||
cur_pos = file_stream.tell()
|
cur_pos = file_stream.tell()
|
||||||
|
|
||||||
for stream_info in stream_info_guesses + [StreamInfo()]:
|
for stream_info in stream_info_guesses + [StreamInfo()]:
|
||||||
for converter in sorted_converters:
|
for converter_registration in sorted_registrations:
|
||||||
|
converter = converter_registration.converter
|
||||||
# Sanity check -- make sure the cur_pos is still the same
|
# Sanity check -- make sure the cur_pos is still the same
|
||||||
assert (
|
assert (
|
||||||
cur_pos == file_stream.tell()
|
cur_pos == file_stream.tell()
|
||||||
|
|
@ -506,6 +529,34 @@ class MarkItDown:
|
||||||
)
|
)
|
||||||
self.register_converter(converter)
|
self.register_converter(converter)
|
||||||
|
|
||||||
def register_converter(self, converter: DocumentConverter) -> None:
|
def register_converter(
|
||||||
"""Register a page text converter."""
|
self,
|
||||||
self._converters.insert(0, converter)
|
converter: DocumentConverter,
|
||||||
|
*,
|
||||||
|
priority: float = PRIORITY_SPECIFIC_FILE_FORMAT,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Register a DocumentConverter with a given priority.
|
||||||
|
|
||||||
|
Priorities work as follows: By default, most converters get priority
|
||||||
|
PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exceptions
|
||||||
|
are the PlainTextConverter, HtmlConverter, and ZipConverter, which get
|
||||||
|
priority PRIORITY_GENERIC_FILE_FORMAT (== 10), with lower values
|
||||||
|
being tried first (i.e., higher priority).
|
||||||
|
|
||||||
|
Just prior to conversion, the converters are sorted by priority, using
|
||||||
|
a stable sort. This means that converters with the same priority will
|
||||||
|
remain in the same order, with the most recently registered converters
|
||||||
|
appearing first.
|
||||||
|
|
||||||
|
We have tight control over the order of built-in converters, but
|
||||||
|
plugins can register converters in any order. The registration's priority
|
||||||
|
field reasserts some control over the order of converters.
|
||||||
|
|
||||||
|
Plugins can register converters with any priority, to appear before or
|
||||||
|
after the built-ins. For example, a plugin with priority 9 will run
|
||||||
|
before the generic PlainTextConverter, HtmlConverter, and ZipConverter, but after the other built-in converters.
|
||||||
|
"""
|
||||||
|
self._converters.insert(
|
||||||
|
0, ConverterRegistration(converter=converter, priority=priority)
|
||||||
|
)
|
||||||
|
|
|
||||||
|
|
@ -26,11 +26,6 @@ class AudioConverter(DocumentConverter):
|
||||||
Converts audio files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
|
Converts audio files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
|
||||||
):
|
|
||||||
super().__init__(priority=priority)
|
|
||||||
|
|
||||||
def accepts(
|
def accepts(
|
||||||
self,
|
self,
|
||||||
file_stream: BinaryIO,
|
file_stream: BinaryIO,
|
||||||
|
|
|
||||||
|
|
@ -26,11 +26,6 @@ class BingSerpConverter(DocumentConverter):
|
||||||
NOTE: It is better to use the Bing API
|
NOTE: It is better to use the Bing API
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
|
||||||
):
|
|
||||||
super().__init__(priority=priority)
|
|
||||||
|
|
||||||
def accepts(
|
def accepts(
|
||||||
self,
|
self,
|
||||||
file_stream: BinaryIO,
|
file_stream: BinaryIO,
|
||||||
|
|
|
||||||
|
|
@ -69,11 +69,10 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
*,
|
*,
|
||||||
priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
|
|
||||||
endpoint: str,
|
endpoint: str,
|
||||||
api_version: str = "2024-07-31-preview",
|
api_version: str = "2024-07-31-preview",
|
||||||
):
|
):
|
||||||
super().__init__(priority=priority)
|
super().__init__()
|
||||||
|
|
||||||
# Raise an error if the dependencies are not available.
|
# Raise an error if the dependencies are not available.
|
||||||
# This is different than other converters since this one isn't even instantiated
|
# This is different than other converters since this one isn't even instantiated
|
||||||
|
|
|
||||||
|
|
@ -29,10 +29,8 @@ class DocxConverter(HtmlConverter):
|
||||||
Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
|
Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(self):
|
||||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
super().__init__()
|
||||||
):
|
|
||||||
super().__init__(priority=priority)
|
|
||||||
self._html_converter = HtmlConverter()
|
self._html_converter = HtmlConverter()
|
||||||
|
|
||||||
def accepts(
|
def accepts(
|
||||||
|
|
|
||||||
|
|
@ -20,11 +20,6 @@ ACCEPTED_FILE_EXTENSIONS = [
|
||||||
class HtmlConverter(DocumentConverter):
|
class HtmlConverter(DocumentConverter):
|
||||||
"""Anything with content type text/html"""
|
"""Anything with content type text/html"""
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
|
|
||||||
):
|
|
||||||
super().__init__(priority=priority)
|
|
||||||
|
|
||||||
def accepts(
|
def accepts(
|
||||||
self,
|
self,
|
||||||
file_stream: BinaryIO,
|
file_stream: BinaryIO,
|
||||||
|
|
|
||||||
|
|
@ -18,11 +18,6 @@ class ImageConverter(DocumentConverter):
|
||||||
Converts images to markdown via extraction of metadata (if `exiftool` is installed), and description via a multimodal LLM (if an llm_client is configured).
|
Converts images to markdown via extraction of metadata (if `exiftool` is installed), and description via a multimodal LLM (if an llm_client is configured).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
|
||||||
):
|
|
||||||
super().__init__(priority=priority)
|
|
||||||
|
|
||||||
def accepts(
|
def accepts(
|
||||||
self,
|
self,
|
||||||
file_stream: BinaryIO,
|
file_stream: BinaryIO,
|
||||||
|
|
|
||||||
|
|
@ -15,11 +15,6 @@ ACCEPTED_FILE_EXTENSIONS = [".ipynb"]
|
||||||
class IpynbConverter(DocumentConverter):
|
class IpynbConverter(DocumentConverter):
|
||||||
"""Converts Jupyter Notebook (.ipynb) files to Markdown."""
|
"""Converts Jupyter Notebook (.ipynb) files to Markdown."""
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
|
||||||
):
|
|
||||||
super().__init__(priority=priority)
|
|
||||||
|
|
||||||
def accepts(
|
def accepts(
|
||||||
self,
|
self,
|
||||||
file_stream: BinaryIO,
|
file_stream: BinaryIO,
|
||||||
|
|
|
||||||
|
|
@ -28,11 +28,6 @@ class OutlookMsgConverter(DocumentConverter):
|
||||||
- Email body content
|
- Email body content
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
|
||||||
):
|
|
||||||
super().__init__(priority=priority)
|
|
||||||
|
|
||||||
def accepts(
|
def accepts(
|
||||||
self,
|
self,
|
||||||
file_stream: BinaryIO,
|
file_stream: BinaryIO,
|
||||||
|
|
|
||||||
|
|
@ -34,11 +34,6 @@ class PdfConverter(DocumentConverter):
|
||||||
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
|
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
|
||||||
):
|
|
||||||
super().__init__(priority=priority)
|
|
||||||
|
|
||||||
def accepts(
|
def accepts(
|
||||||
self,
|
self,
|
||||||
file_stream: BinaryIO,
|
file_stream: BinaryIO,
|
||||||
|
|
|
||||||
|
|
@ -29,11 +29,6 @@ IGNORE_MIME_TYPE_PREFIXES = [
|
||||||
class PlainTextConverter(DocumentConverter):
|
class PlainTextConverter(DocumentConverter):
|
||||||
"""Anything with content type text/plain"""
|
"""Anything with content type text/plain"""
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
|
|
||||||
):
|
|
||||||
super().__init__(priority=priority)
|
|
||||||
|
|
||||||
def accepts(
|
def accepts(
|
||||||
self,
|
self,
|
||||||
file_stream: BinaryIO,
|
file_stream: BinaryIO,
|
||||||
|
|
|
||||||
|
|
@ -35,10 +35,8 @@ class PptxConverter(DocumentConverter):
|
||||||
Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
|
Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(self):
|
||||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
super().__init__()
|
||||||
):
|
|
||||||
super().__init__(priority=priority)
|
|
||||||
self._html_converter = HtmlConverter()
|
self._html_converter = HtmlConverter()
|
||||||
|
|
||||||
def accepts(
|
def accepts(
|
||||||
|
|
|
||||||
|
|
@ -26,11 +26,6 @@ CANDIDATE_FILE_EXTENSIONS = [
|
||||||
class RssConverter(DocumentConverter):
|
class RssConverter(DocumentConverter):
|
||||||
"""Convert RSS / Atom type to markdown"""
|
"""Convert RSS / Atom type to markdown"""
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
|
||||||
):
|
|
||||||
super().__init__(priority=priority)
|
|
||||||
|
|
||||||
def accepts(
|
def accepts(
|
||||||
self,
|
self,
|
||||||
file_stream: BinaryIO,
|
file_stream: BinaryIO,
|
||||||
|
|
|
||||||
|
|
@ -21,11 +21,6 @@ ACCEPTED_FILE_EXTENSIONS = [
|
||||||
class WikipediaConverter(DocumentConverter):
|
class WikipediaConverter(DocumentConverter):
|
||||||
"""Handle Wikipedia pages separately, focusing only on the main document content."""
|
"""Handle Wikipedia pages separately, focusing only on the main document content."""
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
|
||||||
):
|
|
||||||
super().__init__(priority=priority)
|
|
||||||
|
|
||||||
def accepts(
|
def accepts(
|
||||||
self,
|
self,
|
||||||
file_stream: BinaryIO,
|
file_stream: BinaryIO,
|
||||||
|
|
|
||||||
|
|
@ -38,10 +38,8 @@ class XlsxConverter(DocumentConverter):
|
||||||
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
|
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(self):
|
||||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
super().__init__()
|
||||||
):
|
|
||||||
super().__init__(priority=priority)
|
|
||||||
self._html_converter = HtmlConverter()
|
self._html_converter = HtmlConverter()
|
||||||
|
|
||||||
def accepts(
|
def accepts(
|
||||||
|
|
@ -100,10 +98,8 @@ class XlsConverter(DocumentConverter):
|
||||||
Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
|
Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(self):
|
||||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
super().__init__()
|
||||||
):
|
|
||||||
super().__init__(priority=priority)
|
|
||||||
self._html_converter = HtmlConverter()
|
self._html_converter = HtmlConverter()
|
||||||
|
|
||||||
def accepts(
|
def accepts(
|
||||||
|
|
|
||||||
|
|
@ -34,11 +34,6 @@ ACCEPTED_FILE_EXTENSIONS = [
|
||||||
class YouTubeConverter(DocumentConverter):
|
class YouTubeConverter(DocumentConverter):
|
||||||
"""Handle YouTube specially, focusing on the video title, description, and transcript."""
|
"""Handle YouTube specially, focusing on the video title, description, and transcript."""
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
|
||||||
):
|
|
||||||
super().__init__(priority=priority)
|
|
||||||
|
|
||||||
def accepts(
|
def accepts(
|
||||||
self,
|
self,
|
||||||
file_stream: BinaryIO,
|
file_stream: BinaryIO,
|
||||||
|
|
|
||||||
|
|
@ -61,11 +61,10 @@ class ZipConverter(DocumentConverter):
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
|
|
||||||
*,
|
*,
|
||||||
markitdown: "MarkItDown",
|
markitdown: "MarkItDown",
|
||||||
):
|
):
|
||||||
super().__init__(priority=priority)
|
super().__init__()
|
||||||
self._markitdown = markitdown
|
self._markitdown = markitdown
|
||||||
|
|
||||||
def accepts(
|
def accepts(
|
||||||
|
|
|
||||||
|
|
@ -530,8 +530,10 @@ def test_markitdown_exiftool() -> None:
|
||||||
finally:
|
finally:
|
||||||
warnings.resetwarnings()
|
warnings.resetwarnings()
|
||||||
|
|
||||||
# Test explicitly setting the location of exiftool
|
|
||||||
which_exiftool = shutil.which("exiftool")
|
which_exiftool = shutil.which("exiftool")
|
||||||
|
assert which_exiftool is not None
|
||||||
|
|
||||||
|
# Test explicitly setting the location of exiftool
|
||||||
markitdown = MarkItDown(exiftool_path=which_exiftool)
|
markitdown = MarkItDown(exiftool_path=which_exiftool)
|
||||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
|
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
|
||||||
for key in JPG_TEST_EXIFTOOL:
|
for key in JPG_TEST_EXIFTOOL:
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue