More progress.
This commit is contained in:
parent
7bc6d827ee
commit
4129f30c23
23 changed files with 149 additions and 201 deletions
|
|
@ -4,7 +4,7 @@
|
|||
|
||||
from .__about__ import __version__
|
||||
from ._markitdown import MarkItDown
|
||||
from ._base_converter import DocumentConverterResult, BaseDocumentConverter
|
||||
from ._base_converter import DocumentConverterResult, DocumentConverter
|
||||
from ._stream_info import StreamInfo
|
||||
from ._exceptions import (
|
||||
MarkItDownException,
|
||||
|
|
@ -13,13 +13,11 @@ from ._exceptions import (
|
|||
FileConversionException,
|
||||
UnsupportedFormatException,
|
||||
)
|
||||
from .converters import DocumentConverter
|
||||
|
||||
__all__ = [
|
||||
"__version__",
|
||||
"MarkItDown",
|
||||
"DocumentConverter",
|
||||
"BaseDocumentConverter",
|
||||
"DocumentConverterResult",
|
||||
"MarkItDownException",
|
||||
"MissingDependencyException",
|
||||
|
|
|
|||
|
|
@ -1,5 +1,11 @@
|
|||
import os
|
||||
import tempfile
|
||||
from warnings import warn
|
||||
from typing import Any, Union, BinaryIO, Optional, List
|
||||
from ._stream_info import StreamInfo
|
||||
from typing import Any, Union, BinaryIO, Optional
|
||||
|
||||
# Avoid printing the same warning multiple times
|
||||
_WARNED: List[str] = []
|
||||
|
||||
|
||||
class DocumentConverterResult:
|
||||
|
|
@ -39,7 +45,7 @@ class DocumentConverterResult:
|
|||
return self.markdown
|
||||
|
||||
|
||||
class BaseDocumentConverter:
|
||||
class DocumentConverter:
|
||||
"""Abstract superclass of all DocumentConverters."""
|
||||
|
||||
# Lower priority values are tried first.
|
||||
|
|
@ -74,7 +80,7 @@ class BaseDocumentConverter:
|
|||
"""
|
||||
self._priority = priority
|
||||
|
||||
def convert(
|
||||
def convert_stream(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
stream_info: StreamInfo,
|
||||
|
|
@ -106,6 +112,61 @@ class BaseDocumentConverter:
|
|||
- FileConversionException: If the mimetype is recognized, but the conversion fails for some other reason.
|
||||
- MissingDependencyException: If the converter requires a dependency that is not installed.
|
||||
"""
|
||||
|
||||
# Default implementation ensures backward compatibility with the legacy convert() method, and
|
||||
# should absolutely be overridden in subclasses. This behavior is deprecated and will be removed
|
||||
# in the future.
|
||||
result = None
|
||||
used_legacy = False
|
||||
|
||||
if stream_info.local_path is not None and os.path.exists(
|
||||
stream_info.local_path
|
||||
):
|
||||
# If the stream is backed by a local file, pass it to the legacy convert() method
|
||||
try:
|
||||
result = self.convert(stream_info.local_path, **kwargs)
|
||||
used_legacy = True
|
||||
except (
|
||||
NotImplementedError
|
||||
): # If it wasn't implemented, rethrow the error, but with this as the stack trace
|
||||
raise NotImplementedError(
|
||||
"Subclasses must implement the convert_stream method."
|
||||
)
|
||||
else:
|
||||
# Otherwise, we need to read the stream into a temporary file. There is potential for
|
||||
# thrashing here if there are many converters or conversion attempts
|
||||
cur_pos = file_stream.tell()
|
||||
temp_fd, temp_path = tempfile.mkstemp()
|
||||
try:
|
||||
with os.fdopen(temp_fd, "wb") as temp_file:
|
||||
temp_file.write(file_stream.read())
|
||||
try:
|
||||
result = self.convert(temp_path, **kwargs)
|
||||
used_legacy = True
|
||||
except NotImplementedError:
|
||||
raise NotImplementedError(
|
||||
"Subclasses must implement the convert_stream method."
|
||||
)
|
||||
finally:
|
||||
os.remove(temp_path)
|
||||
file_stream.seek(0)
|
||||
|
||||
if used_legacy:
|
||||
message = f"{type(self).__name__} uses the legacy convert() method, which is deprecated."
|
||||
if message not in _WARNED:
|
||||
warn(message, DeprecationWarning)
|
||||
_WARNED.append(message)
|
||||
|
||||
return result
|
||||
|
||||
def convert(
|
||||
self, local_path: str, **kwargs: Any
|
||||
) -> Union[None, DocumentConverterResult]:
|
||||
"""
|
||||
Legacy, deprecated method to convert a document to Markdown text.
|
||||
This method reads from the file at `local_path` and returns the converted Markdown text.
|
||||
This method is deprecated in favor of `convert_stream`, which uses a file-like object.
|
||||
"""
|
||||
raise NotImplementedError("Subclasses must implement this method")
|
||||
|
||||
@property
|
||||
|
|
|
|||
|
|
@ -20,7 +20,6 @@ import requests
|
|||
from ._stream_info import StreamInfo
|
||||
|
||||
from .converters import (
|
||||
DocumentConverter,
|
||||
PlainTextConverter,
|
||||
HtmlConverter,
|
||||
RssConverter,
|
||||
|
|
@ -41,7 +40,7 @@ from .converters import (
|
|||
DocumentIntelligenceConverter,
|
||||
)
|
||||
|
||||
from ._base_converter import DocumentConverterResult
|
||||
from ._base_converter import DocumentConverter, DocumentConverterResult
|
||||
|
||||
from ._exceptions import (
|
||||
FileConversionException,
|
||||
|
|
@ -102,7 +101,7 @@ class MarkItDown:
|
|||
self._style_map = None
|
||||
|
||||
# Register the converters
|
||||
self._page_converters: List[DocumentConverter] = []
|
||||
self._converters: List[DocumentConverter] = []
|
||||
|
||||
if (
|
||||
enable_builtins is None or enable_builtins
|
||||
|
|
@ -405,26 +404,6 @@ class MarkItDown:
|
|||
def _convert(
|
||||
self, *, file_stream: BinaryIO, stream_info_guesses: List[StreamInfo], **kwargs
|
||||
) -> DocumentConverterResult:
|
||||
# Lazily create a temporary file, if needed, for backward compatibility
|
||||
# This is to support a deprecated feature, and will be removed in the future
|
||||
temp_file = None
|
||||
|
||||
def get_temp_file():
|
||||
nonlocal temp_file
|
||||
|
||||
if temp_file is not None:
|
||||
return temp_file
|
||||
else:
|
||||
cur_pos = file_stream.tell()
|
||||
handle, temp_file = tempfile.mkstemp()
|
||||
fh = os.fdopen(handle, "wb")
|
||||
file_stream.seek(0)
|
||||
fh.write(file_stream.read())
|
||||
file_stream.seek(cur_pos)
|
||||
fh.close()
|
||||
return temp_file
|
||||
|
||||
try:
|
||||
res: Union[None, DocumentConverterResult] = None
|
||||
|
||||
# Keep track of which converters throw exceptions
|
||||
|
|
@ -433,9 +412,9 @@ class MarkItDown:
|
|||
# Create a copy of the converters list, sorted by priority.
|
||||
# We do this with each call to _convert because the priority of converters may change between calls.
|
||||
# The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
|
||||
sorted_converters = sorted(self._page_converters, key=lambda x: x.priority)
|
||||
sorted_converters = sorted(self._converters, key=lambda x: x.priority)
|
||||
|
||||
for file_info in stream_info_guesses + [None]:
|
||||
for stream_info in stream_info_guesses + [StreamInfo()]:
|
||||
for converter in sorted_converters:
|
||||
_kwargs = copy.deepcopy(kwargs)
|
||||
|
||||
|
|
@ -449,44 +428,37 @@ class MarkItDown:
|
|||
if "style_map" not in _kwargs and self._style_map is not None:
|
||||
_kwargs["style_map"] = self._style_map
|
||||
|
||||
if (
|
||||
"exiftool_path" not in _kwargs
|
||||
and self._exiftool_path is not None
|
||||
):
|
||||
if "exiftool_path" not in _kwargs and self._exiftool_path is not None:
|
||||
_kwargs["exiftool_path"] = self._exiftool_path
|
||||
|
||||
# Add the list of converters for nested processing
|
||||
_kwargs["_parent_converters"] = self._page_converters
|
||||
_kwargs["_parent_converters"] = self._converters
|
||||
|
||||
# Add backwards compatibility
|
||||
if isinstance(converter, DocumentConverter):
|
||||
if file_info is not None:
|
||||
# Legacy converters need a file_extension
|
||||
if file_info.extension is not None:
|
||||
_kwargs["file_extension"] = file_info.extension
|
||||
# Add legacy kwargs
|
||||
if stream_info is not None:
|
||||
if stream_info.extension is not None:
|
||||
_kwargs["file_extension"] = stream_info.extension
|
||||
|
||||
# And benefit from urls, when available
|
||||
if file_info.url is not None:
|
||||
_kwargs["url"] = file_info.url
|
||||
if stream_info.url is not None:
|
||||
_kwargs["url"] = stream_info.url
|
||||
|
||||
# Attempt the conversion
|
||||
cur_pos = file_stream.tell()
|
||||
try:
|
||||
res = converter.convert(get_temp_file(), **_kwargs)
|
||||
res = converter.convert_stream(file_stream, stream_info, **_kwargs)
|
||||
except Exception:
|
||||
failed_attempts.append(
|
||||
FailedConversionAttempt(
|
||||
converter=converter, exc_info=sys.exc_info()
|
||||
)
|
||||
)
|
||||
else:
|
||||
raise NotImplementedError("TODO")
|
||||
finally:
|
||||
file_stream.seek(cur_pos)
|
||||
|
||||
if res is not None:
|
||||
# Normalize the content
|
||||
res.text_content = "\n".join(
|
||||
[
|
||||
line.rstrip()
|
||||
for line in re.split(r"\r?\n", res.text_content)
|
||||
]
|
||||
[line.rstrip() for line in re.split(r"\r?\n", res.text_content)]
|
||||
)
|
||||
res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
|
||||
return res
|
||||
|
|
@ -500,14 +472,6 @@ class MarkItDown:
|
|||
f"Could not convert stream to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported."
|
||||
)
|
||||
|
||||
finally:
|
||||
# Clean up the temporary file
|
||||
if temp_file is not None:
|
||||
try:
|
||||
os.unlink(temp_file)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def register_page_converter(self, converter: DocumentConverter) -> None:
|
||||
"""DEPRECATED: Use register_converter instead."""
|
||||
warn(
|
||||
|
|
@ -516,6 +480,6 @@ class MarkItDown:
|
|||
)
|
||||
self.register_converter(converter)
|
||||
|
||||
def register_converter(self, converter: DocumentConverter) -> None:
|
||||
def register_converter(self, converter: Union[DocumentConverter]) -> None:
|
||||
"""Register a page text converter."""
|
||||
self._page_converters.insert(0, converter)
|
||||
self._converters.insert(0, converter)
|
||||
|
|
|
|||
|
|
@ -2,7 +2,6 @@
|
|||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
from ._base import DocumentConverter
|
||||
from ._plain_text_converter import PlainTextConverter
|
||||
from ._html_converter import HtmlConverter
|
||||
from ._rss_converter import RssConverter
|
||||
|
|
@ -22,7 +21,6 @@ from ._zip_converter import ZipConverter
|
|||
from ._doc_intel_converter import DocumentIntelligenceConverter
|
||||
|
||||
__all__ = [
|
||||
"DocumentConverter",
|
||||
"PlainTextConverter",
|
||||
"HtmlConverter",
|
||||
"RssConverter",
|
||||
|
|
|
|||
|
|
@ -1,56 +0,0 @@
|
|||
from typing import Any, Union
|
||||
from .._base_converter import DocumentConverterResult
|
||||
|
||||
|
||||
class DocumentConverter:
|
||||
"""Abstract superclass of all DocumentConverters."""
|
||||
|
||||
# Lower priority values are tried first.
|
||||
PRIORITY_SPECIFIC_FILE_FORMAT = (
|
||||
0.0 # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia
|
||||
)
|
||||
PRIORITY_GENERIC_FILE_FORMAT = (
|
||||
10.0 # Near catch-all converters for mimetypes like text/*, etc.
|
||||
)
|
||||
|
||||
def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT):
|
||||
"""
|
||||
Initialize the DocumentConverter with a given priority.
|
||||
|
||||
Priorities work as follows: By default, most converters get priority
|
||||
DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception
|
||||
is the PlainTextConverter, which gets priority PRIORITY_GENERIC_FILE_FORMAT (== 10),
|
||||
with lower values being tried first (i.e., higher priority).
|
||||
|
||||
Just prior to conversion, the converters are sorted by priority, using
|
||||
a stable sort. This means that converters with the same priority will
|
||||
remain in the same order, with the most recently registered converters
|
||||
appearing first.
|
||||
|
||||
We have tight control over the order of built-in converters, but
|
||||
plugins can register converters in any order. A converter's priority
|
||||
field reasserts some control over the order of converters.
|
||||
|
||||
Plugins can register converters with any priority, to appear before or
|
||||
after the built-ins. For example, a plugin with priority 9 will run
|
||||
before the PlainTextConverter, but after the built-in converters.
|
||||
"""
|
||||
self._priority = priority
|
||||
|
||||
def convert(
|
||||
self, local_path: str, **kwargs: Any
|
||||
) -> Union[None, DocumentConverterResult]:
|
||||
raise NotImplementedError("Subclasses must implement this method")
|
||||
|
||||
@property
|
||||
def priority(self) -> float:
|
||||
"""Priority of the converter in markitdown's converter list. Lower priority values are tried first."""
|
||||
return self._priority
|
||||
|
||||
@priority.setter
|
||||
def priority(self, value: float):
|
||||
self._priority = value
|
||||
|
||||
@priority.deleter
|
||||
def priority(self):
|
||||
raise AttributeError("Cannot delete the priority attribute")
|
||||
|
|
@ -6,8 +6,7 @@ from typing import Union
|
|||
from urllib.parse import parse_qs, urlparse
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from ._base import DocumentConverter
|
||||
from .._base_converter import DocumentConverterResult
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from ._markdownify import _CustomMarkdownify
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -2,8 +2,7 @@ from typing import Any, Union
|
|||
import re
|
||||
import sys
|
||||
|
||||
from ._base import DocumentConverter
|
||||
from .._base_converter import DocumentConverterResult
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._exceptions import MissingDependencyException
|
||||
|
||||
# Try loading optional (but in this case, required) dependencies
|
||||
|
|
|
|||
|
|
@ -2,8 +2,7 @@ import sys
|
|||
|
||||
from typing import Union
|
||||
|
||||
from ._base import DocumentConverter
|
||||
from .._base_converter import DocumentConverterResult
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from ._html_converter import HtmlConverter
|
||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
|
||||
|
|
|
|||
|
|
@ -1,8 +1,7 @@
|
|||
from typing import Any, Union
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from ._base import DocumentConverter
|
||||
from .._base_converter import DocumentConverterResult
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from ._markdownify import _CustomMarkdownify
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
from typing import Union
|
||||
from ._base import DocumentConverter
|
||||
from .._base_converter import DocumentConverterResult
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from ._media_converter import MediaConverter
|
||||
import base64
|
||||
import mimetypes
|
||||
|
|
|
|||
|
|
@ -1,8 +1,7 @@
|
|||
import json
|
||||
from typing import Any, Union
|
||||
|
||||
from ._base import DocumentConverter
|
||||
from .._base_converter import DocumentConverterResult
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
|
||||
from .._exceptions import FileConversionException
|
||||
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ import shutil
|
|||
import json
|
||||
from warnings import warn
|
||||
|
||||
from ._base import DocumentConverter
|
||||
from .._base_converter import DocumentConverter
|
||||
|
||||
|
||||
class MediaConverter(DocumentConverter):
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
import tempfile
|
||||
from typing import Union
|
||||
from ._base import DocumentConverter
|
||||
from .._base_converter import DocumentConverterResult
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from ._wav_converter import WavConverter
|
||||
from warnings import resetwarnings, catch_warnings
|
||||
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
import sys
|
||||
from typing import Any, Union
|
||||
from ._base import DocumentConverter
|
||||
from .._base_converter import DocumentConverterResult
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
|
||||
# Try loading optional (but in this case, required) dependencies
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
import sys
|
||||
from typing import Union
|
||||
from ._base import DocumentConverter
|
||||
from .._base_converter import DocumentConverterResult
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
|
||||
# Try loading optional (but in this case, required) dependencies
|
||||
|
|
|
|||
|
|
@ -3,8 +3,7 @@ import mimetypes
|
|||
from charset_normalizer import from_path
|
||||
from typing import Any, Union
|
||||
|
||||
from ._base import DocumentConverter
|
||||
from .._base_converter import DocumentConverterResult
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
|
||||
|
||||
# Mimetypes to ignore (commonly confused extensions)
|
||||
|
|
|
|||
|
|
@ -5,8 +5,7 @@ import sys
|
|||
|
||||
from typing import Union
|
||||
|
||||
from ._base import DocumentConverter
|
||||
from .._base_converter import DocumentConverterResult
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from ._html_converter import HtmlConverter
|
||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
|
||||
|
|
|
|||
|
|
@ -3,8 +3,7 @@ from typing import Union
|
|||
from bs4 import BeautifulSoup
|
||||
|
||||
from ._markdownify import _CustomMarkdownify
|
||||
from ._base import DocumentConverter
|
||||
from .._base_converter import DocumentConverterResult
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
|
||||
|
||||
class RssConverter(DocumentConverter):
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
from typing import Union
|
||||
from ._base import DocumentConverter
|
||||
from .._base_converter import DocumentConverterResult
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from ._media_converter import MediaConverter
|
||||
|
||||
# Optional Transcription support
|
||||
|
|
|
|||
|
|
@ -3,8 +3,7 @@ import re
|
|||
from typing import Any, Union
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from ._base import DocumentConverter
|
||||
from .._base_converter import DocumentConverterResult
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from ._markdownify import _CustomMarkdownify
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -2,8 +2,7 @@ import sys
|
|||
|
||||
from typing import Union
|
||||
|
||||
from ._base import DocumentConverter
|
||||
from .._base_converter import DocumentConverterResult
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from ._html_converter import HtmlConverter
|
||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
|
||||
|
|
|
|||
|
|
@ -7,8 +7,7 @@ from typing import Any, Union, Dict, List
|
|||
from urllib.parse import parse_qs, urlparse
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from ._base import DocumentConverter
|
||||
from .._base_converter import DocumentConverterResult
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
|
||||
|
||||
# Optional YouTube transcription support
|
||||
|
|
|
|||
|
|
@ -3,8 +3,7 @@ import zipfile
|
|||
import shutil
|
||||
from typing import Any, Union
|
||||
|
||||
from ._base import DocumentConverter
|
||||
from .._base_converter import DocumentConverterResult
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
|
||||
|
||||
class ZipConverter(DocumentConverter):
|
||||
|
|
|
|||
Loading…
Reference in a new issue