More progress.
This commit is contained in:
parent
7bc6d827ee
commit
4129f30c23
23 changed files with 149 additions and 201 deletions
|
|
@ -4,7 +4,7 @@
|
||||||
|
|
||||||
from .__about__ import __version__
|
from .__about__ import __version__
|
||||||
from ._markitdown import MarkItDown
|
from ._markitdown import MarkItDown
|
||||||
from ._base_converter import DocumentConverterResult, BaseDocumentConverter
|
from ._base_converter import DocumentConverterResult, DocumentConverter
|
||||||
from ._stream_info import StreamInfo
|
from ._stream_info import StreamInfo
|
||||||
from ._exceptions import (
|
from ._exceptions import (
|
||||||
MarkItDownException,
|
MarkItDownException,
|
||||||
|
|
@ -13,13 +13,11 @@ from ._exceptions import (
|
||||||
FileConversionException,
|
FileConversionException,
|
||||||
UnsupportedFormatException,
|
UnsupportedFormatException,
|
||||||
)
|
)
|
||||||
from .converters import DocumentConverter
|
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"__version__",
|
"__version__",
|
||||||
"MarkItDown",
|
"MarkItDown",
|
||||||
"DocumentConverter",
|
"DocumentConverter",
|
||||||
"BaseDocumentConverter",
|
|
||||||
"DocumentConverterResult",
|
"DocumentConverterResult",
|
||||||
"MarkItDownException",
|
"MarkItDownException",
|
||||||
"MissingDependencyException",
|
"MissingDependencyException",
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,11 @@
|
||||||
|
import os
|
||||||
|
import tempfile
|
||||||
|
from warnings import warn
|
||||||
|
from typing import Any, Union, BinaryIO, Optional, List
|
||||||
from ._stream_info import StreamInfo
|
from ._stream_info import StreamInfo
|
||||||
from typing import Any, Union, BinaryIO, Optional
|
|
||||||
|
# Avoid printing the same warning multiple times
|
||||||
|
_WARNED: List[str] = []
|
||||||
|
|
||||||
|
|
||||||
class DocumentConverterResult:
|
class DocumentConverterResult:
|
||||||
|
|
@ -39,7 +45,7 @@ class DocumentConverterResult:
|
||||||
return self.markdown
|
return self.markdown
|
||||||
|
|
||||||
|
|
||||||
class BaseDocumentConverter:
|
class DocumentConverter:
|
||||||
"""Abstract superclass of all DocumentConverters."""
|
"""Abstract superclass of all DocumentConverters."""
|
||||||
|
|
||||||
# Lower priority values are tried first.
|
# Lower priority values are tried first.
|
||||||
|
|
@ -74,7 +80,7 @@ class BaseDocumentConverter:
|
||||||
"""
|
"""
|
||||||
self._priority = priority
|
self._priority = priority
|
||||||
|
|
||||||
def convert(
|
def convert_stream(
|
||||||
self,
|
self,
|
||||||
file_stream: BinaryIO,
|
file_stream: BinaryIO,
|
||||||
stream_info: StreamInfo,
|
stream_info: StreamInfo,
|
||||||
|
|
@ -106,6 +112,61 @@ class BaseDocumentConverter:
|
||||||
- FileConversionException: If the mimetype is recognized, but the conversion fails for some other reason.
|
- FileConversionException: If the mimetype is recognized, but the conversion fails for some other reason.
|
||||||
- MissingDependencyException: If the converter requires a dependency that is not installed.
|
- MissingDependencyException: If the converter requires a dependency that is not installed.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# Default implementation ensures backward compatibility with the legacy convert() method, and
|
||||||
|
# should absolutely be overridden in subclasses. This behavior is deprecated and will be removed
|
||||||
|
# in the future.
|
||||||
|
result = None
|
||||||
|
used_legacy = False
|
||||||
|
|
||||||
|
if stream_info.local_path is not None and os.path.exists(
|
||||||
|
stream_info.local_path
|
||||||
|
):
|
||||||
|
# If the stream is backed by a local file, pass it to the legacy convert() method
|
||||||
|
try:
|
||||||
|
result = self.convert(stream_info.local_path, **kwargs)
|
||||||
|
used_legacy = True
|
||||||
|
except (
|
||||||
|
NotImplementedError
|
||||||
|
): # If it wasn't implemented, rethrow the error, but with this as the stack trace
|
||||||
|
raise NotImplementedError(
|
||||||
|
"Subclasses must implement the convert_stream method."
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Otherwise, we need to read the stream into a temporary file. There is potential for
|
||||||
|
# thrashing here if there are many converters or conversion attempts
|
||||||
|
cur_pos = file_stream.tell()
|
||||||
|
temp_fd, temp_path = tempfile.mkstemp()
|
||||||
|
try:
|
||||||
|
with os.fdopen(temp_fd, "wb") as temp_file:
|
||||||
|
temp_file.write(file_stream.read())
|
||||||
|
try:
|
||||||
|
result = self.convert(temp_path, **kwargs)
|
||||||
|
used_legacy = True
|
||||||
|
except NotImplementedError:
|
||||||
|
raise NotImplementedError(
|
||||||
|
"Subclasses must implement the convert_stream method."
|
||||||
|
)
|
||||||
|
finally:
|
||||||
|
os.remove(temp_path)
|
||||||
|
file_stream.seek(0)
|
||||||
|
|
||||||
|
if used_legacy:
|
||||||
|
message = f"{type(self).__name__} uses the legacy convert() method, which is deprecated."
|
||||||
|
if message not in _WARNED:
|
||||||
|
warn(message, DeprecationWarning)
|
||||||
|
_WARNED.append(message)
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
def convert(
|
||||||
|
self, local_path: str, **kwargs: Any
|
||||||
|
) -> Union[None, DocumentConverterResult]:
|
||||||
|
"""
|
||||||
|
Legacy, deprecated method to convert a document to Markdown text.
|
||||||
|
This method reads from the file at `local_path` and returns the converted Markdown text.
|
||||||
|
This method is deprecated in favor of `convert_stream`, which uses a file-like object.
|
||||||
|
"""
|
||||||
raise NotImplementedError("Subclasses must implement this method")
|
raise NotImplementedError("Subclasses must implement this method")
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|
|
||||||
|
|
@ -20,7 +20,6 @@ import requests
|
||||||
from ._stream_info import StreamInfo
|
from ._stream_info import StreamInfo
|
||||||
|
|
||||||
from .converters import (
|
from .converters import (
|
||||||
DocumentConverter,
|
|
||||||
PlainTextConverter,
|
PlainTextConverter,
|
||||||
HtmlConverter,
|
HtmlConverter,
|
||||||
RssConverter,
|
RssConverter,
|
||||||
|
|
@ -41,7 +40,7 @@ from .converters import (
|
||||||
DocumentIntelligenceConverter,
|
DocumentIntelligenceConverter,
|
||||||
)
|
)
|
||||||
|
|
||||||
from ._base_converter import DocumentConverterResult
|
from ._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
|
|
||||||
from ._exceptions import (
|
from ._exceptions import (
|
||||||
FileConversionException,
|
FileConversionException,
|
||||||
|
|
@ -102,7 +101,7 @@ class MarkItDown:
|
||||||
self._style_map = None
|
self._style_map = None
|
||||||
|
|
||||||
# Register the converters
|
# Register the converters
|
||||||
self._page_converters: List[DocumentConverter] = []
|
self._converters: List[DocumentConverter] = []
|
||||||
|
|
||||||
if (
|
if (
|
||||||
enable_builtins is None or enable_builtins
|
enable_builtins is None or enable_builtins
|
||||||
|
|
@ -405,26 +404,6 @@ class MarkItDown:
|
||||||
def _convert(
|
def _convert(
|
||||||
self, *, file_stream: BinaryIO, stream_info_guesses: List[StreamInfo], **kwargs
|
self, *, file_stream: BinaryIO, stream_info_guesses: List[StreamInfo], **kwargs
|
||||||
) -> DocumentConverterResult:
|
) -> DocumentConverterResult:
|
||||||
# Lazily create a temporary file, if needed, for backward compatibility
|
|
||||||
# This is to support a deprecated feature, and will be removed in the future
|
|
||||||
temp_file = None
|
|
||||||
|
|
||||||
def get_temp_file():
|
|
||||||
nonlocal temp_file
|
|
||||||
|
|
||||||
if temp_file is not None:
|
|
||||||
return temp_file
|
|
||||||
else:
|
|
||||||
cur_pos = file_stream.tell()
|
|
||||||
handle, temp_file = tempfile.mkstemp()
|
|
||||||
fh = os.fdopen(handle, "wb")
|
|
||||||
file_stream.seek(0)
|
|
||||||
fh.write(file_stream.read())
|
|
||||||
file_stream.seek(cur_pos)
|
|
||||||
fh.close()
|
|
||||||
return temp_file
|
|
||||||
|
|
||||||
try:
|
|
||||||
res: Union[None, DocumentConverterResult] = None
|
res: Union[None, DocumentConverterResult] = None
|
||||||
|
|
||||||
# Keep track of which converters throw exceptions
|
# Keep track of which converters throw exceptions
|
||||||
|
|
@ -433,9 +412,9 @@ class MarkItDown:
|
||||||
# Create a copy of the page_converters list, sorted by priority.
|
# Create a copy of the page_converters list, sorted by priority.
|
||||||
# We do this with each call to _convert because the priority of converters may change between calls.
|
# We do this with each call to _convert because the priority of converters may change between calls.
|
||||||
# The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
|
# The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
|
||||||
sorted_converters = sorted(self._page_converters, key=lambda x: x.priority)
|
sorted_converters = sorted(self._converters, key=lambda x: x.priority)
|
||||||
|
|
||||||
for file_info in stream_info_guesses + [None]:
|
for stream_info in stream_info_guesses + [StreamInfo()]:
|
||||||
for converter in sorted_converters:
|
for converter in sorted_converters:
|
||||||
_kwargs = copy.deepcopy(kwargs)
|
_kwargs = copy.deepcopy(kwargs)
|
||||||
|
|
||||||
|
|
@ -449,44 +428,37 @@ class MarkItDown:
|
||||||
if "style_map" not in _kwargs and self._style_map is not None:
|
if "style_map" not in _kwargs and self._style_map is not None:
|
||||||
_kwargs["style_map"] = self._style_map
|
_kwargs["style_map"] = self._style_map
|
||||||
|
|
||||||
if (
|
if "exiftool_path" not in _kwargs and self._exiftool_path is not None:
|
||||||
"exiftool_path" not in _kwargs
|
|
||||||
and self._exiftool_path is not None
|
|
||||||
):
|
|
||||||
_kwargs["exiftool_path"] = self._exiftool_path
|
_kwargs["exiftool_path"] = self._exiftool_path
|
||||||
|
|
||||||
# Add the list of converters for nested processing
|
# Add the list of converters for nested processing
|
||||||
_kwargs["_parent_converters"] = self._page_converters
|
_kwargs["_parent_converters"] = self._converters
|
||||||
|
|
||||||
# Add backwards compatibility
|
# Add legacy kwargs
|
||||||
if isinstance(converter, DocumentConverter):
|
if stream_info is not None:
|
||||||
if file_info is not None:
|
if stream_info.extension is not None:
|
||||||
# Legacy converters need a file_extension
|
_kwargs["file_extension"] = stream_info.extension
|
||||||
if file_info.extension is not None:
|
|
||||||
_kwargs["file_extension"] = file_info.extension
|
|
||||||
|
|
||||||
# And benefit from urls, when available
|
if stream_info.url is not None:
|
||||||
if file_info.url is not None:
|
_kwargs["url"] = stream_info.url
|
||||||
_kwargs["url"] = file_info.url
|
|
||||||
|
|
||||||
|
# Attempt the conversion
|
||||||
|
cur_pos = file_stream.tell()
|
||||||
try:
|
try:
|
||||||
res = converter.convert(get_temp_file(), **_kwargs)
|
res = converter.convert_stream(file_stream, stream_info, **_kwargs)
|
||||||
except Exception:
|
except Exception:
|
||||||
failed_attempts.append(
|
failed_attempts.append(
|
||||||
FailedConversionAttempt(
|
FailedConversionAttempt(
|
||||||
converter=converter, exc_info=sys.exc_info()
|
converter=converter, exc_info=sys.exc_info()
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
else:
|
finally:
|
||||||
raise NotImplementedError("TODO")
|
file_stream.seek(cur_pos)
|
||||||
|
|
||||||
if res is not None:
|
if res is not None:
|
||||||
# Normalize the content
|
# Normalize the content
|
||||||
res.text_content = "\n".join(
|
res.text_content = "\n".join(
|
||||||
[
|
[line.rstrip() for line in re.split(r"\r?\n", res.text_content)]
|
||||||
line.rstrip()
|
|
||||||
for line in re.split(r"\r?\n", res.text_content)
|
|
||||||
]
|
|
||||||
)
|
)
|
||||||
res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
|
res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
|
||||||
return res
|
return res
|
||||||
|
|
@ -500,14 +472,6 @@ class MarkItDown:
|
||||||
f"Could not convert stream to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported."
|
f"Could not convert stream to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported."
|
||||||
)
|
)
|
||||||
|
|
||||||
finally:
|
|
||||||
# Clean up the temporary file
|
|
||||||
if temp_file is not None:
|
|
||||||
try:
|
|
||||||
os.unlink(temp_file)
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
def register_page_converter(self, converter: DocumentConverter) -> None:
|
def register_page_converter(self, converter: DocumentConverter) -> None:
|
||||||
"""DEPRECATED: Use register_converter instead."""
|
"""DEPRECATED: Use register_converter instead."""
|
||||||
warn(
|
warn(
|
||||||
|
|
@ -516,6 +480,6 @@ class MarkItDown:
|
||||||
)
|
)
|
||||||
self.register_converter(converter)
|
self.register_converter(converter)
|
||||||
|
|
||||||
def register_converter(self, converter: DocumentConverter) -> None:
|
def register_converter(self, converter: Union[DocumentConverter]) -> None:
|
||||||
"""Register a page text converter."""
|
"""Register a page text converter."""
|
||||||
self._page_converters.insert(0, converter)
|
self._converters.insert(0, converter)
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,6 @@
|
||||||
#
|
#
|
||||||
# SPDX-License-Identifier: MIT
|
# SPDX-License-Identifier: MIT
|
||||||
|
|
||||||
from ._base import DocumentConverter
|
|
||||||
from ._plain_text_converter import PlainTextConverter
|
from ._plain_text_converter import PlainTextConverter
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
from ._rss_converter import RssConverter
|
from ._rss_converter import RssConverter
|
||||||
|
|
@ -22,7 +21,6 @@ from ._zip_converter import ZipConverter
|
||||||
from ._doc_intel_converter import DocumentIntelligenceConverter
|
from ._doc_intel_converter import DocumentIntelligenceConverter
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"DocumentConverter",
|
|
||||||
"PlainTextConverter",
|
"PlainTextConverter",
|
||||||
"HtmlConverter",
|
"HtmlConverter",
|
||||||
"RssConverter",
|
"RssConverter",
|
||||||
|
|
|
||||||
|
|
@ -1,56 +0,0 @@
|
||||||
from typing import Any, Union
|
|
||||||
from .._base_converter import DocumentConverterResult
|
|
||||||
|
|
||||||
|
|
||||||
class DocumentConverter:
|
|
||||||
"""Abstract superclass of all DocumentConverters."""
|
|
||||||
|
|
||||||
# Lower priority values are tried first.
|
|
||||||
PRIORITY_SPECIFIC_FILE_FORMAT = (
|
|
||||||
0.0 # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia
|
|
||||||
)
|
|
||||||
PRIORITY_GENERIC_FILE_FORMAT = (
|
|
||||||
10.0 # Near catch-all converters for mimetypes like text/*, etc.
|
|
||||||
)
|
|
||||||
|
|
||||||
def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT):
|
|
||||||
"""
|
|
||||||
Initialize the DocumentConverter with a given priority.
|
|
||||||
|
|
||||||
Priorities work as follows: By default, most converters get priority
|
|
||||||
DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception
|
|
||||||
is the PlainTextConverter, which gets priority PRIORITY_GENERIC_FILE_FORMAT (== 10),
|
|
||||||
with lower values being tried first (i.e., higher priority).
|
|
||||||
|
|
||||||
Just prior to conversion, the converters are sorted by priority, using
|
|
||||||
a stable sort. This means that converters with the same priority will
|
|
||||||
remain in the same order, with the most recently registered converters
|
|
||||||
appearing first.
|
|
||||||
|
|
||||||
We have tight control over the order of built-in converters, but
|
|
||||||
plugins can register converters in any order. A converter's priority
|
|
||||||
field reasserts some control over the order of converters.
|
|
||||||
|
|
||||||
Plugins can register converters with any priority, to appear before or
|
|
||||||
after the built-ins. For example, a plugin with priority 9 will run
|
|
||||||
before the PlainTextConverter, but after the built-in converters.
|
|
||||||
"""
|
|
||||||
self._priority = priority
|
|
||||||
|
|
||||||
def convert(
|
|
||||||
self, local_path: str, **kwargs: Any
|
|
||||||
) -> Union[None, DocumentConverterResult]:
|
|
||||||
raise NotImplementedError("Subclasses must implement this method")
|
|
||||||
|
|
||||||
@property
|
|
||||||
def priority(self) -> float:
|
|
||||||
"""Priority of the converter in markitdown's converter list. Lower priority values are tried first."""
|
|
||||||
return self._priority
|
|
||||||
|
|
||||||
@priority.setter
|
|
||||||
def priority(self, value: float):
|
|
||||||
self._priority = value
|
|
||||||
|
|
||||||
@priority.deleter
|
|
||||||
def priority(self):
|
|
||||||
raise AttributeError("Cannot delete the priority attribute")
|
|
||||||
|
|
@ -6,8 +6,7 @@ from typing import Union
|
||||||
from urllib.parse import parse_qs, urlparse
|
from urllib.parse import parse_qs, urlparse
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from ._base import DocumentConverter
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._base_converter import DocumentConverterResult
|
|
||||||
from ._markdownify import _CustomMarkdownify
|
from ._markdownify import _CustomMarkdownify
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2,8 +2,7 @@ from typing import Any, Union
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from ._base import DocumentConverter
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._base_converter import DocumentConverterResult
|
|
||||||
from .._exceptions import MissingDependencyException
|
from .._exceptions import MissingDependencyException
|
||||||
|
|
||||||
# Try loading optional (but in this case, required) dependencies
|
# Try loading optional (but in this case, required) dependencies
|
||||||
|
|
|
||||||
|
|
@ -2,8 +2,7 @@ import sys
|
||||||
|
|
||||||
from typing import Union
|
from typing import Union
|
||||||
|
|
||||||
from ._base import DocumentConverter
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._base_converter import DocumentConverterResult
|
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,7 @@
|
||||||
from typing import Any, Union
|
from typing import Any, Union
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from ._base import DocumentConverter
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._base_converter import DocumentConverterResult
|
|
||||||
from ._markdownify import _CustomMarkdownify
|
from ._markdownify import _CustomMarkdownify
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,5 @@
|
||||||
from typing import Union
|
from typing import Union
|
||||||
from ._base import DocumentConverter
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._base_converter import DocumentConverterResult
|
|
||||||
from ._media_converter import MediaConverter
|
from ._media_converter import MediaConverter
|
||||||
import base64
|
import base64
|
||||||
import mimetypes
|
import mimetypes
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,7 @@
|
||||||
import json
|
import json
|
||||||
from typing import Any, Union
|
from typing import Any, Union
|
||||||
|
|
||||||
from ._base import DocumentConverter
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._base_converter import DocumentConverterResult
|
|
||||||
|
|
||||||
from .._exceptions import FileConversionException
|
from .._exceptions import FileConversionException
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@ import shutil
|
||||||
import json
|
import json
|
||||||
from warnings import warn
|
from warnings import warn
|
||||||
|
|
||||||
from ._base import DocumentConverter
|
from .._base_converter import DocumentConverter
|
||||||
|
|
||||||
|
|
||||||
class MediaConverter(DocumentConverter):
|
class MediaConverter(DocumentConverter):
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,6 @@
|
||||||
import tempfile
|
import tempfile
|
||||||
from typing import Union
|
from typing import Union
|
||||||
from ._base import DocumentConverter
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._base_converter import DocumentConverterResult
|
|
||||||
from ._wav_converter import WavConverter
|
from ._wav_converter import WavConverter
|
||||||
from warnings import resetwarnings, catch_warnings
|
from warnings import resetwarnings, catch_warnings
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,6 @@
|
||||||
import sys
|
import sys
|
||||||
from typing import Any, Union
|
from typing import Any, Union
|
||||||
from ._base import DocumentConverter
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._base_converter import DocumentConverterResult
|
|
||||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||||
|
|
||||||
# Try loading optional (but in this case, required) dependencies
|
# Try loading optional (but in this case, required) dependencies
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,6 @@
|
||||||
import sys
|
import sys
|
||||||
from typing import Union
|
from typing import Union
|
||||||
from ._base import DocumentConverter
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._base_converter import DocumentConverterResult
|
|
||||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||||
|
|
||||||
# Try loading optional (but in this case, required) dependencies
|
# Try loading optional (but in this case, required) dependencies
|
||||||
|
|
|
||||||
|
|
@ -3,8 +3,7 @@ import mimetypes
|
||||||
from charset_normalizer import from_path
|
from charset_normalizer import from_path
|
||||||
from typing import Any, Union
|
from typing import Any, Union
|
||||||
|
|
||||||
from ._base import DocumentConverter
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._base_converter import DocumentConverterResult
|
|
||||||
|
|
||||||
|
|
||||||
# Mimetypes to ignore (commonly confused extensions)
|
# Mimetypes to ignore (commonly confused extensions)
|
||||||
|
|
|
||||||
|
|
@ -5,8 +5,7 @@ import sys
|
||||||
|
|
||||||
from typing import Union
|
from typing import Union
|
||||||
|
|
||||||
from ._base import DocumentConverter
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._base_converter import DocumentConverterResult
|
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -3,8 +3,7 @@ from typing import Union
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from ._markdownify import _CustomMarkdownify
|
from ._markdownify import _CustomMarkdownify
|
||||||
from ._base import DocumentConverter
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._base_converter import DocumentConverterResult
|
|
||||||
|
|
||||||
|
|
||||||
class RssConverter(DocumentConverter):
|
class RssConverter(DocumentConverter):
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,5 @@
|
||||||
from typing import Union
|
from typing import Union
|
||||||
from ._base import DocumentConverter
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._base_converter import DocumentConverterResult
|
|
||||||
from ._media_converter import MediaConverter
|
from ._media_converter import MediaConverter
|
||||||
|
|
||||||
# Optional Transcription support
|
# Optional Transcription support
|
||||||
|
|
|
||||||
|
|
@ -3,8 +3,7 @@ import re
|
||||||
from typing import Any, Union
|
from typing import Any, Union
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from ._base import DocumentConverter
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._base_converter import DocumentConverterResult
|
|
||||||
from ._markdownify import _CustomMarkdownify
|
from ._markdownify import _CustomMarkdownify
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2,8 +2,7 @@ import sys
|
||||||
|
|
||||||
from typing import Union
|
from typing import Union
|
||||||
|
|
||||||
from ._base import DocumentConverter
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._base_converter import DocumentConverterResult
|
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -7,8 +7,7 @@ from typing import Any, Union, Dict, List
|
||||||
from urllib.parse import parse_qs, urlparse
|
from urllib.parse import parse_qs, urlparse
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from ._base import DocumentConverter
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._base_converter import DocumentConverterResult
|
|
||||||
|
|
||||||
|
|
||||||
# Optional YouTube transcription support
|
# Optional YouTube transcription support
|
||||||
|
|
|
||||||
|
|
@ -3,8 +3,7 @@ import zipfile
|
||||||
import shutil
|
import shutil
|
||||||
from typing import Any, Union
|
from typing import Any, Union
|
||||||
|
|
||||||
from ._base import DocumentConverter
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._base_converter import DocumentConverterResult
|
|
||||||
|
|
||||||
|
|
||||||
class ZipConverter(DocumentConverter):
|
class ZipConverter(DocumentConverter):
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue