More progress.

This commit is contained in:
Adam Fourney 2025-03-04 00:52:57 -08:00
parent 7bc6d827ee
commit 4129f30c23
23 changed files with 149 additions and 201 deletions

View file

@ -4,7 +4,7 @@
from .__about__ import __version__ from .__about__ import __version__
from ._markitdown import MarkItDown from ._markitdown import MarkItDown
from ._base_converter import DocumentConverterResult, BaseDocumentConverter from ._base_converter import DocumentConverterResult, DocumentConverter
from ._stream_info import StreamInfo from ._stream_info import StreamInfo
from ._exceptions import ( from ._exceptions import (
MarkItDownException, MarkItDownException,
@ -13,13 +13,11 @@ from ._exceptions import (
FileConversionException, FileConversionException,
UnsupportedFormatException, UnsupportedFormatException,
) )
from .converters import DocumentConverter
__all__ = [ __all__ = [
"__version__", "__version__",
"MarkItDown", "MarkItDown",
"DocumentConverter", "DocumentConverter",
"BaseDocumentConverter",
"DocumentConverterResult", "DocumentConverterResult",
"MarkItDownException", "MarkItDownException",
"MissingDependencyException", "MissingDependencyException",

View file

@ -1,5 +1,11 @@
import os
import tempfile
from warnings import warn
from typing import Any, Union, BinaryIO, Optional, List
from ._stream_info import StreamInfo from ._stream_info import StreamInfo
from typing import Any, Union, BinaryIO, Optional
# Avoid printing the same warning multiple times
_WARNED: List[str] = []
class DocumentConverterResult: class DocumentConverterResult:
@ -39,7 +45,7 @@ class DocumentConverterResult:
return self.markdown return self.markdown
class BaseDocumentConverter: class DocumentConverter:
"""Abstract superclass of all DocumentConverters.""" """Abstract superclass of all DocumentConverters."""
# Lower priority values are tried first. # Lower priority values are tried first.
@ -74,7 +80,7 @@ class BaseDocumentConverter:
""" """
self._priority = priority self._priority = priority
def convert( def convert_stream(
self, self,
file_stream: BinaryIO, file_stream: BinaryIO,
stream_info: StreamInfo, stream_info: StreamInfo,
@ -106,6 +112,61 @@ class BaseDocumentConverter:
- FileConversionException: If the mimetype is recognized, but the conversion fails for some other reason. - FileConversionException: If the mimetype is recognized, but the conversion fails for some other reason.
- MissingDependencyException: If the converter requires a dependency that is not installed. - MissingDependencyException: If the converter requires a dependency that is not installed.
""" """
# Default implementation ensures backward compatibility with the legacy convert() method, and
# should absolutely be overridden in subclasses. This behavior is deprecated and will be removed
# in the future.
result = None
used_legacy = False
if stream_info.local_path is not None and os.path.exists(
stream_info.local_path
):
# If the stream is backed by a local file, pass it to the legacy convert() method
try:
result = self.convert(stream_info.local_path, **kwargs)
used_legacy = True
except (
NotImplementedError
): # If it wasn't implemented, rethrow the error, but with this as the stack trace
raise NotImplementedError(
"Subclasses must implement the convert_stream method."
)
else:
# Otherwise, we need to read the stream into a temporary file. There is potential for
# thrashing here if there are many converters or conversion attempts
cur_pos = file_stream.tell()
temp_fd, temp_path = tempfile.mkstemp()
try:
with os.fdopen(temp_fd, "wb") as temp_file:
temp_file.write(file_stream.read())
try:
result = self.convert(temp_path, **kwargs)
used_legacy = True
except NotImplementedError:
raise NotImplementedError(
"Subclasses must implement the convert_stream method."
)
finally:
os.remove(temp_path)
file_stream.seek(0)
if used_legacy:
message = f"{type(self).__name__} uses the legacy convert() method, which is deprecated."
if message not in _WARNED:
warn(message, DeprecationWarning)
_WARNED.append(message)
return result
def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
"""
Legacy, and deprecated method to convert a document to Markdown text.
This method reads from the file at `local_path` and returns the converted Markdown text.
This method is deprecated in favor of `convert_stream`, which uses a file-like object.
"""
raise NotImplementedError("Subclasses must implement this method") raise NotImplementedError("Subclasses must implement this method")
@property @property

View file

@ -20,7 +20,6 @@ import requests
from ._stream_info import StreamInfo from ._stream_info import StreamInfo
from .converters import ( from .converters import (
DocumentConverter,
PlainTextConverter, PlainTextConverter,
HtmlConverter, HtmlConverter,
RssConverter, RssConverter,
@ -41,7 +40,7 @@ from .converters import (
DocumentIntelligenceConverter, DocumentIntelligenceConverter,
) )
from ._base_converter import DocumentConverterResult from ._base_converter import DocumentConverter, DocumentConverterResult
from ._exceptions import ( from ._exceptions import (
FileConversionException, FileConversionException,
@ -102,7 +101,7 @@ class MarkItDown:
self._style_map = None self._style_map = None
# Register the converters # Register the converters
self._page_converters: List[DocumentConverter] = [] self._converters: List[DocumentConverter] = []
if ( if (
enable_builtins is None or enable_builtins enable_builtins is None or enable_builtins
@ -405,26 +404,6 @@ class MarkItDown:
def _convert( def _convert(
self, *, file_stream: BinaryIO, stream_info_guesses: List[StreamInfo], **kwargs self, *, file_stream: BinaryIO, stream_info_guesses: List[StreamInfo], **kwargs
) -> DocumentConverterResult: ) -> DocumentConverterResult:
# Lazily create a temporary file, if needed, for backward compatibility
# This is to support a deprecated feature, and will be removed in the future
temp_file = None
def get_temp_file():
nonlocal temp_file
if temp_file is not None:
return temp_file
else:
cur_pos = file_stream.tell()
handle, temp_file = tempfile.mkstemp()
fh = os.fdopen(handle, "wb")
file_stream.seek(0)
fh.write(file_stream.read())
file_stream.seek(cur_pos)
fh.close()
return temp_file
try:
res: Union[None, DocumentConverterResult] = None res: Union[None, DocumentConverterResult] = None
# Keep track of which converters throw exceptions # Keep track of which converters throw exceptions
@ -433,9 +412,9 @@ class MarkItDown:
# Create a copy of the page_converters list, sorted by priority. # Create a copy of the page_converters list, sorted by priority.
# We do this with each call to _convert because the priority of converters may change between calls. # We do this with each call to _convert because the priority of converters may change between calls.
# The sort is guaranteed to be stable, so converters with the same priority will remain in the same order. # The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
sorted_converters = sorted(self._page_converters, key=lambda x: x.priority) sorted_converters = sorted(self._converters, key=lambda x: x.priority)
for file_info in stream_info_guesses + [None]: for stream_info in stream_info_guesses + [StreamInfo()]:
for converter in sorted_converters: for converter in sorted_converters:
_kwargs = copy.deepcopy(kwargs) _kwargs = copy.deepcopy(kwargs)
@ -449,44 +428,37 @@ class MarkItDown:
if "style_map" not in _kwargs and self._style_map is not None: if "style_map" not in _kwargs and self._style_map is not None:
_kwargs["style_map"] = self._style_map _kwargs["style_map"] = self._style_map
if ( if "exiftool_path" not in _kwargs and self._exiftool_path is not None:
"exiftool_path" not in _kwargs
and self._exiftool_path is not None
):
_kwargs["exiftool_path"] = self._exiftool_path _kwargs["exiftool_path"] = self._exiftool_path
# Add the list of converters for nested processing # Add the list of converters for nested processing
_kwargs["_parent_converters"] = self._page_converters _kwargs["_parent_converters"] = self._converters
# Add backwards compatibility # Add legacy kwargs
if isinstance(converter, DocumentConverter): if stream_info is not None:
if file_info is not None: if stream_info.extension is not None:
# Legacy converters need a file_extension _kwargs["file_extension"] = stream_info.extension
if file_info.extension is not None:
_kwargs["file_extension"] = file_info.extension
# And benefit from urls, when available if stream_info.url is not None:
if file_info.url is not None: _kwargs["url"] = stream_info.url
_kwargs["url"] = file_info.url
# Attempt the conversion
cur_pos = file_stream.tell()
try: try:
res = converter.convert(get_temp_file(), **_kwargs) res = converter.convert_stream(file_stream, stream_info, **_kwargs)
except Exception: except Exception:
failed_attempts.append( failed_attempts.append(
FailedConversionAttempt( FailedConversionAttempt(
converter=converter, exc_info=sys.exc_info() converter=converter, exc_info=sys.exc_info()
) )
) )
else: finally:
raise NotImplementedError("TODO") file_stream.seek(cur_pos)
if res is not None: if res is not None:
# Normalize the content # Normalize the content
res.text_content = "\n".join( res.text_content = "\n".join(
[ [line.rstrip() for line in re.split(r"\r?\n", res.text_content)]
line.rstrip()
for line in re.split(r"\r?\n", res.text_content)
]
) )
res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content) res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
return res return res
@ -500,14 +472,6 @@ class MarkItDown:
f"Could not convert stream to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported." f"Could not convert stream to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported."
) )
finally:
# Clean up the temporary file
if temp_file is not None:
try:
os.unlink(temp_file)
except Exception:
pass
def register_page_converter(self, converter: DocumentConverter) -> None: def register_page_converter(self, converter: DocumentConverter) -> None:
"""DEPRECATED: User register_converter instead.""" """DEPRECATED: User register_converter instead."""
warn( warn(
@ -516,6 +480,6 @@ class MarkItDown:
) )
self.register_converter(converter) self.register_converter(converter)
def register_converter(self, converter: DocumentConverter) -> None: def register_converter(self, converter: Union[DocumentConverter]) -> None:
"""Register a page text converter.""" """Register a page text converter."""
self._page_converters.insert(0, converter) self._converters.insert(0, converter)

View file

@ -2,7 +2,6 @@
# #
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
from ._base import DocumentConverter
from ._plain_text_converter import PlainTextConverter from ._plain_text_converter import PlainTextConverter
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
from ._rss_converter import RssConverter from ._rss_converter import RssConverter
@ -22,7 +21,6 @@ from ._zip_converter import ZipConverter
from ._doc_intel_converter import DocumentIntelligenceConverter from ._doc_intel_converter import DocumentIntelligenceConverter
__all__ = [ __all__ = [
"DocumentConverter",
"PlainTextConverter", "PlainTextConverter",
"HtmlConverter", "HtmlConverter",
"RssConverter", "RssConverter",

View file

@ -1,56 +0,0 @@
from typing import Any, Union
from .._base_converter import DocumentConverterResult
class DocumentConverter:
    """Abstract superclass of all DocumentConverters.

    Subclasses implement `convert`; the base class only stores the
    converter's priority and exposes it through the `priority` property.
    """

    # Lower priority values are tried first.
    PRIORITY_SPECIFIC_FILE_FORMAT = (
        0.0  # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia
    )
    PRIORITY_GENERIC_FILE_FORMAT = (
        10.0  # Near catch-all converters for mimetypes like text/*, etc.
    )

    def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT):
        """
        Initialize the DocumentConverter with a given priority.

        Priorities work as follows: By default, most converters get priority
        DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception
        is the PlainTextConverter, which gets priority
        DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT (== 10), with lower
        values being tried first (i.e., higher priority).

        Just prior to conversion, the converters are sorted by priority, using
        a stable sort. This means that converters with the same priority will
        remain in the same order, with the most recently registered converters
        appearing first.

        We have tight control over the order of built-in converters, but
        plugins can register converters in any order. A converter's priority
        field reasserts some control over the order of converters.

        Plugins can register converters with any priority, to appear before or
        after the built-ins. For example, a plugin with priority 9 will run
        before the PlainTextConverter, but after the built-in converters.

        Parameters:
        - priority: Sort key for this converter; lower values are tried first.
        """
        self._priority = priority

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, "DocumentConverterResult"]:
        """
        Convert the document stored at `local_path` to Markdown.

        Parameters:
        - local_path: Path of the file to convert.
        - kwargs: Additional converter-specific options.

        Returns:
        - A DocumentConverterResult, or None when the converter produces no result.

        Raises:
        - NotImplementedError: Always, in this base class; subclasses must override.
        """
        # The return annotation is quoted so it is not evaluated when the class
        # body executes; the name is still resolved by type checkers.
        raise NotImplementedError("Subclasses must implement this method")

    @property
    def priority(self) -> float:
        """Priority of the converter in markitdown's converter list. Lower priority values are tried first."""
        return self._priority

    @priority.setter
    def priority(self, value: float):
        self._priority = value

    @priority.deleter
    def priority(self):
        # The priority must always be present for sorting, so deletion is refused.
        raise AttributeError("Cannot delete the priority attribute")

View file

@ -6,8 +6,7 @@ from typing import Union
from urllib.parse import parse_qs, urlparse from urllib.parse import parse_qs, urlparse
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from ._base import DocumentConverter from .._base_converter import DocumentConverter, DocumentConverterResult
from .._base_converter import DocumentConverterResult
from ._markdownify import _CustomMarkdownify from ._markdownify import _CustomMarkdownify

View file

@ -2,8 +2,7 @@ from typing import Any, Union
import re import re
import sys import sys
from ._base import DocumentConverter from .._base_converter import DocumentConverter, DocumentConverterResult
from .._base_converter import DocumentConverterResult
from .._exceptions import MissingDependencyException from .._exceptions import MissingDependencyException
# Try loading optional (but in this case, required) dependencies # Try loading optional (but in this case, required) dependencies

View file

@ -2,8 +2,7 @@ import sys
from typing import Union from typing import Union
from ._base import DocumentConverter from .._base_converter import DocumentConverter, DocumentConverterResult
from .._base_converter import DocumentConverterResult
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

View file

@ -1,8 +1,7 @@
from typing import Any, Union from typing import Any, Union
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from ._base import DocumentConverter from .._base_converter import DocumentConverter, DocumentConverterResult
from .._base_converter import DocumentConverterResult
from ._markdownify import _CustomMarkdownify from ._markdownify import _CustomMarkdownify

View file

@ -1,6 +1,5 @@
from typing import Union from typing import Union
from ._base import DocumentConverter from .._base_converter import DocumentConverter, DocumentConverterResult
from .._base_converter import DocumentConverterResult
from ._media_converter import MediaConverter from ._media_converter import MediaConverter
import base64 import base64
import mimetypes import mimetypes

View file

@ -1,8 +1,7 @@
import json import json
from typing import Any, Union from typing import Any, Union
from ._base import DocumentConverter from .._base_converter import DocumentConverter, DocumentConverterResult
from .._base_converter import DocumentConverterResult
from .._exceptions import FileConversionException from .._exceptions import FileConversionException

View file

@ -3,7 +3,7 @@ import shutil
import json import json
from warnings import warn from warnings import warn
from ._base import DocumentConverter from .._base_converter import DocumentConverter
class MediaConverter(DocumentConverter): class MediaConverter(DocumentConverter):

View file

@ -1,7 +1,6 @@
import tempfile import tempfile
from typing import Union from typing import Union
from ._base import DocumentConverter from .._base_converter import DocumentConverter, DocumentConverterResult
from .._base_converter import DocumentConverterResult
from ._wav_converter import WavConverter from ._wav_converter import WavConverter
from warnings import resetwarnings, catch_warnings from warnings import resetwarnings, catch_warnings

View file

@ -1,7 +1,6 @@
import sys import sys
from typing import Any, Union from typing import Any, Union
from ._base import DocumentConverter from .._base_converter import DocumentConverter, DocumentConverterResult
from .._base_converter import DocumentConverterResult
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies # Try loading optional (but in this case, required) dependencies

View file

@ -1,7 +1,6 @@
import sys import sys
from typing import Union from typing import Union
from ._base import DocumentConverter from .._base_converter import DocumentConverter, DocumentConverterResult
from .._base_converter import DocumentConverterResult
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies # Try loading optional (but in this case, required) dependencies

View file

@ -3,8 +3,7 @@ import mimetypes
from charset_normalizer import from_path from charset_normalizer import from_path
from typing import Any, Union from typing import Any, Union
from ._base import DocumentConverter from .._base_converter import DocumentConverter, DocumentConverterResult
from .._base_converter import DocumentConverterResult
# Mimetypes to ignore (commonly confused extensions) # Mimetypes to ignore (commonly confused extensions)

View file

@ -5,8 +5,7 @@ import sys
from typing import Union from typing import Union
from ._base import DocumentConverter from .._base_converter import DocumentConverter, DocumentConverterResult
from .._base_converter import DocumentConverterResult
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

View file

@ -3,8 +3,7 @@ from typing import Union
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from ._markdownify import _CustomMarkdownify from ._markdownify import _CustomMarkdownify
from ._base import DocumentConverter from .._base_converter import DocumentConverter, DocumentConverterResult
from .._base_converter import DocumentConverterResult
class RssConverter(DocumentConverter): class RssConverter(DocumentConverter):

View file

@ -1,6 +1,5 @@
from typing import Union from typing import Union
from ._base import DocumentConverter from .._base_converter import DocumentConverter, DocumentConverterResult
from .._base_converter import DocumentConverterResult
from ._media_converter import MediaConverter from ._media_converter import MediaConverter
# Optional Transcription support # Optional Transcription support

View file

@ -3,8 +3,7 @@ import re
from typing import Any, Union from typing import Any, Union
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from ._base import DocumentConverter from .._base_converter import DocumentConverter, DocumentConverterResult
from .._base_converter import DocumentConverterResult
from ._markdownify import _CustomMarkdownify from ._markdownify import _CustomMarkdownify

View file

@ -2,8 +2,7 @@ import sys
from typing import Union from typing import Union
from ._base import DocumentConverter from .._base_converter import DocumentConverter, DocumentConverterResult
from .._base_converter import DocumentConverterResult
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

View file

@ -7,8 +7,7 @@ from typing import Any, Union, Dict, List
from urllib.parse import parse_qs, urlparse from urllib.parse import parse_qs, urlparse
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from ._base import DocumentConverter from .._base_converter import DocumentConverter, DocumentConverterResult
from .._base_converter import DocumentConverterResult
# Optional YouTube transcription support # Optional YouTube transcription support

View file

@ -3,8 +3,7 @@ import zipfile
import shutil import shutil
from typing import Any, Union from typing import Any, Union
from ._base import DocumentConverter from .._base_converter import DocumentConverter, DocumentConverterResult
from .._base_converter import DocumentConverterResult
class ZipConverter(DocumentConverter): class ZipConverter(DocumentConverter):