More progress.

2025-03-04 00:52:57 -08:00 · 2025-03-04 00:52:57 -08:00 · 4129f30c23
commit 4129f30c23
parent 7bc6d827ee
23 changed files with 149 additions and 201 deletions
--- a/packages/markitdown/src/markitdown/init.py
+++ b/packages/markitdown/src/markitdown/init.py
@ -4,7 +4,7 @@

 from .__about__ import __version__
 from ._markitdown import MarkItDown
-from ._base_converter import DocumentConverterResult, BaseDocumentConverter
+from ._base_converter import DocumentConverterResult, DocumentConverter
 from ._stream_info import StreamInfo
 from ._exceptions import (
    MarkItDownException,
@ -13,13 +13,11 @@ from ._exceptions import (
    FileConversionException,
    UnsupportedFormatException,
 )
-from .converters import DocumentConverter

 __all__ = [
    "__version__",
    "MarkItDown",
    "DocumentConverter",
-    "BaseDocumentConverter",
    "DocumentConverterResult",
    "MarkItDownException",
    "MissingDependencyException",
--- a/packages/markitdown/src/markitdown/_base_converter.py
+++ b/packages/markitdown/src/markitdown/_base_converter.py
@ -1,5 +1,11 @@
+import os
+import tempfile
+from warnings import warn
+from typing import Any, Union, BinaryIO, Optional, List
 from ._stream_info import StreamInfo
-from typing import Any, Union, BinaryIO, Optional
+
+# Avoid printing the same warning multiple times
+_WARNED: List[str] = []


 class DocumentConverterResult:
@ -39,7 +45,7 @@ class DocumentConverterResult:
        return self.markdown


-class BaseDocumentConverter:
+class DocumentConverter:
    """Abstract superclass of all DocumentConverters."""

    # Lower priority values are tried first.
@ -74,7 +80,7 @@ class BaseDocumentConverter:
        """
        self._priority = priority

-    def convert(
+    def convert_stream(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
@ -106,6 +112,61 @@ class BaseDocumentConverter:
        - FileConversionException: If the mimetype is recognized, but the conversion fails for some other reason.
        - MissingDependencyException: If the converter requires a dependency that is not installed.
        """
+
+        # Default implementation ensures backward compatibility with the legacy convert() method, and
+        # should absolutely be overridden in subclasses. This behavior is deprecated and will be removed
+        # in the future.
+        result = None
+        used_legacy = False
+
+        if stream_info.local_path is not None and os.path.exists(
+            stream_info.local_path
+        ):
+            # If the stream is backed by a local file, pass it to the legacy convert() method
+            try:
+                result = self.convert(stream_info.local_path, **kwargs)
+                used_legacy = True
+            except (
+                NotImplementedError
+            ):  # If it wasn't implemented, rethrow the error, but with this as the stack trace
+                raise NotImplementedError(
+                    "Subclasses must implement the convert_stream method."
+                )
+        else:
+            # Otherwise, we need to read the stream into a temporary file. There is potential for
+            # thrashing here if there are many converters or conversion attempts
+            cur_pos = file_stream.tell()
+            temp_fd, temp_path = tempfile.mkstemp()
+            try:
+                with os.fdopen(temp_fd, "wb") as temp_file:
+                    temp_file.write(file_stream.read())
+                try:
+                    result = self.convert(temp_path, **kwargs)
+                    used_legacy = True
+                except NotImplementedError:
+                    raise NotImplementedError(
+                        "Subclasses must implement the convert_stream method."
+                    )
+            finally:
+                os.remove(temp_path)
+                file_stream.seek(0)
+
+        if used_legacy:
+            message = f"{type(self).__name__} uses the legacy convert() method, which is deprecated."
+            if message not in _WARNED:
+                warn(message, DeprecationWarning)
+                _WARNED.append(message)
+
+        return result
+
+    def convert(
+        self, local_path: str, **kwargs: Any
+    ) -> Union[None, DocumentConverterResult]:
+        """
+        Legacy, deprecated method to convert a document to Markdown text.
+        This method reads from the file at `local_path` and returns the converted Markdown text.
+        This method is deprecated in favor of `convert_stream`, which uses a file-like object.
+        """
        raise NotImplementedError("Subclasses must implement this method")

    @property
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@ -20,7 +20,6 @@ import requests
 from ._stream_info import StreamInfo

 from .converters import (
-    DocumentConverter,
    PlainTextConverter,
    HtmlConverter,
    RssConverter,
@ -41,7 +40,7 @@ from .converters import (
    DocumentIntelligenceConverter,
 )

-from ._base_converter import DocumentConverterResult
+from ._base_converter import DocumentConverter, DocumentConverterResult

 from ._exceptions import (
    FileConversionException,
@ -102,7 +101,7 @@ class MarkItDown:
        self._style_map = None

        # Register the converters
-        self._page_converters: List[DocumentConverter] = []
+        self._converters: List[DocumentConverter] = []

        if (
            enable_builtins is None or enable_builtins
@ -405,26 +404,6 @@ class MarkItDown:
    def _convert(
        self, *, file_stream: BinaryIO, stream_info_guesses: List[StreamInfo], **kwargs
    ) -> DocumentConverterResult:
-        # Lazily create a temporary file, if needed, for backward compatibility
-        # This is to support a deprecated feature, and will be removed in the future
-        temp_file = None
-
-        def get_temp_file():
-            nonlocal temp_file
-
-            if temp_file is not None:
-                return temp_file
-            else:
-                cur_pos = file_stream.tell()
-                handle, temp_file = tempfile.mkstemp()
-                fh = os.fdopen(handle, "wb")
-                file_stream.seek(0)
-                fh.write(file_stream.read())
-                file_stream.seek(cur_pos)
-                fh.close()
-            return temp_file
-
-        try:
        res: Union[None, DocumentConverterResult] = None

        # Keep track of which converters throw exceptions
@ -433,9 +412,9 @@ class MarkItDown:
        # Create a copy of the page_converters list, sorted by priority.
        # We do this with each call to _convert because the priority of converters may change between calls.
        # The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
-            sorted_converters = sorted(self._page_converters, key=lambda x: x.priority)
+        sorted_converters = sorted(self._converters, key=lambda x: x.priority)

-            for file_info in stream_info_guesses + [None]:
+        for stream_info in stream_info_guesses + [StreamInfo()]:
            for converter in sorted_converters:
                _kwargs = copy.deepcopy(kwargs)

@ -449,44 +428,37 @@ class MarkItDown:
                if "style_map" not in _kwargs and self._style_map is not None:
                    _kwargs["style_map"] = self._style_map

-                    if (
-                        "exiftool_path" not in _kwargs
-                        and self._exiftool_path is not None
-                    ):
+                if "exiftool_path" not in _kwargs and self._exiftool_path is not None:
                    _kwargs["exiftool_path"] = self._exiftool_path

                # Add the list of converters for nested processing
-                    _kwargs["_parent_converters"] = self._page_converters
+                _kwargs["_parent_converters"] = self._converters

-                    # Add backwards compatibility
-                    if isinstance(converter, DocumentConverter):
-                        if file_info is not None:
-                            # Legacy converters need a file_extension
-                            if file_info.extension is not None:
-                                _kwargs["file_extension"] = file_info.extension
+                # Add legacy kwargs
+                if stream_info is not None:
+                    if stream_info.extension is not None:
+                        _kwargs["file_extension"] = stream_info.extension

-                            # And benefit from urls, when available
-                            if file_info.url is not None:
-                                _kwargs["url"] = file_info.url
+                    if stream_info.url is not None:
+                        _kwargs["url"] = stream_info.url

+                # Attempt the conversion
+                cur_pos = file_stream.tell()
                try:
-                            res = converter.convert(get_temp_file(), **_kwargs)
+                    res = converter.convert_stream(file_stream, stream_info, **_kwargs)
                except Exception:
                    failed_attempts.append(
                        FailedConversionAttempt(
                            converter=converter, exc_info=sys.exc_info()
                        )
                    )
-                    else:
-                        raise NotImplementedError("TODO")
+                finally:
+                    file_stream.seek(cur_pos)

                if res is not None:
                    # Normalize the content
                    res.text_content = "\n".join(
-                            [
-                                line.rstrip()
-                                for line in re.split(r"\r?\n", res.text_content)
-                            ]
+                        [line.rstrip() for line in re.split(r"\r?\n", res.text_content)]
                    )
                    res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
                    return res
@ -500,14 +472,6 @@ class MarkItDown:
            f"Could not convert stream to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported."
        )

-        finally:
-            # Clean up the temporary file
-            if temp_file is not None:
-                try:
-                    os.unlink(temp_file)
-                except Exception:
-                    pass
-
    def register_page_converter(self, converter: DocumentConverter) -> None:
        """DEPRECATED: Use register_converter instead."""
        warn(
@ -516,6 +480,6 @@ class MarkItDown:
        )
        self.register_converter(converter)

-    def register_converter(self, converter: DocumentConverter) -> None:
+    def register_converter(self, converter: Union[DocumentConverter]) -> None:
        """Register a page text converter."""
-        self._page_converters.insert(0, converter)
+        self._converters.insert(0, converter)
--- a/packages/markitdown/src/markitdown/converters/init.py
+++ b/packages/markitdown/src/markitdown/converters/init.py
@ -2,7 +2,6 @@
 #
 # SPDX-License-Identifier: MIT

-from ._base import DocumentConverter
 from ._plain_text_converter import PlainTextConverter
 from ._html_converter import HtmlConverter
 from ._rss_converter import RssConverter
@ -22,7 +21,6 @@ from ._zip_converter import ZipConverter
 from ._doc_intel_converter import DocumentIntelligenceConverter

 __all__ = [
-    "DocumentConverter",
    "PlainTextConverter",
    "HtmlConverter",
    "RssConverter",
--- a/packages/markitdown/src/markitdown/converters/_base.py
+++ b/packages/markitdown/src/markitdown/converters/_base.py
@ -1,56 +0,0 @@
-from typing import Any, Union
-from .._base_converter import DocumentConverterResult
-
-
-class DocumentConverter:
-    """Abstract superclass of all DocumentConverters."""
-
-    # Lower priority values are tried first.
-    PRIORITY_SPECIFIC_FILE_FORMAT = (
-        0.0  # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia
-    )
-    PRIORITY_GENERIC_FILE_FORMAT = (
-        10.0  # Near catch-all converters for mimetypes like text/*, etc.
-    )
-
-    def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT):
-        """
-        Initialize the DocumentConverter with a given priority.
-
-        Priorities work as follows: By default, most converters get priority
-        DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception
-        is the PlainTextConverter, which gets priority PRIORITY_SPECIFIC_FILE_FORMAT (== 10),
-        with lower values being tried first (i.e., higher priority).
-
-        Just prior to conversion, the converters are sorted by priority, using
-        a stable sort. This means that converters with the same priority will
-        remain in the same order, with the most recently registered converters
-        appearing first.
-
-        We have tight control over the order of built-in converters, but
-        plugins can register converters in any order. A converter's priority
-        field reasserts some control over the order of converters.
-
-        Plugins can register converters with any priority, to appear before or
-        after the built-ins. For example, a plugin with priority 9 will run
-        before the PlainTextConverter, but after the built-in converters.
-        """
-        self._priority = priority
-
-    def convert(
-        self, local_path: str, **kwargs: Any
-    ) -> Union[None, DocumentConverterResult]:
-        raise NotImplementedError("Subclasses must implement this method")
-
-    @property
-    def priority(self) -> float:
-        """Priority of the converter in markitdown's converter list. Higher priority values are tried first."""
-        return self._priority
-
-    @priority.setter
-    def priority(self, value: float):
-        self._priority = value
-
-    @priority.deleter
-    def priority(self):
-        raise AttributeError("Cannot delete the priority attribute")
--- a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
@ -6,8 +6,7 @@ from typing import Union
 from urllib.parse import parse_qs, urlparse
 from bs4 import BeautifulSoup

-from ._base import DocumentConverter
-from .._base_converter import DocumentConverterResult
+from .._base_converter import DocumentConverter, DocumentConverterResult
 from ._markdownify import _CustomMarkdownify


--- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
@ -2,8 +2,7 @@ from typing import Any, Union
 import re
 import sys

-from ._base import DocumentConverter
-from .._base_converter import DocumentConverterResult
+from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._exceptions import MissingDependencyException

 # Try loading optional (but in this case, required) dependencies
--- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@ -2,8 +2,7 @@ import sys

 from typing import Union

-from ._base import DocumentConverter
-from .._base_converter import DocumentConverterResult
+from .._base_converter import DocumentConverter, DocumentConverterResult
 from ._html_converter import HtmlConverter
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

--- a/packages/markitdown/src/markitdown/converters/_html_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_html_converter.py
@ -1,8 +1,7 @@
 from typing import Any, Union
 from bs4 import BeautifulSoup

-from ._base import DocumentConverter
-from .._base_converter import DocumentConverterResult
+from .._base_converter import DocumentConverter, DocumentConverterResult
 from ._markdownify import _CustomMarkdownify


--- a/packages/markitdown/src/markitdown/converters/_image_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_image_converter.py
@ -1,6 +1,5 @@
 from typing import Union
-from ._base import DocumentConverter
-from .._base_converter import DocumentConverterResult
+from .._base_converter import DocumentConverter, DocumentConverterResult
 from ._media_converter import MediaConverter
 import base64
 import mimetypes
--- a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py
@ -1,8 +1,7 @@
 import json
 from typing import Any, Union

-from ._base import DocumentConverter
-from .._base_converter import DocumentConverterResult
+from .._base_converter import DocumentConverter, DocumentConverterResult

 from .._exceptions import FileConversionException

--- a/packages/markitdown/src/markitdown/converters/_media_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_media_converter.py
@ -3,7 +3,7 @@ import shutil
 import json
 from warnings import warn

-from ._base import DocumentConverter
+from .._base_converter import DocumentConverter


 class MediaConverter(DocumentConverter):
--- a/packages/markitdown/src/markitdown/converters/_mp3_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_mp3_converter.py
@ -1,7 +1,6 @@
 import tempfile
 from typing import Union
-from ._base import DocumentConverter
-from .._base_converter import DocumentConverterResult
+from .._base_converter import DocumentConverter, DocumentConverterResult
 from ._wav_converter import WavConverter
 from warnings import resetwarnings, catch_warnings

--- a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py
@ -1,7 +1,6 @@
 import sys
 from typing import Any, Union
-from ._base import DocumentConverter
-from .._base_converter import DocumentConverterResult
+from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

 # Try loading optional (but in this case, required) dependencies
--- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@ -1,7 +1,6 @@
 import sys
 from typing import Union
-from ._base import DocumentConverter
-from .._base_converter import DocumentConverterResult
+from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

 # Try loading optional (but in this case, required) dependencies
--- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
@ -3,8 +3,7 @@ import mimetypes
 from charset_normalizer import from_path
 from typing import Any, Union

-from ._base import DocumentConverter
-from .._base_converter import DocumentConverterResult
+from .._base_converter import DocumentConverter, DocumentConverterResult


 # Mimetypes to ignore (commonly confused extensions)
--- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
@ -5,8 +5,7 @@ import sys

 from typing import Union

-from ._base import DocumentConverter
-from .._base_converter import DocumentConverterResult
+from .._base_converter import DocumentConverter, DocumentConverterResult
 from ._html_converter import HtmlConverter
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

--- a/packages/markitdown/src/markitdown/converters/_rss_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_rss_converter.py
@ -3,8 +3,7 @@ from typing import Union
 from bs4 import BeautifulSoup

 from ._markdownify import _CustomMarkdownify
-from ._base import DocumentConverter
-from .._base_converter import DocumentConverterResult
+from .._base_converter import DocumentConverter, DocumentConverterResult


 class RssConverter(DocumentConverter):
--- a/packages/markitdown/src/markitdown/converters/_wav_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_wav_converter.py
@ -1,6 +1,5 @@
 from typing import Union
-from ._base import DocumentConverter
-from .._base_converter import DocumentConverterResult
+from .._base_converter import DocumentConverter, DocumentConverterResult
 from ._media_converter import MediaConverter

 # Optional Transcription support
--- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
@ -3,8 +3,7 @@ import re
 from typing import Any, Union
 from bs4 import BeautifulSoup

-from ._base import DocumentConverter
-from .._base_converter import DocumentConverterResult
+from .._base_converter import DocumentConverter, DocumentConverterResult
 from ._markdownify import _CustomMarkdownify


--- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@ -2,8 +2,7 @@ import sys

 from typing import Union

-from ._base import DocumentConverter
-from .._base_converter import DocumentConverterResult
+from .._base_converter import DocumentConverter, DocumentConverterResult
 from ._html_converter import HtmlConverter
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

--- a/packages/markitdown/src/markitdown/converters/_youtube_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_youtube_converter.py
@ -7,8 +7,7 @@ from typing import Any, Union, Dict, List
 from urllib.parse import parse_qs, urlparse
 from bs4 import BeautifulSoup

-from ._base import DocumentConverter
-from .._base_converter import DocumentConverterResult
+from .._base_converter import DocumentConverter, DocumentConverterResult


 # Optional YouTube transcription support
--- a/packages/markitdown/src/markitdown/converters/_zip_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_zip_converter.py
@ -3,8 +3,7 @@ import zipfile
 import shutil
 from typing import Any, Union

-from ._base import DocumentConverter
-from .._base_converter import DocumentConverterResult
+from .._base_converter import DocumentConverter, DocumentConverterResult


 class ZipConverter(DocumentConverter):