More progress.

2025-03-04 00:52:57 -08:00 · 2025-03-04 00:52:57 -08:00 · 4129f30c23
commit 4129f30c23
parent 7bc6d827ee
23 changed files with 149 additions and 201 deletions
--- a/packages/markitdown/src/markitdown/init.py
+++ b/packages/markitdown/src/markitdown/init.py
@ -4,7 +4,7 @@
 from .__about__ import __version__
 from ._markitdown import MarkItDown
-from ._base_converter import DocumentConverterResult, BaseDocumentConverter
+from ._base_converter import DocumentConverterResult, DocumentConverter
 from ._stream_info import StreamInfo
 from ._exceptions import (
    MarkItDownException,
@ -13,13 +13,11 @@ from ._exceptions import (
    FileConversionException,
    UnsupportedFormatException,
 )
 from .converters import DocumentConverter
 __all__ = [
    "__version__",
    "MarkItDown",
    "DocumentConverter",
    "BaseDocumentConverter",
    "DocumentConverterResult",
    "MarkItDownException",
    "MissingDependencyException",
--- a/packages/markitdown/src/markitdown/_base_converter.py
+++ b/packages/markitdown/src/markitdown/_base_converter.py
@ -1,5 +1,11 @@
 import os
 import tempfile
 from warnings import warn
 from typing import Any, Union, BinaryIO, Optional, List
 from ._stream_info import StreamInfo
-from typing import Any, Union, BinaryIO, Optional
+
 # Avoid printing the same warning multiple times
 _WARNED: List[str] = []
 class DocumentConverterResult:
@ -39,7 +45,7 @@ class DocumentConverterResult:
        return self.markdown
-class BaseDocumentConverter:
+class DocumentConverter:
    """Abstract superclass of all DocumentConverters."""
    # Lower priority values are tried first.
@ -74,7 +80,7 @@ class BaseDocumentConverter:
        """
        self._priority = priority
-    def convert(
+    def convert_stream(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
@ -106,6 +112,61 @@ class BaseDocumentConverter:
        - FileConversionException: If the mimetype is recognized, but the conversion fails for some other reason.
        - MissingDependencyException: If the converter requires a dependency that is not installed.
        """
        # Default implementation ensures backward compatibility with the legacy convert() method, and
        # should absolutely be overridden in subclasses. This behavior is deprecated and will be removed
        # in the future.
        result = None
        used_legacy = False
        if stream_info.local_path is not None and os.path.exists(
            stream_info.local_path
        ):
            # If the stream is backed by a local file, pass it to the legacy convert() method
            try:
                result = self.convert(stream_info.local_path, **kwargs)
                used_legacy = True
            except (
                NotImplementedError
            ):  # If it wasn't implemented, rethrow the error, but with this as the stack trace
                raise NotImplementedError(
                    "Subclasses must implement the convert_stream method."
                )
        else:
            # Otherwise, we need to read the stream into a temporary file. There is potential for
            # thrashing here if there are many converters or conversion attempts
            cur_pos = file_stream.tell()
            temp_fd, temp_path = tempfile.mkstemp()
            try:
                with os.fdopen(temp_fd, "wb") as temp_file:
                    temp_file.write(file_stream.read())
                try:
                    result = self.convert(temp_path, **kwargs)
                    used_legacy = True
                except NotImplementedError:
                    raise NotImplementedError(
                        "Subclasses must implement the convert_stream method."
                    )
            finally:
                os.remove(temp_path)
                file_stream.seek(0)
        if used_legacy:
            message = f"{type(self).__name__} uses the legacy convert() method, which is deprecated."
            if message not in _WARNED:
                warn(message, DeprecationWarning)
                _WARNED.append(message)
        return result
    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """
        Legacy, deprecated method to convert a document to Markdown text.

        This method reads from the file at `local_path` and returns the converted Markdown text.
        It is deprecated in favor of `convert_stream`, which uses a file-like object. The default
        `convert_stream` implementation calls this method as a fallback, so converters that only
        override `convert` keep working (with a DeprecationWarning).

        Parameters:
        - local_path: Path of the file on disk to convert.
        - kwargs: Additional keyword arguments passed through to the converter.

        Returns:
        - The conversion result. The annotation also permits None; presumably this means the
          converter declined the input (TODO confirm against callers).

        Raises:
        - NotImplementedError: Always, in this base implementation. Subclasses that still rely
          on the legacy API must override this method.
        """
        raise NotImplementedError("Subclasses must implement this method")
    @property
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@ -20,7 +20,6 @@ import requests
 from ._stream_info import StreamInfo
 from .converters import (
    DocumentConverter,
    PlainTextConverter,
    HtmlConverter,
    RssConverter,
@ -41,7 +40,7 @@ from .converters import (
    DocumentIntelligenceConverter,
 )
-from ._base_converter import DocumentConverterResult
+from ._base_converter import DocumentConverter, DocumentConverterResult
 from ._exceptions import (
    FileConversionException,
@ -102,7 +101,7 @@ class MarkItDown:
        self._style_map = None
        # Register the converters
-        self._page_converters: List[DocumentConverter] = []
+        self._converters: List[DocumentConverter] = []
        if (
            enable_builtins is None or enable_builtins
@ -405,26 +404,6 @@ class MarkItDown:
    def _convert(
        self, *, file_stream: BinaryIO, stream_info_guesses: List[StreamInfo], **kwargs
    ) -> DocumentConverterResult:
        # Lazily create a temporary file, if needed, for backward compatibility
        # This is to support a deprecated feature, and will be removed in the future
        temp_file = None
        def get_temp_file():
            nonlocal temp_file
            if temp_file is not None:
                return temp_file
            else:
                cur_pos = file_stream.tell()
                handle, temp_file = tempfile.mkstemp()
                fh = os.fdopen(handle, "wb")
                file_stream.seek(0)
                fh.write(file_stream.read())
                file_stream.seek(cur_pos)
                fh.close()
            return temp_file
        try:
        res: Union[None, DocumentConverterResult] = None
        # Keep track of which converters throw exceptions
@ -433,9 +412,9 @@ class MarkItDown:
        # Create a copy of the page_converters list, sorted by priority.
        # We do this with each call to _convert because the priority of converters may change between calls.
        # The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
-            sorted_converters = sorted(self._page_converters, key=lambda x: x.priority)
+        sorted_converters = sorted(self._converters, key=lambda x: x.priority)
-            for file_info in stream_info_guesses + [None]:
+        for stream_info in stream_info_guesses + [StreamInfo()]:
            for converter in sorted_converters:
                _kwargs = copy.deepcopy(kwargs)
@ -449,44 +428,37 @@ class MarkItDown:
                if "style_map" not in _kwargs and self._style_map is not None:
                    _kwargs["style_map"] = self._style_map
-                    if (
+                if "exiftool_path" not in _kwargs and self._exiftool_path is not None:
                        "exiftool_path" not in _kwargs
                        and self._exiftool_path is not None
                    ):
                    _kwargs["exiftool_path"] = self._exiftool_path
                # Add the list of converters for nested processing
-                    _kwargs["_parent_converters"] = self._page_converters
+                _kwargs["_parent_converters"] = self._converters
-                    # Add backwards compatibility
+                # Add legacy kwargs
-                    if isinstance(converter, DocumentConverter):
+                if stream_info is not None:
-                        if file_info is not None:
+                    if stream_info.extension is not None:
-                            # Legacy converters need a file_extension
+                        _kwargs["file_extension"] = stream_info.extension
                            if file_info.extension is not None:
                                _kwargs["file_extension"] = file_info.extension
-                            # And benefit from urls, when available
+                    if stream_info.url is not None:
-                            if file_info.url is not None:
+                        _kwargs["url"] = stream_info.url
                                _kwargs["url"] = file_info.url
                # Attempt the conversion
                cur_pos = file_stream.tell()
                try:
-                            res = converter.convert(get_temp_file(), **_kwargs)
+                    res = converter.convert_stream(file_stream, stream_info, **_kwargs)
                except Exception:
                    failed_attempts.append(
                        FailedConversionAttempt(
                            converter=converter, exc_info=sys.exc_info()
                        )
                    )
-                    else:
+                finally:
-                        raise NotImplementedError("TODO")
+                    file_stream.seek(cur_pos)
                if res is not None:
                    # Normalize the content
                    res.text_content = "\n".join(
-                            [
+                        [line.rstrip() for line in re.split(r"\r?\n", res.text_content)]
                                line.rstrip()
                                for line in re.split(r"\r?\n", res.text_content)
                            ]
                    )
                    res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
                    return res
@ -500,14 +472,6 @@ class MarkItDown:
            f"Could not convert stream to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported."
        )
        finally:
            # Clean up the temporary file
            if temp_file is not None:
                try:
                    os.unlink(temp_file)
                except Exception:
                    pass
    def register_page_converter(self, converter: DocumentConverter) -> None:
        """DEPRECATED: Use register_converter instead."""
        warn(
@ -516,6 +480,6 @@ class MarkItDown:
        )
        self.register_converter(converter)
-    def register_converter(self, converter: DocumentConverter) -> None:
+    def register_converter(self, converter: Union[DocumentConverter]) -> None:
        """Register a page text converter."""
-        self._page_converters.insert(0, converter)
+        self._converters.insert(0, converter)
--- a/packages/markitdown/src/markitdown/converters/init.py
+++ b/packages/markitdown/src/markitdown/converters/init.py
@ -2,7 +2,6 @@
 #
 # SPDX-License-Identifier: MIT
 from ._base import DocumentConverter
 from ._plain_text_converter import PlainTextConverter
 from ._html_converter import HtmlConverter
 from ._rss_converter import RssConverter
@ -22,7 +21,6 @@ from ._zip_converter import ZipConverter
 from ._doc_intel_converter import DocumentIntelligenceConverter
 __all__ = [
    "DocumentConverter",
    "PlainTextConverter",
    "HtmlConverter",
    "RssConverter",
--- a/packages/markitdown/src/markitdown/converters/_base.py
+++ b/packages/markitdown/src/markitdown/converters/_base.py
@ -1,56 +0,0 @@
 from typing import Any, Union
 from .._base_converter import DocumentConverterResult
 class DocumentConverter:
    """Abstract superclass of all DocumentConverters."""
    # Lower priority values are tried first.
    PRIORITY_SPECIFIC_FILE_FORMAT = (
        0.0  # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia
    )
    PRIORITY_GENERIC_FILE_FORMAT = (
        10.0  # Near catch-all converters for mimetypes like text/*, etc.
    )
    def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT):
        """
        Initialize the DocumentConverter with a given priority.
        Priorities work as follows: By default, most converters get priority
        DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception
        is the PlainTextConverter, which gets priority PRIORITY_GENERIC_FILE_FORMAT (== 10),
        with lower values being tried first (i.e., higher priority).
        Just prior to conversion, the converters are sorted by priority, using
        a stable sort. This means that converters with the same priority will
        remain in the same order, with the most recently registered converters
        appearing first.
        We have tight control over the order of built-in converters, but
        plugins can register converters in any order. A converter's priority
        field reasserts some control over the order of converters.
        Plugins can register converters with any priority, to appear before or
        after the built-ins. For example, a plugin with priority 9 will run
        before the PlainTextConverter, but after the built-in converters.
        """
        self._priority = priority
    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """
        Convert the file at `local_path` to Markdown.
        The base implementation always raises NotImplementedError; subclasses
        must override it.
        """
        raise NotImplementedError("Subclasses must implement this method")
    @property
    def priority(self) -> float:
        """Priority of the converter in markitdown's converter list. Lower priority values are tried first."""
        return self._priority
    @priority.setter
    def priority(self, value: float):
        # Priority is mutable after construction; the value is stored as given.
        self._priority = value
    @priority.deleter
    def priority(self):
        # Deleting the priority is rejected unconditionally.
        raise AttributeError("Cannot delete the priority attribute")
--- a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
@ -6,8 +6,7 @@ from typing import Union
 from urllib.parse import parse_qs, urlparse
 from bs4 import BeautifulSoup
-from ._base import DocumentConverter
+from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._base_converter import DocumentConverterResult
 from ._markdownify import _CustomMarkdownify
--- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
@ -2,8 +2,7 @@ from typing import Any, Union
 import re
 import sys
-from ._base import DocumentConverter
+from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._base_converter import DocumentConverterResult
 from .._exceptions import MissingDependencyException
 # Try loading optional (but in this case, required) dependencies
--- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@ -2,8 +2,7 @@ import sys
 from typing import Union
-from ._base import DocumentConverter
+from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._base_converter import DocumentConverterResult
 from ._html_converter import HtmlConverter
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
--- a/packages/markitdown/src/markitdown/converters/_html_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_html_converter.py
@ -1,8 +1,7 @@
 from typing import Any, Union
 from bs4 import BeautifulSoup
-from ._base import DocumentConverter
+from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._base_converter import DocumentConverterResult
 from ._markdownify import _CustomMarkdownify
--- a/packages/markitdown/src/markitdown/converters/_image_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_image_converter.py
@ -1,6 +1,5 @@
 from typing import Union
-from ._base import DocumentConverter
+from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._base_converter import DocumentConverterResult
 from ._media_converter import MediaConverter
 import base64
 import mimetypes
--- a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py
@ -1,8 +1,7 @@
 import json
 from typing import Any, Union
-from ._base import DocumentConverter
+from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._base_converter import DocumentConverterResult
 from .._exceptions import FileConversionException
--- a/packages/markitdown/src/markitdown/converters/_media_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_media_converter.py
@ -3,7 +3,7 @@ import shutil
 import json
 from warnings import warn
-from ._base import DocumentConverter
+from .._base_converter import DocumentConverter
 class MediaConverter(DocumentConverter):
--- a/packages/markitdown/src/markitdown/converters/_mp3_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_mp3_converter.py
@ -1,7 +1,6 @@
 import tempfile
 from typing import Union
-from ._base import DocumentConverter
+from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._base_converter import DocumentConverterResult
 from ._wav_converter import WavConverter
 from warnings import resetwarnings, catch_warnings
--- a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py
@ -1,7 +1,6 @@
 import sys
 from typing import Any, Union
-from ._base import DocumentConverter
+from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._base_converter import DocumentConverterResult
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 # Try loading optional (but in this case, required) dependencies
--- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@ -1,7 +1,6 @@
 import sys
 from typing import Union
-from ._base import DocumentConverter
+from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._base_converter import DocumentConverterResult
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 # Try loading optional (but in this case, required) dependencies
--- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
@ -3,8 +3,7 @@ import mimetypes
 from charset_normalizer import from_path
 from typing import Any, Union
-from ._base import DocumentConverter
+from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._base_converter import DocumentConverterResult
 # Mimetypes to ignore (commonly confused extensions)
--- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
@ -5,8 +5,7 @@ import sys
 from typing import Union
-from ._base import DocumentConverter
+from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._base_converter import DocumentConverterResult
 from ._html_converter import HtmlConverter
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
--- a/packages/markitdown/src/markitdown/converters/_rss_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_rss_converter.py
@ -3,8 +3,7 @@ from typing import Union
 from bs4 import BeautifulSoup
 from ._markdownify import _CustomMarkdownify
-from ._base import DocumentConverter
+from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._base_converter import DocumentConverterResult
 class RssConverter(DocumentConverter):
--- a/packages/markitdown/src/markitdown/converters/_wav_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_wav_converter.py
@ -1,6 +1,5 @@
 from typing import Union
-from ._base import DocumentConverter
+from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._base_converter import DocumentConverterResult
 from ._media_converter import MediaConverter
 # Optional Transcription support
--- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
@ -3,8 +3,7 @@ import re
 from typing import Any, Union
 from bs4 import BeautifulSoup
-from ._base import DocumentConverter
+from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._base_converter import DocumentConverterResult
 from ._markdownify import _CustomMarkdownify
--- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@ -2,8 +2,7 @@ import sys
 from typing import Union
-from ._base import DocumentConverter
+from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._base_converter import DocumentConverterResult
 from ._html_converter import HtmlConverter
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
--- a/packages/markitdown/src/markitdown/converters/_youtube_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_youtube_converter.py
@ -7,8 +7,7 @@ from typing import Any, Union, Dict, List
 from urllib.parse import parse_qs, urlparse
 from bs4 import BeautifulSoup
-from ._base import DocumentConverter
+from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._base_converter import DocumentConverterResult
 # Optional YouTube transcription support
--- a/packages/markitdown/src/markitdown/converters/_zip_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_zip_converter.py
@ -3,8 +3,7 @@ import zipfile
 import shutil
 from typing import Any, Union
-from ._base import DocumentConverter
+from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._base_converter import DocumentConverterResult
 class ZipConverter(DocumentConverter):