Initial work updating signatures.

2025-03-03 13:16:15 -08:00 · 2025-03-03 13:16:15 -08:00 · e43632b048
commit e43632b048
parent 1d2f231146
22 changed files with 180 additions and 85 deletions
--- a/packages/markitdown/src/markitdown/init.py
+++ b/packages/markitdown/src/markitdown/init.py
@ -4,6 +4,7 @@

 from .__about__ import __version__
 from ._markitdown import MarkItDown
+from ._base_converter import DocumentConverterResult, BaseDocumentConverter
 from ._exceptions import (
    MarkItDownException,
    MissingDependencyException,
@ -11,12 +12,13 @@ from ._exceptions import (
    FileConversionException,
    UnsupportedFormatException,
 )
-from .converters import DocumentConverter, DocumentConverterResult
+from .converters import DocumentConverter

 __all__ = [
    "__version__",
    "MarkItDown",
    "DocumentConverter",
+    "BaseDocumentConverter",
    "DocumentConverterResult",
    "MarkItDownException",
    "MissingDependencyException",
--- a/packages/markitdown/src/markitdown/_base_converter.py
+++ b/packages/markitdown/src/markitdown/_base_converter.py
@ -0,0 +1,116 @@
+from typing import Any, Union, BinaryIO, Optional
+
+
+class DocumentConverterResult:
+    """The result of converting a document to Markdown."""
+
+    def __init__(
+        self,
+        markdown: str,
+        *,
+        title: Optional[str] = None,
+    ):
+        """
+        Initialize the DocumentConverterResult.
+
+        Parameters:
+        - markdown: The converted Markdown text.
+        - title: Optional title of the document.
+        """
+        self.markdown = markdown
+        self.title = title
+
+    @property
+    def text_content(self) -> str:
+        """Soft-deprecated alias for `markdown`. New code should migrate to using `markdown` or __str__."""
+        return self.markdown
+
+    @text_content.setter
+    def text_content(self, markdown: str):
+        """Soft-deprecated alias for `markdown`. New code should migrate to using `markdown` or __str__."""
+        self.markdown = markdown
+
+    def __str__(self) -> str:
+        """Return the Markdown content."""
+        return self.markdown
+
+
+class BaseDocumentConverter:
+    """Abstract superclass of all DocumentConverters."""
+
+    # Lower priority values are tried first.
+    PRIORITY_SPECIFIC_FILE_FORMAT = (
+        0.0  # e.g., .docx, .pdf, .xlsx, or specific pages such as Wikipedia
+    )
+    PRIORITY_GENERIC_FILE_FORMAT = (
+        10.0  # Near catch-all converters for mimetypes like text/*, etc.
+    )
+
+    def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT):
+        """
+        Initialize the DocumentConverter with a given priority.
+
+        Priorities work as follows: By default, most converters get priority
+        DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception
+        is the PlainTextConverter, which gets priority PRIORITY_GENERIC_FILE_FORMAT (== 10),
+        with lower values being tried first (i.e., higher priority).
+
+        Just prior to conversion, the converters are sorted by priority, using
+        a stable sort. This means that converters with the same priority will
+        remain in the same order, with the most recently registered converters
+        appearing first.
+
+        We have tight control over the order of built-in converters, but
+        plugins can register converters in any order. A converter's priority
+        field reasserts some control over the order of converters.
+
+        Plugins can register converters with any priority, to appear before or
+        after the built-ins. For example, a plugin with priority 9 will run
+        before the PlainTextConverter, but after the built-in converters.
+        """
+        self._priority = priority
+
+    def convert(
+        self,
+        file_stream,
+        *,
+        mime_type: str = "application/octet-stream",
+        file_extension: Optional[str] = None,
+        charset: Optional[str] = None,
+        **kwargs: Any,
+    ) -> Union[None, DocumentConverterResult]:
+        """
+        Convert a document to Markdown text, or return None if the converter
+        cannot handle the document (causing the next converter to be tried).
+
+        The determination of whether a converter can handle a document is primarily based on
+        the provided MIME type. The file extension can serve as a secondary check if the
+        MIME type is not sufficiently specific (e.g., application/octet-stream). Finally, the
+        charset is used to determine the encoding of the file content in cases of text/* MIME types.
+
+        Parameters:
+        - file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.
+        - mime_type: The MIME type of the file. Default is "application/octet-stream".
+        - file_extension: The file extension of the file. Default is None.
+        - charset: The character set of the file. Default is None.
+        - kwargs: Additional keyword arguments for the converter.
+
+        Returns:
+        - DocumentConverterResult: The result of the conversion, which includes the title and markdown content.
+        or
+        - None: If the converter cannot handle the document.
+
+        Raises:
+        - FileConversionException: If the mimetype is recognized, but the conversion fails for some other reason.
+        - MissingDependencyException: If the converter requires a dependency that is not installed.
+        """
+        raise NotImplementedError("Subclasses must implement this method")
+
+    @property
+    def priority(self) -> float:
+        """Priority of the converter in markitdown's converter list. Lower priority values are tried first."""
+        return self._priority
+
+    @priority.setter
+    def priority(self, value: float):
+        self._priority = value
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@ -18,7 +18,6 @@ import requests

 from .converters import (
    DocumentConverter,
-    DocumentConverterResult,
    PlainTextConverter,
    HtmlConverter,
    RssConverter,
@ -39,6 +38,8 @@ from .converters import (
    DocumentIntelligenceConverter,
 )

+from ._base_converter import DocumentConverterResult
+
 from ._exceptions import (
    FileConversionException,
    UnsupportedFormatException,
--- a/packages/markitdown/src/markitdown/converters/init.py
+++ b/packages/markitdown/src/markitdown/converters/init.py
@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: MIT

-from ._base import DocumentConverter, DocumentConverterResult
+from ._base import DocumentConverter
 from ._plain_text_converter import PlainTextConverter
 from ._html_converter import HtmlConverter
 from ._rss_converter import RssConverter
@ -23,7 +23,6 @@ from ._doc_intel_converter import DocumentIntelligenceConverter

 __all__ = [
    "DocumentConverter",
-    "DocumentConverterResult",
    "PlainTextConverter",
    "HtmlConverter",
    "RssConverter",
--- a/packages/markitdown/src/markitdown/converters/_base.py
+++ b/packages/markitdown/src/markitdown/converters/_base.py
@ -1,12 +1,5 @@
 from typing import Any, Union
-
-
-class DocumentConverterResult:
-    """The result of converting a document to text."""
-
-    def __init__(self, title: Union[str, None] = None, text_content: str = ""):
-        self.title: Union[str, None] = title
-        self.text_content: str = text_content
+from .._base_converter import DocumentConverterResult


 class DocumentConverter:
--- a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
@ -6,7 +6,8 @@ from typing import Union
 from urllib.parse import parse_qs, urlparse
 from bs4 import BeautifulSoup

-from ._base import DocumentConverter, DocumentConverterResult
+from ._base import DocumentConverter
+from .._base_converter import DocumentConverterResult
 from ._markdownify import _CustomMarkdownify


@ -81,6 +82,6 @@ class BingSerpConverter(DocumentConverter):
        )

        return DocumentConverterResult(
+            markdown=webpage_text,
            title=None if soup.title is None else soup.title.string,
-            text_content=webpage_text,
        )
--- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
@ -2,7 +2,8 @@ from typing import Any, Union
 import re
 import sys

-from ._base import DocumentConverter, DocumentConverterResult
+from ._base import DocumentConverter
+from .._base_converter import DocumentConverterResult
 from .._exceptions import MissingDependencyException

 # Try loading optional (but in this case, required) dependencies
@ -103,7 +104,4 @@ class DocumentIntelligenceConverter(DocumentConverter):

        # remove comments from the markdown content generated by Doc Intelligence and append to markdown string
        markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL)
-        return DocumentConverterResult(
-            title=None,
-            text_content=markdown_text,
-        )
+        return DocumentConverterResult(markdown=markdown_text)
--- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@ -2,11 +2,8 @@ import sys

 from typing import Union

-from ._base import (
-    DocumentConverterResult,
-)
-
 from ._base import DocumentConverter
+from .._base_converter import DocumentConverterResult
 from ._html_converter import HtmlConverter
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

--- a/packages/markitdown/src/markitdown/converters/_html_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_html_converter.py
@ -1,7 +1,8 @@
 from typing import Any, Union
 from bs4 import BeautifulSoup

-from ._base import DocumentConverter, DocumentConverterResult
+from ._base import DocumentConverter
+from .._base_converter import DocumentConverterResult
 from ._markdownify import _CustomMarkdownify


@ -51,6 +52,6 @@ class HtmlConverter(DocumentConverter):
        webpage_text = webpage_text.strip()

        return DocumentConverterResult(
+            markdown=webpage_text,
            title=None if soup.title is None else soup.title.string,
-            text_content=webpage_text,
        )
--- a/packages/markitdown/src/markitdown/converters/_image_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_image_converter.py
@ -1,5 +1,6 @@
 from typing import Union
-from ._base import DocumentConverter, DocumentConverterResult
+from ._base import DocumentConverter
+from .._base_converter import DocumentConverterResult
 from ._media_converter import MediaConverter
 import base64
 import mimetypes
@ -59,8 +60,7 @@ class ImageConverter(MediaConverter):
            )

        return DocumentConverterResult(
-            title=None,
-            text_content=md_content,
+            markdown=md_content,
        )

    def _get_llm_description(self, local_path, extension, client, model, prompt=None):
--- a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py
@ -1,10 +1,8 @@
 import json
 from typing import Any, Union

-from ._base import (
-    DocumentConverter,
-    DocumentConverterResult,
-)
+from ._base import DocumentConverter
+from .._base_converter import DocumentConverterResult

 from .._exceptions import FileConversionException

@ -65,8 +63,8 @@ class IpynbConverter(DocumentConverter):
            title = notebook_content.get("metadata", {}).get("title", title)

            return DocumentConverterResult(
+                markdown=md_text,
                title=title,
-                text_content=md_text,
            )

        except Exception as e:
--- a/packages/markitdown/src/markitdown/converters/_mp3_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_mp3_converter.py
@ -1,6 +1,7 @@
 import tempfile
 from typing import Union
-from ._base import DocumentConverter, DocumentConverterResult
+from ._base import DocumentConverter
+from .._base_converter import DocumentConverterResult
 from ._wav_converter import WavConverter
 from warnings import resetwarnings, catch_warnings

@ -83,7 +84,4 @@ class Mp3Converter(WavConverter):
                os.unlink(temp_path)

        # Return the result
-        return DocumentConverterResult(
-            title=None,
-            text_content=md_content.strip(),
-        )
+        return DocumentConverterResult(markdown=md_content.strip())
--- a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py
@ -1,6 +1,7 @@
 import sys
 from typing import Any, Union
-from ._base import DocumentConverter, DocumentConverterResult
+from ._base import DocumentConverter
+from .._base_converter import DocumentConverterResult
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

 # Try loading optional (but in this case, required) dependencies
@ -73,7 +74,8 @@ class OutlookMsgConverter(DocumentConverter):
            msg.close()

            return DocumentConverterResult(
-                title=headers.get("Subject"), text_content=md_content.strip()
+                markdown=md_content.strip(),
+                title=headers.get("Subject"),
            )

        except Exception as e:
--- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@ -1,6 +1,7 @@
 import sys
 from typing import Union
-from ._base import DocumentConverter, DocumentConverterResult
+from ._base import DocumentConverter
+from .._base_converter import DocumentConverterResult
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

 # Try loading optional (but in this case, required) dependencies
@ -43,6 +44,5 @@ class PdfConverter(DocumentConverter):
            )  # Restore the original traceback

        return DocumentConverterResult(
-            title=None,
-            text_content=pdfminer.high_level.extract_text(local_path),
+            markdown=pdfminer.high_level.extract_text(local_path)
        )
--- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
@ -3,7 +3,8 @@ import mimetypes
 from charset_normalizer import from_path
 from typing import Any, Union

-from ._base import DocumentConverter, DocumentConverterResult
+from ._base import DocumentConverter
+from .._base_converter import DocumentConverterResult


 # Mimetypes to ignore (commonly confused extensions)
@ -43,7 +44,4 @@ class PlainTextConverter(DocumentConverter):
            return None

        text_content = str(from_path(local_path).best())
-        return DocumentConverterResult(
-            title=None,
-            text_content=text_content,
-        )
+        return DocumentConverterResult(markdown=text_content)
--- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
@ -5,7 +5,8 @@ import sys

 from typing import Union

-from ._base import DocumentConverterResult, DocumentConverter
+from ._base import DocumentConverter
+from .._base_converter import DocumentConverterResult
 from ._html_converter import HtmlConverter
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

@ -174,10 +175,7 @@ class PptxConverter(HtmlConverter):
                    md_content += notes_frame.text
                md_content = md_content.strip()

-        return DocumentConverterResult(
-            title=None,
-            text_content=md_content.strip(),
-        )
+        return DocumentConverterResult(markdown=md_content.strip())

    def _is_picture(self, shape):
        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
--- a/packages/markitdown/src/markitdown/converters/_rss_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_rss_converter.py
@ -3,7 +3,8 @@ from typing import Union
 from bs4 import BeautifulSoup

 from ._markdownify import _CustomMarkdownify
-from ._base import DocumentConverter, DocumentConverterResult
+from ._base import DocumentConverter
+from .._base_converter import DocumentConverterResult


 class RssConverter(DocumentConverter):
@ -73,8 +74,8 @@ class RssConverter(DocumentConverter):
                    md_text += self._parse_content(entry_content)

            return DocumentConverterResult(
+                markdown=md_text,
                title=title,
-                text_content=md_text,
            )
        except BaseException as _:
            return None
@ -117,8 +118,8 @@ class RssConverter(DocumentConverter):
                    md_text += self._parse_content(content)

            return DocumentConverterResult(
+                markdown=md_text,
                title=channel_title,
-                text_content=md_text,
            )
        except BaseException as _:
            print(traceback.format_exc())
--- a/packages/markitdown/src/markitdown/converters/_wav_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_wav_converter.py
@ -1,5 +1,6 @@
 from typing import Union
-from ._base import DocumentConverter, DocumentConverterResult
+from ._base import DocumentConverter
+from .._base_converter import DocumentConverterResult
 from ._media_converter import MediaConverter

 # Optional Transcription support
@ -60,10 +61,7 @@ class WavConverter(MediaConverter):
                    "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
                )

-        return DocumentConverterResult(
-            title=None,
-            text_content=md_content.strip(),
-        )
+        return DocumentConverterResult(markdown=md_content.strip())

    def _transcribe_audio(self, local_path) -> str:
        recognizer = sr.Recognizer()
--- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
@ -3,7 +3,8 @@ import re
 from typing import Any, Union
 from bs4 import BeautifulSoup

-from ._base import DocumentConverter, DocumentConverterResult
+from ._base import DocumentConverter
+from .._base_converter import DocumentConverterResult
 from ._markdownify import _CustomMarkdownify


@ -56,6 +57,6 @@ class WikipediaConverter(DocumentConverter):
            webpage_text = _CustomMarkdownify().convert_soup(soup)

        return DocumentConverterResult(
+            markdown=webpage_text,
            title=main_title,
-            text_content=webpage_text,
        )
--- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@ -2,7 +2,8 @@ import sys

 from typing import Union

-from ._base import DocumentConverter, DocumentConverterResult
+from ._base import DocumentConverter
+from .._base_converter import DocumentConverterResult
 from ._html_converter import HtmlConverter
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

@ -58,10 +59,7 @@ class XlsxConverter(HtmlConverter):
            html_content = sheets[s].to_html(index=False)
            md_content += self._convert(html_content).text_content.strip() + "\n\n"

-        return DocumentConverterResult(
-            title=None,
-            text_content=md_content.strip(),
-        )
+        return DocumentConverterResult(markdown=md_content.strip())


 class XlsConverter(HtmlConverter):
@ -94,7 +92,4 @@ class XlsConverter(HtmlConverter):
            html_content = sheets[s].to_html(index=False)
            md_content += self._convert(html_content).text_content.strip() + "\n\n"

-        return DocumentConverterResult(
-            title=None,
-            text_content=md_content.strip(),
-        )
+        return DocumentConverterResult(markdown=md_content.strip())
--- a/packages/markitdown/src/markitdown/converters/_youtube_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_youtube_converter.py
@ -7,7 +7,8 @@ from typing import Any, Union, Dict, List
 from urllib.parse import parse_qs, urlparse
 from bs4 import BeautifulSoup

-from ._base import DocumentConverter, DocumentConverterResult
+from ._base import DocumentConverter
+from .._base_converter import DocumentConverterResult


 # Optional YouTube transcription support
@ -158,8 +159,8 @@ class YouTubeConverter(DocumentConverter):
        assert isinstance(title, str)

        return DocumentConverterResult(
+            markdown=webpage_text,
            title=title,
-            text_content=webpage_text,
        )

    def _get(
--- a/packages/markitdown/src/markitdown/converters/_zip_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_zip_converter.py
@ -3,7 +3,8 @@ import zipfile
 import shutil
 from typing import Any, Union

-from ._base import DocumentConverter, DocumentConverterResult
+from ._base import DocumentConverter
+from .._base_converter import DocumentConverterResult


 class ZipConverter(DocumentConverter):
@ -62,8 +63,7 @@ class ZipConverter(DocumentConverter):
        parent_converters = kwargs.get("_parent_converters", [])
        if not parent_converters:
            return DocumentConverterResult(
-                title=None,
-                text_content=f"[ERROR] No converters available to process zip contents from: {local_path}",
+                markdown=f"[ERROR] No converters available to process zip contents from: {local_path}",
            )

        extracted_zip_folder_name = (
@ -118,27 +118,24 @@ class ZipConverter(DocumentConverter):
                        result = converter.convert(file_path, **file_kwargs)
                        if result is not None:
                            md_content += f"\n## File: {relative_path}\n\n"
-                            md_content += result.text_content + "\n\n"
+                            md_content += result.markdown + "\n\n"
                            break

            # Clean up extracted files if specified
            if kwargs.get("cleanup_extracted", True):
                shutil.rmtree(extraction_dir)

-            return DocumentConverterResult(title=None, text_content=md_content.strip())
+            return DocumentConverterResult(markdown=md_content.strip())

        except zipfile.BadZipFile:
            return DocumentConverterResult(
-                title=None,
-                text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}",
+                markdown=f"[ERROR] Invalid or corrupted zip file: {local_path}",
            )
        except ValueError as ve:
            return DocumentConverterResult(
-                title=None,
-                text_content=f"[ERROR] Security error in zip file {local_path}: {str(ve)}",
+                markdown=f"[ERROR] Security error in zip file {local_path}: {str(ve)}",
            )
        except Exception as e:
            return DocumentConverterResult(
-                title=None,
-                text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
+                markdown=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
            )