Work started moving converters to individual files.

Adam Fourney 2025-02-09 10:33:42 -08:00
parent 73ba69d8cd
commit 71fa94e3c9
8 changed files with 320 additions and 182 deletions

View file

@@ -2,10 +2,21 @@
#
# SPDX-License-Identifier: MIT
from ._markitdown import MarkItDown, FileConversionException, UnsupportedFormatException
from ._markitdown import MarkItDown
from ._exceptions import (
    MarkItDownException,
    ConverterPrerequisiteException,
    FileConversionException,
    UnsupportedFormatException,
)
from .converters import DocumentConverter, DocumentConverterResult

__all__ = [
    "MarkItDown",
    "DocumentConverter",
    "DocumentConverterResult",
    "MarkItDownException",
    "ConverterPrerequisiteException",
    "FileConversionException",
    "UnsupportedFormatException",
]
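For orientation, here is a hedged sketch of how the expanded package surface above might be consumed by a caller; the input file name and the exception-handling order are illustrative assumptions, not part of this commit.

# Hypothetical usage of the new top-level exports (names taken from __all__ above).
from markitdown import (
    MarkItDown,
    FileConversionException,
    UnsupportedFormatException,
)

md = MarkItDown()
try:
    result = md.convert("report.html")  # illustrative input file
    print(result.title)
    print(result.text_content[:200])
except UnsupportedFormatException:
    print("No registered converter accepts this file type.")
except FileConversionException:
    print("A converter was found, but the conversion failed.")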

View file

@@ -0,0 +1,37 @@
class MarkItDownException(BaseException):
    """
    Base exception class for MarkItDown.
    """

    pass


class ConverterPrerequisiteException(MarkItDownException):
    """
    Thrown when instantiating a DocumentConverter in cases where
    a required library or dependency is not installed, an API key
    is not set, or some other prerequisite is not met.

    This is not necessarily a fatal error. If thrown during
    MarkItDown's plugin loading phase, the converter will simply be
    skipped, and a warning will be issued.
    """

    pass


class FileConversionException(MarkItDownException):
    """
    Thrown when a suitable converter was found, but the conversion
    process fails for any reason.
    """

    pass


class UnsupportedFormatException(MarkItDownException):
    """
    Thrown when no suitable converter was found for the given file.
    """

    pass
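To illustrate the intended use of ConverterPrerequisiteException described in the docstring above, here is a hedged sketch of a converter that raises it from its constructor when an optional dependency is missing; the converter class and dependency name are hypothetical, not part of this commit.

# Hypothetical converter: raising ConverterPrerequisiteException from __init__ lets
# the plugin-loading phase skip this converter with a warning instead of treating
# the missing dependency as a fatal error.
from markitdown import ConverterPrerequisiteException
from markitdown.converters import DocumentConverter, DocumentConverterResult


class ExampleConverter(DocumentConverter):
    def __init__(self, priority: float = 0.0):
        super().__init__(priority=priority)
        try:
            import some_optional_dependency  # noqa: F401 -- illustrative module name
        except ModuleNotFoundError as exc:
            raise ConverterPrerequisiteException(
                "some_optional_dependency is required for ExampleConverter"
            ) from exc

    def convert(self, local_path, **kwargs):
        return DocumentConverterResult(title=None, text_content="...")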

View file

@@ -13,6 +13,9 @@ import sys
import tempfile
import traceback
import zipfile
import importlib
import sys
from importlib.metadata import entry_points
from xml.dom import minidom
from typing import Any, Dict, List, Optional, Union
from pathlib import Path
@@ -42,6 +45,21 @@ from azure.ai.documentintelligence.models import (
)
from azure.identity import DefaultAzureCredential

from .converters import (
    DocumentConverter,
    DocumentConverterResult,
    PlainTextConverter,
    HtmlConverter,
)
from .converters._markdownify import _CustomMarkdownify

from ._exceptions import (
    MarkItDownException,
    ConverterPrerequisiteException,
    FileConversionException,
    UnsupportedFormatException,
)

# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
# This constant is a temporary fix until the bug is resolved.
CONTENT_FORMAT = "markdown"
@@ -49,6 +67,9 @@ CONTENT_FORMAT = "markdown"

# Override mimetype for csv to fix issue on windows
mimetypes.add_type("text/csv", ".csv")

PRIORITY_SPECIFIC_FILE_FORMAT = 0.0
PRIORITY_GENERIC_FILE_FORMAT = -10.0

# Optional Transcription support
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
try:
@@ -76,178 +97,6 @@ except ModuleNotFoundError:
    pass
class _CustomMarkdownify(markdownify.MarkdownConverter):
    """
    A custom version of markdownify's MarkdownConverter. Changes include:

    - Altering the default heading style to use '#', '##', etc.
    - Removing javascript hyperlinks.
    - Truncating images with large data:uri sources.
    - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
    """

    def __init__(self, **options: Any):
        options["heading_style"] = options.get("heading_style", markdownify.ATX)
        # Explicitly cast options to the expected type if necessary
        super().__init__(**options)

    def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
        """Same as usual, but be sure to start with a new line"""
        if not convert_as_inline:
            if not re.search(r"^\n", text):
                return "\n" + super().convert_hn(n, el, text, convert_as_inline)  # type: ignore

        return super().convert_hn(n, el, text, convert_as_inline)  # type: ignore

    def convert_a(self, el: Any, text: str, convert_as_inline: bool):
        """Same as usual converter, but removes Javascript links and escapes URIs."""
        prefix, suffix, text = markdownify.chomp(text)  # type: ignore
        if not text:
            return ""
        href = el.get("href")
        title = el.get("title")

        # Escape URIs and skip non-http or file schemes
        if href:
            try:
                parsed_url = urlparse(href)  # type: ignore
                if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]:  # type: ignore
                    return "%s%s%s" % (prefix, text, suffix)
                href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path))))  # type: ignore
            except ValueError:  # It's not clear if this ever gets thrown
                return "%s%s%s" % (prefix, text, suffix)

        # For the replacement see #29: text nodes underscores are escaped
        if (
            self.options["autolinks"]
            and text.replace(r"\_", "_") == href
            and not title
            and not self.options["default_title"]
        ):
            # Shortcut syntax
            return "<%s>" % href
        if self.options["default_title"] and not title:
            title = href
        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
        return (
            "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix)
            if href
            else text
        )

    def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
        """Same as usual converter, but removes data URIs"""

        alt = el.attrs.get("alt", None) or ""
        src = el.attrs.get("src", None) or ""
        title = el.attrs.get("title", None) or ""
        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
        if (
            convert_as_inline
            and el.parent.name not in self.options["keep_inline_images_in"]
        ):
            return alt

        # Remove dataURIs
        if src.startswith("data:"):
            src = src.split(",")[0] + "..."

        return "![%s](%s%s)" % (alt, src, title_part)

    def convert_soup(self, soup: Any) -> str:
        return super().convert_soup(soup)  # type: ignore


class DocumentConverterResult:
    """The result of converting a document to text."""

    def __init__(self, title: Union[str, None] = None, text_content: str = ""):
        self.title: Union[str, None] = title
        self.text_content: str = text_content


class DocumentConverter:
    """Abstract superclass of all DocumentConverters."""

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        raise NotImplementedError()


class PlainTextConverter(DocumentConverter):
    """Anything with content type text/plain"""

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        # Guess the content type from any file extension that might be around
        content_type, _ = mimetypes.guess_type(
            "__placeholder" + kwargs.get("file_extension", "")
        )

        # Only accept text files
        if content_type is None:
            return None
        elif all(
            not content_type.lower().startswith(type_prefix)
            for type_prefix in ["text/", "application/json"]
        ):
            return None

        text_content = str(from_path(local_path).best())
        return DocumentConverterResult(
            title=None,
            text_content=text_content,
        )


class HtmlConverter(DocumentConverter):
    """Anything with content type text/html"""

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not html
        extension = kwargs.get("file_extension", "")
        if extension.lower() not in [".html", ".htm"]:
            return None

        result = None
        with open(local_path, "rt", encoding="utf-8") as fh:
            result = self._convert(fh.read())

        return result

    def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
        """Helper function that converts an HTML string."""

        # Parse the string
        soup = BeautifulSoup(html_content, "html.parser")

        # Remove javascript and style blocks
        for script in soup(["script", "style"]):
            script.extract()

        # Print only the main content
        body_elm = soup.find("body")
        webpage_text = ""
        if body_elm:
            webpage_text = _CustomMarkdownify().convert_soup(body_elm)
        else:
            webpage_text = _CustomMarkdownify().convert_soup(soup)

        assert isinstance(webpage_text, str)

        # remove leading and trailing \n
        webpage_text = webpage_text.strip()

        return DocumentConverterResult(
            title=None if soup.title is None else soup.title.string,
            text_content=webpage_text,
        )


class RSSConverter(DocumentConverter):
    """Convert RSS / Atom type to markdown"""
@@ -1455,14 +1304,6 @@ class DocumentIntelligenceConverter(DocumentConverter):
        )


class FileConversionException(BaseException):
    pass


class UnsupportedFormatException(BaseException):
    pass


class MarkItDown:
    """(In preview) An extremely simple text-based document reader, suitable for LLM use.
    This reader will convert common file-types or webpages to Markdown."""
@@ -1544,6 +1385,27 @@ class MarkItDown:
        self.register_page_converter(ZipConverter())
        self.register_page_converter(OutlookMsgConverter())

        # print("Discovering plugins")
        # for entry_point in entry_points(group="markitdown.converters"):
        #     args = {
        #         "required1": "Override1",
        #         "required2": "Override2",
        #         "required3": "Override3"
        #     }
        #
        #     #print(entry_point)
        #     plugin = entry_point.load()
        #     instance = plugin(**args)
        #     print(instance)
        #     try:
        #         ConverterClass = entry_point.load()
        #         self.register_page_converter(ConverterClass())
        #         print(f"✔ Registered converter: {entry_point.name}")
        #     except Exception as e:
        #         print(f" Failed to load {entry_point.name}: {e}")
        # print("Done")

        # Register Document Intelligence converter at the top of the stack if endpoint is provided
        if docintel_endpoint is not None:
            self.register_page_converter(
@@ -1691,8 +1553,14 @@
        self, local_path: str, extensions: List[Union[str, None]], **kwargs
    ) -> DocumentConverterResult:
        error_trace = ""

        # Create a copy of the page_converters list, sorted by priority.
        # We do this with each call to _convert because the priority of converters may change between calls.
        # The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
        sorted_converters = sorted(self._page_converters, key=lambda x: x.priority, reverse=True)
        for ext in extensions + [None]:  # Try last with no extension
            for converter in self._page_converters:
            for converter in sorted_converters:
                _kwargs = copy.deepcopy(kwargs)

                # Overwrite file_extension appropriately
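To make the per-call ordering above concrete, here is a hedged sketch of the stable sort on a toy list; the converter names are illustrative, and it assumes higher priority values are tried first, as the DocumentConverter.priority docstring states.

# Illustrative only (not part of the diff): stable sort by priority.
# Converters sharing a priority keep their registration order, so the two
# format-specific converters below stay in the order they were registered.
converters = [
    ("PlainTextConverter", -10.0),  # PRIORITY_GENERIC_FILE_FORMAT
    ("DocxConverter", 0.0),         # PRIORITY_SPECIFIC_FILE_FORMAT
    ("XlsxConverter", 0.0),         # PRIORITY_SPECIFIC_FILE_FORMAT
]
ordered = sorted(converters, key=lambda c: c[1], reverse=True)
print([name for name, _ in ordered])
# ['DocxConverter', 'XlsxConverter', 'PlainTextConverter']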

View file

@@ -0,0 +1,14 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
from ._base import DocumentConverter, DocumentConverterResult
from ._plain_text_converter import PlainTextConverter
from ._html_converter import HtmlConverter

__all__ = [
    "DocumentConverter",
    "DocumentConverterResult",
    "PlainTextConverter",
    "HtmlConverter",
]

View file

@@ -0,0 +1,34 @@
from typing import Any, Union


class DocumentConverterResult:
    """The result of converting a document to text."""

    def __init__(self, title: Union[str, None] = None, text_content: str = ""):
        self.title: Union[str, None] = title
        self.text_content: str = text_content


class DocumentConverter:
    """Abstract superclass of all DocumentConverters."""

    def __init__(self, priority: float = 0.0):
        self._priority = priority

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        raise NotImplementedError("Subclasses must implement this method")

    @property
    def priority(self) -> float:
        """Priority of the converter in markitdown's converter list. Higher priority values are tried first."""
        return self._priority
    @priority.setter
    def priority(self, value: float):
        self._priority = value

    @priority.deleter
    def priority(self):
        raise AttributeError("Cannot delete the priority attribute")
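As a quick sanity check on the property above, a hedged sketch of a custom converter registered with an explicit priority; the EmlConverter class and the .eml extension check are illustrative assumptions, not part of this commit.

# Hypothetical subclass exercising the priority property.
from markitdown.converters import DocumentConverter, DocumentConverterResult


class EmlConverter(DocumentConverter):
    def convert(self, local_path, **kwargs):
        if kwargs.get("file_extension", "").lower() != ".eml":
            return None
        with open(local_path, "rt", encoding="utf-8") as fh:
            return DocumentConverterResult(title=None, text_content=fh.read())


converter = EmlConverter(priority=0.0)  # tried ahead of generic (-10.0) converters
converter.priority = 5.0                # re-prioritized; _convert() re-sorts on every call
# del converter.priority                # would raise AttributeError per the deleter above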

View file

@@ -0,0 +1,54 @@
import re
from typing import Any, Union
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse

from bs4 import BeautifulSoup

from ._base import DocumentConverter, DocumentConverterResult
from ._markdownify import _CustomMarkdownify


class HtmlConverter(DocumentConverter):
    """Anything with content type text/html"""

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not html
        extension = kwargs.get("file_extension", "")
        if extension.lower() not in [".html", ".htm"]:
            return None

        result = None
        with open(local_path, "rt", encoding="utf-8") as fh:
            result = self._convert(fh.read())

        return result

    def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
        """Helper function that converts an HTML string."""

        # Parse the string
        soup = BeautifulSoup(html_content, "html.parser")

        # Remove javascript and style blocks
        for script in soup(["script", "style"]):
            script.extract()

        # Print only the main content
        body_elm = soup.find("body")
        webpage_text = ""
        if body_elm:
            webpage_text = _CustomMarkdownify().convert_soup(body_elm)
        else:
            webpage_text = _CustomMarkdownify().convert_soup(soup)

        assert isinstance(webpage_text, str)

        # remove leading and trailing \n
        webpage_text = webpage_text.strip()

        return DocumentConverterResult(
            title=None if soup.title is None else soup.title.string,
            text_content=webpage_text,
        )
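A brief, hedged check of the helper above on an inline HTML string; the sample markup is illustrative, and the printed values are the behavior implied by the code, not verified output.

# Illustrative only: script blocks are stripped and the <title> becomes the result title.
sample = "<html><head><title>Hello</title><script>alert(1)</script></head><body><h1>Hi</h1></body></html>"
result = HtmlConverter()._convert(sample)
print(result.title)         # expected: "Hello"
print(result.text_content)  # expected: "# Hi" (ATX heading style from _CustomMarkdownify)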

View file

@@ -0,0 +1,87 @@
import re
import markdownify

from typing import Any, Union
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse


class _CustomMarkdownify(markdownify.MarkdownConverter):
    """
    A custom version of markdownify's MarkdownConverter. Changes include:

    - Altering the default heading style to use '#', '##', etc.
    - Removing javascript hyperlinks.
    - Truncating images with large data:uri sources.
    - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
    """

    def __init__(self, **options: Any):
        options["heading_style"] = options.get("heading_style", markdownify.ATX)
        # Explicitly cast options to the expected type if necessary
        super().__init__(**options)

    def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
        """Same as usual, but be sure to start with a new line"""
        if not convert_as_inline:
            if not re.search(r"^\n", text):
                return "\n" + super().convert_hn(n, el, text, convert_as_inline)  # type: ignore

        return super().convert_hn(n, el, text, convert_as_inline)  # type: ignore

    def convert_a(self, el: Any, text: str, convert_as_inline: bool):
        """Same as usual converter, but removes Javascript links and escapes URIs."""
        prefix, suffix, text = markdownify.chomp(text)  # type: ignore
        if not text:
            return ""
        href = el.get("href")
        title = el.get("title")

        # Escape URIs and skip non-http or file schemes
        if href:
            try:
                parsed_url = urlparse(href)  # type: ignore
                if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]:  # type: ignore
                    return "%s%s%s" % (prefix, text, suffix)
                href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path))))  # type: ignore
            except ValueError:  # It's not clear if this ever gets thrown
                return "%s%s%s" % (prefix, text, suffix)

        # For the replacement see #29: text nodes underscores are escaped
        if (
            self.options["autolinks"]
            and text.replace(r"\_", "_") == href
            and not title
            and not self.options["default_title"]
        ):
            # Shortcut syntax
            return "<%s>" % href
        if self.options["default_title"] and not title:
            title = href
        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
        return (
            "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix)
            if href
            else text
        )

    def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
        """Same as usual converter, but removes data URIs"""

        alt = el.attrs.get("alt", None) or ""
        src = el.attrs.get("src", None) or ""
        title = el.attrs.get("title", None) or ""
        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
        if (
            convert_as_inline
            and el.parent.name not in self.options["keep_inline_images_in"]
        ):
            return alt

        # Remove dataURIs
        if src.startswith("data:"):
            src = src.split(",")[0] + "..."

        return "![%s](%s%s)" % (alt, src, title_part)

    def convert_soup(self, soup: Any) -> str:
        return super().convert_soup(soup)  # type: ignore
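To illustrate the link handling above, a hedged sketch of its expected behavior; the sample anchors are illustrative, and exact whitespace in markdownify's output may differ.

# Illustrative only: javascript: links are reduced to their text, and URI paths are percent-escaped.
md = _CustomMarkdownify()
print(md.convert('<a href="javascript:alert(1)">click</a>'))    # expected: "click" (link removed)
print(md.convert('<a href="https://example.com/a b">a b</a>'))  # expected: "[a b](https://example.com/a%20b)"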

View file

@@ -0,0 +1,33 @@
import mimetypes

from charset_normalizer import from_path
from typing import Any, Union

from ._base import DocumentConverter, DocumentConverterResult


class PlainTextConverter(DocumentConverter):
    """Anything with content type text/plain"""

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        # Guess the content type from any file extension that might be around
        content_type, _ = mimetypes.guess_type(
            "__placeholder" + kwargs.get("file_extension", "")
        )

        # Only accept text files
        if content_type is None:
            return None
        elif all(
            not content_type.lower().startswith(type_prefix)
            for type_prefix in ["text/", "application/json"]
        ):
            return None

        text_content = str(from_path(local_path).best())
        return DocumentConverterResult(
            title=None,
            text_content=text_content,
        )
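Finally, a hedged sketch of the charset-normalizer call that PlainTextConverter relies on; the file name is illustrative.

# Illustrative only: from_path() probes the file's encoding, .best() returns the top match,
# and str() on that match yields the decoded text, as used in PlainTextConverter above.
from charset_normalizer import from_path

best_guess = from_path("notes.txt").best()
print(best_guess.encoding)     # e.g. "utf_8"
print(str(best_guess)[:200])   # decoded text content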