Work started moving converters to individual files.

2025-02-09 10:33:42 -08:00 · 2025-02-09 10:33:42 -08:00 · 71fa94e3c9
commit 71fa94e3c9
parent 73ba69d8cd
8 changed files with 320 additions and 182 deletions
--- a/src/markitdown/__init__.py
+++ b/src/markitdown/__init__.py
@ -2,10 +2,21 @@
 #
 # SPDX-License-Identifier: MIT
-from ._markitdown import MarkItDown, FileConversionException, UnsupportedFormatException
+from ._markitdown import MarkItDown
 from ._exceptions import (
    MarkItDownException,
    ConverterPrerequisiteException,
    FileConversionException,
    UnsupportedFormatException,
 )
 from .converters import DocumentConverter, DocumentConverterResult
 # Names re-exported as the public API of the package: the MarkItDown entry
 # point, the converter base types, and the exception hierarchy imported above.
 __all__ = [
    "MarkItDown",
    "DocumentConverter",
    "DocumentConverterResult",
    "MarkItDownException",
    "ConverterPrerequisiteException",
    "FileConversionException",
    "UnsupportedFormatException",
 ]
--- a/src/markitdown/_exceptions.py
+++ b/src/markitdown/_exceptions.py
@ -0,0 +1,37 @@
 class MarkItDownException(Exception):
    """
    Base exception class for MarkItDown.

    Derives from ``Exception`` rather than ``BaseException`` so that generic
    ``except Exception`` handlers catch it, and so that it is not grouped
    with interpreter-level signals such as ``KeyboardInterrupt`` and
    ``SystemExit``. Code that catches ``MarkItDownException`` (or
    ``BaseException``) keeps working unchanged.
    """
 class ConverterPrerequisiteException(MarkItDownException):
    """
    Raised while instantiating a DocumentConverter when something it needs
    is unavailable: a required library or dependency is not installed, an
    API key is not set, or some other prerequisite is not met.

    This is not necessarily a fatal error. When raised during MarkItDown's
    plugin loading phase, the converter is intended to be skipped and a
    warning issued.
    """
 class FileConversionException(MarkItDownException):
    """
    Raised when a suitable converter was found for the file, but the
    conversion process itself failed for any reason.
    """
 class UnsupportedFormatException(MarkItDownException):
    """
    Raised when no suitable converter could be found for the given file.
    """
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@ -13,6 +13,9 @@ import sys
 import tempfile
 import traceback
 import zipfile
 import importlib
 import sys
 from importlib.metadata import entry_points
 from xml.dom import minidom
 from typing import Any, Dict, List, Optional, Union
 from pathlib import Path
@ -42,6 +45,21 @@ from azure.ai.documentintelligence.models import (
 )
 from azure.identity import DefaultAzureCredential
 from .converters import (
    DocumentConverter,
    DocumentConverterResult,
    PlainTextConverter,
    HtmlConverter,
 )
 from .converters._markdownify import _CustomMarkdownify
 from ._exceptions import (
    MarkItDownException,
    ConverterPrerequisiteException,
    FileConversionException,
    UnsupportedFormatException,
 )
 # TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
 # This constant is a temporary fix until the bug is resolved.
 CONTENT_FORMAT = "markdown"
@ -49,6 +67,9 @@ CONTENT_FORMAT = "markdown"
 # Override mimetype for csv to fix issue on windows
 mimetypes.add_type("text/csv", ".csv")
 # Priority values, presumably meant to be passed as a DocumentConverter's
 # `priority` (format-specific converters vs. generic catch-all converters).
 # NOTE(review): MarkItDown._convert sorts converters in ascending order of
 # priority, so with these values a generic converter (-10.0) would be tried
 # before a format-specific one (0.0) -- confirm the intended ordering.
 PRIORITY_SPECIFIC_FILE_FORMAT = 0.0
 PRIORITY_GENERIC_FILE_FORMAT = -10.0
 # Optional Transcription support
 IS_AUDIO_TRANSCRIPTION_CAPABLE = False
 try:
@ -76,178 +97,6 @@ except ModuleNotFoundError:
    pass
 class _CustomMarkdownify(markdownify.MarkdownConverter):
    """
    markdownify's MarkdownConverter with a few adjustments:

    - Headings default to the ATX style ('#', '##', etc.).
    - Links whose scheme is not http, https or file (for example
      javascript: links) are reduced to their plain text.
    - Image sources given as data: URIs are truncated.
    - Link paths are percent-escaped so they do not conflict with Markdown syntax.
    """

    def __init__(self, **options: Any):
        """Create the converter, defaulting ``heading_style`` to ATX when not supplied."""
        options.setdefault("heading_style", markdownify.ATX)
        super().__init__(**options)

    def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
        """Render a heading as usual, making sure a block-level heading starts on a new line."""
        heading = super().convert_hn(n, el, text, convert_as_inline)  # type: ignore
        if convert_as_inline or text.startswith("\n"):
            return heading
        return "\n" + heading

    def convert_a(self, el: Any, text: str, convert_as_inline: bool):
        """Render a link as usual, but drop unsupported schemes and escape the URI path."""
        prefix, suffix, text = markdownify.chomp(text)  # type: ignore
        if not text:
            return ""
        # What is emitted when the link markup is dropped and only the text is kept.
        text_only = f"{prefix}{text}{suffix}"
        href = el.get("href")
        title = el.get("title")
        if href:
            try:
                parts = urlparse(href)  # type: ignore
                scheme = parts.scheme
                # Any scheme other than http, https or file loses its link markup.
                if scheme and scheme.lower() not in ("http", "https", "file"):
                    return text_only
                # Decode then re-encode the path so it ends up escaped exactly once.
                escaped_path = quote(unquote(parts.path))
                href = urlunparse(parts._replace(path=escaped_path))  # type: ignore
            except ValueError:  # It's not clear if this ever gets thrown
                return text_only
        # For the replacement see #29: text nodes underscores are escaped
        is_autolink = (
            self.options["autolinks"]
            and text.replace(r"\_", "_") == href
            and not title
            and not self.options["default_title"]
        )
        if is_autolink:
            # Shortcut syntax
            return f"<{href}>"
        if self.options["default_title"] and not title:
            title = href
        title_part = ""
        if title:
            escaped_title = title.replace('"', r"\"")
            title_part = f' "{escaped_title}"'
        if not href:
            return text
        return f"{prefix}[{text}]({href}{title_part}){suffix}"

    def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
        """Render an image as usual, but truncate data: URIs to their header."""
        attrs = el.attrs
        alt = attrs.get("alt", None) or ""
        src = attrs.get("src", None) or ""
        title = attrs.get("title", None) or ""
        title_part = ""
        if title:
            escaped_title = title.replace('"', r"\"")
            title_part = f' "{escaped_title}"'
        # Inline images are reduced to their alt text unless the parent tag is allowed.
        keep_in = self.options["keep_inline_images_in"]
        if convert_as_inline and el.parent.name not in keep_in:
            return alt
        # Keep only what precedes the first comma of a data: URI.
        if src.startswith("data:"):
            src = src.partition(",")[0] + "..."
        return f"![{alt}]({src}{title_part})"

    def convert_soup(self, soup: Any) -> str:
        """Convert a parsed soup to Markdown by delegating to the parent class."""
        return super().convert_soup(soup)  # type: ignore
 class DocumentConverterResult:
    """Holds the outcome of converting a document to text."""

    def __init__(self, title: Union[str, None] = None, text_content: str = ""):
        """Store the converted text and, when there is one, the document title."""
        # Converted text of the document (empty string by default).
        self.text_content: str = text_content
        # Title of the document, or None when no title was supplied.
        self.title: Union[str, None] = title
 class DocumentConverter:
    """Abstract base class that every document converter derives from."""

    def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
        """
        Convert the file at ``local_path``; subclasses must override this.

        The converters visible in this file return None when they do not
        handle the file, and a DocumentConverterResult otherwise.
        """
        raise NotImplementedError()
 class PlainTextConverter(DocumentConverter):
    """Converts files whose guessed content type is text/* or application/json."""

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """
        Read a text file and return its decoded content.

        The content type is guessed from ``kwargs["file_extension"]`` only.
        Returns None when the type is unknown, is not text/* or
        application/json, or when no character encoding can be detected for
        the file; otherwise returns a DocumentConverterResult with no title.
        """
        # Guess the content type from any file extension that might be around
        extension = kwargs.get("file_extension") or ""
        content_type, _ = mimetypes.guess_type("__placeholder" + extension)
        # Only accept text files
        if content_type is None:
            return None
        if not content_type.lower().startswith(("text/", "application/json")):
            return None
        best_match = from_path(local_path).best()
        if best_match is None:
            # No plausible encoding was found. str(None) would otherwise emit
            # the literal text "None" as the document content.
            return None
        return DocumentConverterResult(
            title=None,
            text_content=str(best_match),
        )
 class HtmlConverter(DocumentConverter):
    """Handles files with an .html or .htm extension (content type text/html)."""

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """
        Convert the HTML file at ``local_path`` to Markdown.

        Returns None when ``kwargs["file_extension"]`` is not .html or .htm
        (compared case-insensitively). The file is read as UTF-8 text.
        """
        # Bail if not html
        if kwargs.get("file_extension", "").lower() not in (".html", ".htm"):
            return None
        with open(local_path, "rt", encoding="utf-8") as html_file:
            return self._convert(html_file.read())

    def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
        """Helper that converts an HTML string to a DocumentConverterResult."""
        soup = BeautifulSoup(html_content, "html.parser")
        # Remove javascript and style blocks
        for unwanted in soup(["script", "style"]):
            unwanted.extract()
        # Convert only the <body> when there is one, otherwise the whole document
        body_elm = soup.find("body")
        root = body_elm if body_elm else soup
        webpage_text = _CustomMarkdownify().convert_soup(root)
        assert isinstance(webpage_text, str)
        # Trim leading and trailing whitespace
        webpage_text = webpage_text.strip()
        page_title = soup.title.string if soup.title is not None else None
        return DocumentConverterResult(title=page_title, text_content=webpage_text)
 class RSSConverter(DocumentConverter):
    """Convert RSS / Atom type to markdown"""
@ -1455,14 +1304,6 @@ class DocumentIntelligenceConverter(DocumentConverter):
        )
 class FileConversionException(Exception):
    """
    Raised when a suitable converter was found, but the conversion failed.

    Derives from ``Exception`` rather than ``BaseException`` so that generic
    ``except Exception`` handlers catch it.
    """
 class UnsupportedFormatException(Exception):
    """
    Raised when no suitable converter was found for the given file.

    Derives from ``Exception`` rather than ``BaseException`` so that generic
    ``except Exception`` handlers catch it.
    """
 class MarkItDown:
    """(In preview) An extremely simple text-based document reader, suitable for LLM use.
    This reader will convert common file-types or webpages to Markdown."""
@ -1544,6 +1385,27 @@ class MarkItDown:
        self.register_page_converter(ZipConverter())
        self.register_page_converter(OutlookMsgConverter())
        #        print("Discovering plugins")
        #        for entry_point in entry_points(group="markitdown.converters"):
        #            args = {
        #                "required1": "Override1",
        #                "required2": "Override2",
        #                "required3": "Override3"
        #            }
        #
        #            #print(entry_point)
        #            plugin = entry_point.load()
        #            instance = plugin(**args)
        #            print(instance)
        #    try:
        #        ConverterClass = entry_point.load()
        #        self.register_page_converter(ConverterClass())
        #        print(f"✔ Registered converter: {entry_point.name}")
        #    except Exception as e:
        #        print(f" Failed to load {entry_point.name}: {e}")
        #        print("Done")
        # Register Document Intelligence converter at the top of the stack if endpoint is provided
        if docintel_endpoint is not None:
            self.register_page_converter(
@ -1691,8 +1553,14 @@ class MarkItDown:
        self, local_path: str, extensions: List[Union[str, None]], **kwargs
    ) -> DocumentConverterResult:
        error_trace = ""
        # Create a copy of the page_converters list, sorted by priority.
        # We do this with each call to _convert because the priority of converters may change between calls.
        # The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
        sorted_converters = sorted(self._page_converters, key=lambda x: x.priority)
        for ext in extensions + [None]:  # Try last with no extension
-            for converter in self._page_converters:
+            for converter in sorted_converters:
                _kwargs = copy.deepcopy(kwargs)
                # Overwrite file_extension appropriately
--- a/src/markitdown/converters/__init__.py
+++ b/src/markitdown/converters/__init__.py
@ -0,0 +1,14 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
 from ._base import DocumentConverter, DocumentConverterResult
 from ._plain_text_converter import PlainTextConverter
 from ._html_converter import HtmlConverter
 # Names re-exported as the public API of the converters sub-package: the
 # base types plus the converters that already live in their own modules.
 __all__ = [
    "DocumentConverter",
    "DocumentConverterResult",
    "PlainTextConverter",
    "HtmlConverter",
 ]
--- a/src/markitdown/converters/_base.py
+++ b/src/markitdown/converters/_base.py
@ -0,0 +1,34 @@
 from typing import Any, Union
 class DocumentConverterResult:
    """Holds the outcome of converting a document to text."""

    def __init__(self, title: Union[str, None] = None, text_content: str = ""):
        """Store the converted text and, when there is one, the document title."""
        # Converted text of the document (empty string by default).
        self.text_content: str = text_content
        # Title of the document, or None when no title was supplied.
        self.title: Union[str, None] = title
 class DocumentConverter:
    """Abstract superclass of all DocumentConverters."""

    def __init__(self, priority: float = 0.0):
        """
        Args:
            priority: Position hint for this converter in markitdown's
                converter list (see the ``priority`` property).
        """
        self._priority = priority

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """
        Convert the file at ``local_path``; subclasses must override this.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError("Subclasses must implement this method")

    @property
    def priority(self) -> float:
        """
        Priority of the converter in markitdown's converter list.

        NOTE(review): this was documented as "higher priority values are
        tried first", but MarkItDown._convert sorts converters in ascending
        order of priority (lower values first) -- confirm the intended order.
        """
        return self._priority

    # The setter and deleter must reuse the name ``priority``. Under any other
    # name they define a separate property and leave ``priority`` read-only.
    @priority.setter
    def priority(self, value: float):
        self._priority = value

    @priority.deleter
    def priority(self):
        raise AttributeError("Cannot delete the priority attribute")
--- a/src/markitdown/converters/_html_converter.py
+++ b/src/markitdown/converters/_html_converter.py
@ -0,0 +1,54 @@
 import re
 from typing import Any, Union
 from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
 from bs4 import BeautifulSoup
 from ._base import DocumentConverter, DocumentConverterResult
 from ._markdownify import _CustomMarkdownify
 class HtmlConverter(DocumentConverter):
    """Handles files with an .html or .htm extension (content type text/html)."""

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """
        Convert the HTML file at ``local_path`` to Markdown.

        Returns None when ``kwargs["file_extension"]`` is not .html or .htm
        (compared case-insensitively). The file is read as UTF-8 text.
        """
        # Bail if not html
        if kwargs.get("file_extension", "").lower() not in (".html", ".htm"):
            return None
        with open(local_path, "rt", encoding="utf-8") as html_file:
            return self._convert(html_file.read())

    def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
        """Helper that converts an HTML string to a DocumentConverterResult."""
        soup = BeautifulSoup(html_content, "html.parser")
        # Remove javascript and style blocks
        for unwanted in soup(["script", "style"]):
            unwanted.extract()
        # Convert only the <body> when there is one, otherwise the whole document
        body_elm = soup.find("body")
        root = body_elm if body_elm else soup
        webpage_text = _CustomMarkdownify().convert_soup(root)
        assert isinstance(webpage_text, str)
        # Trim leading and trailing whitespace
        webpage_text = webpage_text.strip()
        page_title = soup.title.string if soup.title is not None else None
        return DocumentConverterResult(title=page_title, text_content=webpage_text)
--- a/src/markitdown/converters/_markdownify.py
+++ b/src/markitdown/converters/_markdownify.py
@ -0,0 +1,87 @@
 import re
 import markdownify
 from typing import Any, Union
 from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
 class _CustomMarkdownify(markdownify.MarkdownConverter):
    """
    markdownify's MarkdownConverter with a few adjustments:

    - Headings default to the ATX style ('#', '##', etc.).
    - Links whose scheme is not http, https or file (for example
      javascript: links) are reduced to their plain text.
    - Image sources given as data: URIs are truncated.
    - Link paths are percent-escaped so they do not conflict with Markdown syntax.
    """

    def __init__(self, **options: Any):
        """Create the converter, defaulting ``heading_style`` to ATX when not supplied."""
        options.setdefault("heading_style", markdownify.ATX)
        super().__init__(**options)

    def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
        """Render a heading as usual, making sure a block-level heading starts on a new line."""
        heading = super().convert_hn(n, el, text, convert_as_inline)  # type: ignore
        if convert_as_inline or text.startswith("\n"):
            return heading
        return "\n" + heading

    def convert_a(self, el: Any, text: str, convert_as_inline: bool):
        """Render a link as usual, but drop unsupported schemes and escape the URI path."""
        prefix, suffix, text = markdownify.chomp(text)  # type: ignore
        if not text:
            return ""
        # What is emitted when the link markup is dropped and only the text is kept.
        text_only = f"{prefix}{text}{suffix}"
        href = el.get("href")
        title = el.get("title")
        if href:
            try:
                parts = urlparse(href)  # type: ignore
                scheme = parts.scheme
                # Any scheme other than http, https or file loses its link markup.
                if scheme and scheme.lower() not in ("http", "https", "file"):
                    return text_only
                # Decode then re-encode the path so it ends up escaped exactly once.
                escaped_path = quote(unquote(parts.path))
                href = urlunparse(parts._replace(path=escaped_path))  # type: ignore
            except ValueError:  # It's not clear if this ever gets thrown
                return text_only
        # For the replacement see #29: text nodes underscores are escaped
        is_autolink = (
            self.options["autolinks"]
            and text.replace(r"\_", "_") == href
            and not title
            and not self.options["default_title"]
        )
        if is_autolink:
            # Shortcut syntax
            return f"<{href}>"
        if self.options["default_title"] and not title:
            title = href
        title_part = ""
        if title:
            escaped_title = title.replace('"', r"\"")
            title_part = f' "{escaped_title}"'
        if not href:
            return text
        return f"{prefix}[{text}]({href}{title_part}){suffix}"

    def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
        """Render an image as usual, but truncate data: URIs to their header."""
        attrs = el.attrs
        alt = attrs.get("alt", None) or ""
        src = attrs.get("src", None) or ""
        title = attrs.get("title", None) or ""
        title_part = ""
        if title:
            escaped_title = title.replace('"', r"\"")
            title_part = f' "{escaped_title}"'
        # Inline images are reduced to their alt text unless the parent tag is allowed.
        keep_in = self.options["keep_inline_images_in"]
        if convert_as_inline and el.parent.name not in keep_in:
            return alt
        # Keep only what precedes the first comma of a data: URI.
        if src.startswith("data:"):
            src = src.partition(",")[0] + "..."
        return f"![{alt}]({src}{title_part})"

    def convert_soup(self, soup: Any) -> str:
        """Convert a parsed soup to Markdown by delegating to the parent class."""
        return super().convert_soup(soup)  # type: ignore
--- a/src/markitdown/converters/_plain_text_converter.py
+++ b/src/markitdown/converters/_plain_text_converter.py
@ -0,0 +1,33 @@
 import mimetypes
 from charset_normalizer import from_path
 from typing import Any, Union
 from ._base import DocumentConverter, DocumentConverterResult
 class PlainTextConverter(DocumentConverter):
    """Converts files whose guessed content type is text/* or application/json."""

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """
        Read a text file and return its decoded content.

        The content type is guessed from ``kwargs["file_extension"]`` only.
        Returns None when the type is unknown, is not text/* or
        application/json, or when no character encoding can be detected for
        the file; otherwise returns a DocumentConverterResult with no title.
        """
        # Guess the content type from any file extension that might be around
        extension = kwargs.get("file_extension") or ""
        content_type, _ = mimetypes.guess_type("__placeholder" + extension)
        # Only accept text files
        if content_type is None:
            return None
        if not content_type.lower().startswith(("text/", "application/json")):
            return None
        best_match = from_path(local_path).best()
        if best_match is None:
            # No plausible encoding was found. str(None) would otherwise emit
            # the literal text "None" as the document content.
            return None
        return DocumentConverterResult(
            title=None,
            text_content=str(best_match),
        )