Updated markdownify dependency.

2025-03-05 13:03:48 -08:00 · 2025-03-05 13:03:48 -08:00 · 4d097aa379
commit 4d097aa379
parent cc38144752
4 changed files with 34 additions and 8 deletions
--- a/packages/markitdown/pyproject.toml
+++ b/packages/markitdown/pyproject.toml
@ -26,7 +26,7 @@ classifiers = [
 dependencies = [
  "beautifulsoup4",
  "requests",
-  "markdownify~=0.14.1",
+  "markdownify",
  "puremagic",
  "pathvalidate",
  "charset-normalizer",
@ -78,11 +78,14 @@ extra-dependencies = [
 ]

 [tool.hatch.envs.types]
+features = ["all"]
 extra-dependencies = [
+  "openai",
  "mypy>=1.0.0",
 ]
+
 [tool.hatch.envs.types.scripts]
-check = "mypy --install-types --non-interactive {args:src/markitdown tests}"
+check = "mypy --install-types --non-interactive --ignore-missing-imports {args:src/markitdown tests}"

 [tool.coverage.run]
 source_pkgs = ["markitdown", "tests"]
--- a/packages/markitdown/src/markitdown/_exceptions.py
+++ b/packages/markitdown/src/markitdown/_exceptions.py
@ -68,6 +68,9 @@ class FileConversionException(MarkItDownException):
            else:
                message = f"File conversion failed after {len(attempts)} attempts:\n"
                for attempt in attempts:
-                    message += f" - {type(attempt.converter).__name__} threw {attempt.exc_info[0].__name__} with message: {attempt.exc_info[1]}\n"
+                    if attempt.exc_info is None:  # attempt has no exc_info to report; avoid indexing None
+                        message += f" - {type(attempt.converter).__name__} provided no execution info.\n"
+                    else:
+                        message += f" - {type(attempt.converter).__name__} threw {attempt.exc_info[0].__name__} with message: {attempt.exc_info[1]}\n"

        super().__init__(message)
--- a/packages/markitdown/src/markitdown/_stream_info.py
+++ b/packages/markitdown/src/markitdown/_stream_info.py
@ -62,7 +62,8 @@ def _guess_stream_info_from_stream(
    # Add a guess purely based on the filename hint
    if filename_hint:
        try:
-            mimetype, _ = mimetypes.guess_file_type(filename_hint)
+            # Requires Python 3.13+
+            mimetype, _ = mimetypes.guess_file_type(filename_hint)  # type: ignore
        except AttributeError:
            mimetype, _ = mimetypes.guess_type(filename_hint)

--- a/packages/markitdown/src/markitdown/converters/_markdownify.py
+++ b/packages/markitdown/src/markitdown/converters/_markdownify.py
@ -1,7 +1,7 @@
 import re
 import markdownify

-from typing import Any
+from typing import Any, Optional
 from urllib.parse import quote, unquote, urlparse, urlunparse


@ -20,7 +20,14 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
        # Explicitly cast options to the expected type if necessary
        super().__init__(**options)

-    def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
+    def convert_hn(
+        self,
+        n: int,
+        el: Any,
+        text: str,
+        convert_as_inline: Optional[bool] = False,
+        **kwargs,
+    ) -> str:
        """Same as usual, but be sure to start with a new line"""
        if not convert_as_inline:
            if not re.search(r"^\n", text):
@ -28,7 +35,13 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):

        return super().convert_hn(n, el, text, convert_as_inline)  # type: ignore

-    def convert_a(self, el: Any, text: str, convert_as_inline: bool):
+    def convert_a(
+        self,
+        el: Any,
+        text: str,
+        convert_as_inline: Optional[bool] = False,
+        **kwargs,
+    ):
        """Same as usual converter, but removes Javascript links and escapes URIs."""
        prefix, suffix, text = markdownify.chomp(text)  # type: ignore
        if not text:
@ -68,7 +81,13 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
            else text
        )

-    def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
+    def convert_img(
+        self,
+        el: Any,
+        text: str,
+        convert_as_inline: Optional[bool] = False,
+        **kwargs,
+    ) -> str:
        """Same as usual converter, but removes data URIs"""

        alt = el.attrs.get("alt", None) or ""