Updated markdownify dependency.

This commit is contained in:
Adam Fourney 2025-03-05 13:03:48 -08:00
parent cc38144752
commit 4d097aa379
4 changed files with 34 additions and 8 deletions

View file

@ -26,7 +26,7 @@ classifiers = [
dependencies = [ dependencies = [
"beautifulsoup4", "beautifulsoup4",
"requests", "requests",
"markdownify~=0.14.1", "markdownify",
"puremagic", "puremagic",
"pathvalidate", "pathvalidate",
"charset-normalizer", "charset-normalizer",
@ -78,11 +78,14 @@ extra-dependencies = [
] ]
[tool.hatch.envs.types] [tool.hatch.envs.types]
features = ["all"]
extra-dependencies = [ extra-dependencies = [
"openai",
"mypy>=1.0.0", "mypy>=1.0.0",
] ]
[tool.hatch.envs.types.scripts] [tool.hatch.envs.types.scripts]
check = "mypy --install-types --non-interactive {args:src/markitdown tests}" check = "mypy --install-types --non-interactive --ignore-missing-imports {args:src/markitdown tests}"
[tool.coverage.run] [tool.coverage.run]
source_pkgs = ["markitdown", "tests"] source_pkgs = ["markitdown", "tests"]

View file

@ -68,6 +68,9 @@ class FileConversionException(MarkItDownException):
else: else:
message = f"File conversion failed after {len(attempts)} attempts:\n" message = f"File conversion failed after {len(attempts)} attempts:\n"
for attempt in attempts: for attempt in attempts:
if attempt.exc_info is None:
message += f" - {type(attempt.converter).__name__} provided no execution info.\n"
else:
message += f" - {type(attempt.converter).__name__} threw {attempt.exc_info[0].__name__} with message: {attempt.exc_info[1]}\n" message += f" - {type(attempt.converter).__name__} threw {attempt.exc_info[0].__name__} with message: {attempt.exc_info[1]}\n"
super().__init__(message) super().__init__(message)

View file

@ -62,7 +62,8 @@ def _guess_stream_info_from_stream(
# Add a guess purely based on the filename hint # Add a guess purely based on the filename hint
if filename_hint: if filename_hint:
try: try:
mimetype, _ = mimetypes.guess_file_type(filename_hint) # Requires Python 3.13+
mimetype, _ = mimetypes.guess_file_type(filename_hint) # type: ignore
except AttributeError: except AttributeError:
mimetype, _ = mimetypes.guess_type(filename_hint) mimetype, _ = mimetypes.guess_type(filename_hint)

View file

@ -1,7 +1,7 @@
import re import re
import markdownify import markdownify
from typing import Any from typing import Any, Optional
from urllib.parse import quote, unquote, urlparse, urlunparse from urllib.parse import quote, unquote, urlparse, urlunparse
@ -20,7 +20,14 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
# Explicitly cast options to the expected type if necessary # Explicitly cast options to the expected type if necessary
super().__init__(**options) super().__init__(**options)
def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str: def convert_hn(
self,
n: int,
el: Any,
text: str,
convert_as_inline: Optional[bool] = False,
**kwargs,
) -> str:
"""Same as usual, but be sure to start with a new line""" """Same as usual, but be sure to start with a new line"""
if not convert_as_inline: if not convert_as_inline:
if not re.search(r"^\n", text): if not re.search(r"^\n", text):
@ -28,7 +35,13 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
return super().convert_hn(n, el, text, convert_as_inline) # type: ignore return super().convert_hn(n, el, text, convert_as_inline) # type: ignore
def convert_a(self, el: Any, text: str, convert_as_inline: bool): def convert_a(
self,
el: Any,
text: str,
convert_as_inline: Optional[bool] = False,
**kwargs,
):
"""Same as usual converter, but removes Javascript links and escapes URIs.""" """Same as usual converter, but removes Javascript links and escapes URIs."""
prefix, suffix, text = markdownify.chomp(text) # type: ignore prefix, suffix, text = markdownify.chomp(text) # type: ignore
if not text: if not text:
@ -68,7 +81,13 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
else text else text
) )
def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str: def convert_img(
self,
el: Any,
text: str,
convert_as_inline: Optional[bool] = False,
**kwargs,
) -> str:
"""Same as usual converter, but removes data URIs""" """Same as usual converter, but removes data URIs"""
alt = el.attrs.get("alt", None) or "" alt = el.attrs.get("alt", None) or ""