Updated markdownify dependency.

This commit is contained in:
Adam Fourney 2025-03-05 13:03:48 -08:00
parent cc38144752
commit 4d097aa379
4 changed files with 34 additions and 8 deletions

View file

@ -26,7 +26,7 @@ classifiers = [
dependencies = [
"beautifulsoup4",
"requests",
"markdownify~=0.14.1",
"markdownify",
"puremagic",
"pathvalidate",
"charset-normalizer",
@ -78,11 +78,14 @@ extra-dependencies = [
]
[tool.hatch.envs.types]
features = ["all"]
extra-dependencies = [
"openai",
"mypy>=1.0.0",
]
[tool.hatch.envs.types.scripts]
check = "mypy --install-types --non-interactive {args:src/markitdown tests}"
check = "mypy --install-types --non-interactive --ignore-missing-imports {args:src/markitdown tests}"
[tool.coverage.run]
source_pkgs = ["markitdown", "tests"]

View file

@ -68,6 +68,9 @@ class FileConversionException(MarkItDownException):
else:
message = f"File conversion failed after {len(attempts)} attempts:\n"
for attempt in attempts:
message += f" - {type(attempt.converter).__name__} threw {attempt.exc_info[0].__name__} with message: {attempt.exc_info[1]}\n"
if attempt.exc_info is None:
    # No exception details were recorded for this attempt, so only the
    # converter is named. The f-prefix is required for the placeholder
    # to be interpolated; the trailing newline keeps one attempt per
    # line, matching the branch below.
    message += f" - {type(attempt.converter).__name__} provided no execution info.\n"
else:
    # exc_info[0] is used as the exception class and exc_info[1] as the
    # value whose string form becomes the reported message.
    message += f" - {type(attempt.converter).__name__} threw {attempt.exc_info[0].__name__} with message: {attempt.exc_info[1]}\n"
super().__init__(message)

View file

@ -62,7 +62,8 @@ def _guess_stream_info_from_stream(
# Add a guess purely based on the filename hint
if filename_hint:
try:
mimetype, _ = mimetypes.guess_file_type(filename_hint)
# Requires Python 3.13+
mimetype, _ = mimetypes.guess_file_type(filename_hint) # type: ignore
except AttributeError:
mimetype, _ = mimetypes.guess_type(filename_hint)

View file

@ -1,7 +1,7 @@
import re
import markdownify
from typing import Any
from typing import Any, Optional
from urllib.parse import quote, unquote, urlparse, urlunparse
@ -20,7 +20,14 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
# Explicitly cast options to the expected type if necessary
super().__init__(**options)
def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
def convert_hn(
self,
n: int,
el: Any,
text: str,
convert_as_inline: Optional[bool] = False,
**kwargs,
) -> str:
"""Same as usual, but be sure to start with a new line"""
if not convert_as_inline:
if not re.search(r"^\n", text):
@ -28,7 +35,13 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
return super().convert_hn(n, el, text, convert_as_inline) # type: ignore
def convert_a(self, el: Any, text: str, convert_as_inline: bool):
def convert_a(
self,
el: Any,
text: str,
convert_as_inline: Optional[bool] = False,
**kwargs,
):
"""Same as usual converter, but removes Javascript links and escapes URIs."""
prefix, suffix, text = markdownify.chomp(text) # type: ignore
if not text:
@ -68,7 +81,13 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
else text
)
def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
def convert_img(
self,
el: Any,
text: str,
convert_as_inline: Optional[bool] = False,
**kwargs,
) -> str:
"""Same as usual converter, but removes data URIs"""
alt = el.attrs.get("alt", None) or ""