From 4eb0107e2f54036e01dc750d103d53dd8dbfaebf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=B1=E6=98=8A=E5=A4=A9?= Date: Wed, 30 Apr 2025 13:55:38 +1200 Subject: [PATCH] enhancewebsite --- .../src/markitdown/converters/_markdownify.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/packages/markitdown/src/markitdown/converters/_markdownify.py b/packages/markitdown/src/markitdown/converters/_markdownify.py index 47a5f58..25ab57b 100644 --- a/packages/markitdown/src/markitdown/converters/_markdownify.py +++ b/packages/markitdown/src/markitdown/converters/_markdownify.py @@ -104,7 +104,8 @@ class _CustomMarkdownify(markdownify.MarkdownConverter): Supports categorized storage in subfolders by document name """ alt = el.attrs.get("alt", None) or "" - src = el.attrs.get("src", None) or "" + # src = el.attrs.get("src", None) or "" + src = el.attrs.get("src", None) or el.attrs.get("data-src", None) or "" title = el.attrs.get("title", None) or "" title_part = ' "%s"' % title.replace('"', r"\"") if title else "" @@ -166,8 +167,14 @@ class _CustomMarkdownify(markdownify.MarkdownConverter): print(f"[ERROR] {error_msg}", file=sys.stderr) import traceback traceback.print_exc(file=sys.stderr) + # If extraction fails, revert to original truncating behavior + src = src.split(",")[0] + "..." return f"![{alt}](image_error.png) " - + + # Process other data URIs that are not images (truncate them) + elif src.startswith("data:") and not self.options.get("keep_data_uris", False): + src = src.split(",")[0] + "..." + # Return Markdown format image reference return f"![{alt}]({src}{title_part})"