enhanceextraimage
This commit is contained in:
parent
4eb0107e2f
commit
a0b3cf7de3
2 changed files with 2 additions and 8 deletions
|
|
@ -61,6 +61,7 @@ class HtmlConverter(DocumentConverter):
|
||||||
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
|
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
|
||||||
|
|
||||||
assert isinstance(webpage_text, str)
|
assert isinstance(webpage_text, str)
|
||||||
|
|
||||||
# remove leading and trailing \n
|
# remove leading and trailing \n
|
||||||
webpage_text = webpage_text.strip()
|
webpage_text = webpage_text.strip()
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -104,7 +104,6 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
||||||
Supports categorized storage in subfolders by document name
|
Supports categorized storage in subfolders by document name
|
||||||
"""
|
"""
|
||||||
alt = el.attrs.get("alt", None) or ""
|
alt = el.attrs.get("alt", None) or ""
|
||||||
# src = el.attrs.get("src", None) or ""
|
|
||||||
src = el.attrs.get("src", None) or el.attrs.get("data-src", None) or ""
|
src = el.attrs.get("src", None) or el.attrs.get("data-src", None) or ""
|
||||||
title = el.attrs.get("title", None) or ""
|
title = el.attrs.get("title", None) or ""
|
||||||
title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
|
title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
|
||||||
|
|
@ -142,29 +141,23 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
||||||
if hasattr(self, 'conversion_name') and self.conversion_name:
|
if hasattr(self, 'conversion_name') and self.conversion_name:
|
||||||
# If conversion_name exists, create subfolder
|
# If conversion_name exists, create subfolder
|
||||||
output_dir = os.path.join(self.image_output_dir, self.conversion_name)
|
output_dir = os.path.join(self.image_output_dir, self.conversion_name)
|
||||||
print(f"[DEBUG] Using subfolder for image: {output_dir}")
|
|
||||||
else:
|
else:
|
||||||
# Otherwise use base directory
|
# Otherwise use base directory
|
||||||
output_dir = self.image_output_dir
|
output_dir = self.image_output_dir
|
||||||
print(f"[DEBUG] Using base directory for image: {output_dir}")
|
|
||||||
|
|
||||||
# Ensure directory exists
|
# Ensure directory exists
|
||||||
os.makedirs(output_dir, exist_ok=True)
|
os.makedirs(output_dir, exist_ok=True)
|
||||||
print(f"[DEBUG] Ensuring directory exists: {output_dir}")
|
|
||||||
|
|
||||||
# Save image file
|
# Save image file
|
||||||
filepath = os.path.join(output_dir, filename)
|
filepath = os.path.join(output_dir, filename)
|
||||||
with open(filepath, "wb") as f:
|
with open(filepath, "wb") as f:
|
||||||
f.write(image_data)
|
f.write(image_data)
|
||||||
print(f"[DEBUG] Image saved to: {filepath}")
|
|
||||||
|
|
||||||
# Update src to relative path
|
# Update src to relative path
|
||||||
src = os.path.join(output_dir, filename).replace("\\", "/")
|
src = os.path.join(output_dir, filename).replace("\\", "/")
|
||||||
print(f"[DEBUG] Updated image path to: {src}")
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_msg = f"Error saving image: {str(e)}"
|
error_msg = f"Error saving image: {str(e)}"
|
||||||
print(f"[ERROR] {error_msg}", file=sys.stderr)
|
|
||||||
import traceback
|
import traceback
|
||||||
traceback.print_exc(file=sys.stderr)
|
traceback.print_exc(file=sys.stderr)
|
||||||
# If extraction fails, revert to original truncating behavior
|
# If extraction fails, revert to original truncating behavior
|
||||||
|
|
@ -174,7 +167,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
||||||
# Process other data URIs that are not images (truncate them)
|
# Process other data URIs that are not images (truncate them)
|
||||||
elif src.startswith("data:") and not self.options.get("keep_data_uris", False):
|
elif src.startswith("data:") and not self.options.get("keep_data_uris", False):
|
||||||
src = src.split(",")[0] + "..."
|
src = src.split(",")[0] + "..."
|
||||||
|
|
||||||
# Return Markdown format image reference
|
# Return Markdown format image reference
|
||||||
return f"![{alt}]({src}{title_part})"
|
return f"![{alt}]({src}{title_part})"
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue