Merge pull request #1 from Noah-Zhuhaotian/dev

Dev
This commit is contained in:
Noah Zhu 2025-04-30 17:17:08 +12:00 committed by GitHub
commit 2524c00089
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 123 additions and 134 deletions

View file

@@ -1,9 +1,12 @@
import sys
import os
import re
import unicodedata
from typing import BinaryIO, Any
import base64
import hashlib
from typing import BinaryIO, Any, Dict, List, Tuple
from io import BytesIO
import json
from bs4 import BeautifulSoup
from ._html_converter import HtmlConverter
from ..converter_utils.docx.pre_process import pre_process_docx
@@ -30,7 +33,8 @@ ACCEPTED_FILE_EXTENSIONS = [".docx"]
class DocxConverter(HtmlConverter):
"""
Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
Extracts images from documents and saves them to document-specific subfolders.
"""
def __init__(self):
@@ -55,63 +59,106 @@ class DocxConverter(HtmlConverter):
return False
def _sanitize_filename(self, filename: str) -> str:
"""
Sanitize a filename by removing or replacing problematic characters.
Args:
filename: The original filename
Returns:
A sanitized filename safe for filesystem use
"""
# Step 1: Normalize unicode characters
filename = unicodedata.normalize('NFKD', filename)
# Step 2: Remove invalid characters and replace spaces with underscores
# Keep alphanumeric characters, underscores, hyphens, and periods
sanitized = re.sub(r'[^\w\-\.]', '_', filename)
# Step 3: Collapse multiple underscores
sanitized = re.sub(r'_+', '_', sanitized)
# Step 4: Remove leading/trailing underscores
sanitized = sanitized.strip('_')
# Step 5: Ensure we have a valid filename (default if empty)
if not sanitized:
sanitized = "unnamed"
return sanitized
def _get_document_name(self, stream_info: StreamInfo) -> str:
    """
    Extract a sanitized document name from *stream_info*.

    Candidates are probed in priority order: ``filename``, then
    ``local_path``, then ``url``.  The basename of the first non-empty
    candidate has its extension stripped and is passed through
    ``_sanitize_filename``.

    Args:
        stream_info: Stream metadata carrying possible name sources.

    Returns:
        The sanitized document name, or "docx_document" when no
        candidate yields a usable name.
    """
    # NOTE: the original contained unreachable `return name` statements
    # after each sanitized return, plus a leftover debug print; both
    # removed here.
    for candidate in (stream_info.filename, stream_info.local_path, stream_info.url):
        if not candidate:
            continue
        name, _ = os.path.splitext(os.path.basename(candidate))
        if name:
            return self._sanitize_filename(name)
    # Default name when every source is empty or extension-only.
    return "docx_document"
def _extract_and_save_images(
    self, html_content: str, doc_folder: str, assets_folder: str = "assets"
) -> str:
    """
    Extract base64 data-URI images from HTML, save them to the
    filesystem, and rewrite the HTML image references to the saved
    files.

    Args:
        html_content: The HTML content containing images.
        doc_folder: The document-specific folder name.
        assets_folder: The base folder for assets.

    Returns:
        Updated HTML with image ``src`` attributes pointing at saved
        files; the input unchanged when it contains no ``img`` tags.
        Malformed data URIs are skipped (best-effort), leaving the
        original reference in place.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    images = soup.find_all("img")
    if not images:
        return html_content

    # Create the per-document output directory.
    output_dir = os.path.join(assets_folder, doc_folder)
    os.makedirs(output_dir, exist_ok=True)

    # Loop-invariant mapping, hoisted out of the per-image loop.
    # Unknown image MIME types fall back to ".png".
    mime_to_ext = {
        "image/png": ".png",
        "image/jpeg": ".jpg",
        "image/jpg": ".jpg",
        "image/gif": ".gif",
    }

    for img in images:
        src = img.get("src", "") or img.get("data-src", "")
        if not src or not src.startswith("data:image"):
            continue
        try:
            # "data:image/png;base64,..." -> "image/png"
            mime_type = src.split(";")[0].replace("data:", "")
            ext = mime_to_ext.get(mime_type, ".png")
            # Decode the payload after the first comma.
            image_data = base64.b64decode(src.split(",", 1)[1])
            # Content-addressed filename: identical images share a file.
            hashname = hashlib.sha256(image_data).hexdigest()[:8]
            filename = f"image_{hashname}{ext}"
            filepath = os.path.join(output_dir, filename)
            with open(filepath, "wb") as f:
                f.write(image_data)
            # Forward slashes so the reference also works on Windows.
            img["src"] = os.path.join(output_dir, filename).replace("\\", "/")
            # Add alt text if empty.
            if not img.get("alt"):
                img["alt"] = f"image_{hashname}"
        except Exception:
            # Best-effort: a malformed data URI must not abort the
            # whole conversion; keep the original src and move on.
            continue

    return str(soup)
def convert(
self,
file_stream: BinaryIO,
@@ -132,27 +179,37 @@ class DocxConverter(HtmlConverter):
_dependency_exc_info[2]
)
# If conversion_name not explicitly provided, try to extract from stream_info
if "conversion_name" not in kwargs:
conversion_name = self._get_document_name(stream_info)
kwargs["conversion_name"] = conversion_name
# Get document name
doc_name = kwargs.get("conversion_name") or self._get_document_name(stream_info)
if hasattr(self, "sanitize_filename"):
doc_name = self.sanitize_filename(doc_name)
# Get assets folder
assets_folder = kwargs.get("image_output_dir", "assets")
# Convert DOCX to HTML
style_map = kwargs.get("style_map", None)
pre_process_stream = pre_process_docx(file_stream)
html_content = mammoth.convert_to_html(
pre_process_stream, style_map=style_map
).value
# Convert to HTML and pass necessary parameters to HTML converter
html_content = mammoth.convert_to_html(pre_process_stream, style_map=style_map).value
# Extract and save images, getting updated HTML with correct image references
processed_html = self._extract_and_save_images(
html_content, doc_name, assets_folder
)
# Create new StreamInfo to pass to HTML converter
# Create a new StreamInfo for the HTML converter
html_stream_info = stream_info.copy_and_update(
mimetype="text/html",
extension=".html"
mimetype="text/html", extension=".html"
)
# Use io.BytesIO to create binary stream
from io import BytesIO
# Use the standard HTML converter to convert to Markdown
# We don't need to pass conversion_name because images are already extracted
html_kwargs = {k: v for k, v in kwargs.items() if k != "conversion_name"}
return self._html_converter.convert(
file_stream=BytesIO(html_content.encode("utf-8")),
file_stream=BytesIO(processed_html.encode("utf-8")),
stream_info=html_stream_info,
**kwargs,
**html_kwargs,
)

View file

@@ -1,9 +1,5 @@
import re
import markdownify
import os
import base64
import hashlib
import sys
from typing import Any, Optional
from urllib.parse import quote, unquote, urlparse, urlunparse
@@ -20,15 +16,9 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
"""
def __init__(self, **options: Any):
# Set default values for image-related options
self.image_output_dir = options.get("image_output_dir", "assets")
self.conversion_name = options.get("conversion_name")
# Apply basic options
options["heading_style"] = options.get("heading_style", markdownify.ATX)
options["keep_data_uris"] = options.get("keep_data_uris", False)
# Initialize parent class
# Explicitly cast options to the expected type if necessary
super().__init__(**options)
def convert_hn(
@@ -99,81 +89,23 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
convert_as_inline: Optional[bool] = False,
**kwargs,
) -> str:
"""
Process image elements, save data URI format images to filesystem
Supports categorized storage in subfolders by document name
"""
"""Same as usual converter, but removes data URIs"""
alt = el.attrs.get("alt", None) or ""
src = el.attrs.get("src", None) or el.attrs.get("data-src", None) or ""
src = el.attrs.get("src", None) or ""
title = el.attrs.get("title", None) or ""
title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
# If in inline mode and not preserved, return alt text
if (
convert_as_inline
and el.parent.name not in self.options.get("keep_inline_images_in", [])
and el.parent.name not in self.options["keep_inline_images_in"]
):
return alt
# Process data URI format images
if src.startswith("data:image") and not self.options.get("keep_data_uris", False):
try:
# Parse MIME type
mime_type = src.split(";")[0].replace("data:", "")
# Get file extension
ext = {
"image/png": ".png",
"image/jpeg": ".jpg",
"image/jpg": ".jpg",
"image/gif": ".gif"
}.get(mime_type, ".png")
# Decode base64 data
encoded = src.split(",")[1]
image_data = base64.b64decode(encoded)
# Generate unique filename
hashname = hashlib.sha256(image_data).hexdigest()[:8]
filename = f"image_{hashname}{ext}"
# Determine output directory
if hasattr(self, 'conversion_name') and self.conversion_name:
# If conversion_name exists, create subfolder
output_dir = os.path.join(self.image_output_dir, self.conversion_name)
else:
# Otherwise use base directory
output_dir = self.image_output_dir
# Ensure directory exists
os.makedirs(output_dir, exist_ok=True)
# Save image file
filepath = os.path.join(output_dir, filename)
with open(filepath, "wb") as f:
f.write(image_data)
# Update src to relative path
src = os.path.join(output_dir, filename).replace("\\", "/")
# If alt text is empty, use the image filename (without extension) as alt text
if not alt:
alt = f"image_{hashname}"
except Exception as e:
error_msg = f"Error saving image: {str(e)}"
import traceback
traceback.print_exc(file=sys.stderr)
# If extraction fails, revert to original truncating behavior
src = src.split(",")[0] + "..."
return f"![{alt}](image_error.png) <!-- {error_msg} -->"
# Process other data URIs that are not images (truncate them)
elif src.startswith("data:") and not self.options.get("keep_data_uris", False):
# Remove dataURIs
if src.startswith("data:") and not self.options["keep_data_uris"]:
src = src.split(",")[0] + "..."
# Return Markdown format image reference
return f"![{alt}]({src}{title_part})"
return "![%s](%s%s)" % (alt, src, title_part)
def convert_soup(self, soup: Any) -> str:
    """Render a parsed BeautifulSoup tree to Markdown via the base converter."""
    rendered = super().convert_soup(soup)  # type: ignore
    return rendered