pre-commit
This commit is contained in:
parent
92f427477a
commit
d8a8cda4db
2 changed files with 46 additions and 40 deletions
|
|
@ -66,17 +66,17 @@ class DocxConverter(HtmlConverter):
|
||||||
A sanitized filename safe for filesystem use
|
A sanitized filename safe for filesystem use
|
||||||
"""
|
"""
|
||||||
# Step 1: Normalize unicode characters
|
# Step 1: Normalize unicode characters
|
||||||
filename = unicodedata.normalize('NFKD', filename)
|
filename = unicodedata.normalize("NFKD", filename)
|
||||||
|
|
||||||
# Step 2: Remove invalid characters and replace spaces with underscores
|
# Step 2: Remove invalid characters and replace spaces with underscores
|
||||||
# Keep alphanumeric characters, underscores, hyphens, and periods
|
# Keep alphanumeric characters, underscores, hyphens, and periods
|
||||||
sanitized = re.sub(r'[^\w\-\.]', '_', filename)
|
sanitized = re.sub(r"[^\w\-\.]", "_", filename)
|
||||||
|
|
||||||
# Step 3: Collapse multiple underscores
|
# Step 3: Collapse multiple underscores
|
||||||
sanitized = re.sub(r'_+', '_', sanitized)
|
sanitized = re.sub(r"_+", "_", sanitized)
|
||||||
|
|
||||||
# Step 4: Remove leading/trailing underscores
|
# Step 4: Remove leading/trailing underscores
|
||||||
sanitized = sanitized.strip('_')
|
sanitized = sanitized.strip("_")
|
||||||
|
|
||||||
# Step 5: Ensure we have a valid filename (default if empty)
|
# Step 5: Ensure we have a valid filename (default if empty)
|
||||||
if not sanitized:
|
if not sanitized:
|
||||||
|
|
@ -141,16 +141,18 @@ class DocxConverter(HtmlConverter):
|
||||||
pre_process_stream = pre_process_docx(file_stream)
|
pre_process_stream = pre_process_docx(file_stream)
|
||||||
|
|
||||||
# Convert to HTML and pass necessary parameters to HTML converter
|
# Convert to HTML and pass necessary parameters to HTML converter
|
||||||
html_content = mammoth.convert_to_html(pre_process_stream, style_map=style_map).value
|
html_content = mammoth.convert_to_html(
|
||||||
|
pre_process_stream, style_map=style_map
|
||||||
|
).value
|
||||||
|
|
||||||
# Create new StreamInfo to pass to HTML converter
|
# Create new StreamInfo to pass to HTML converter
|
||||||
html_stream_info = stream_info.copy_and_update(
|
html_stream_info = stream_info.copy_and_update(
|
||||||
mimetype="text/html",
|
mimetype="text/html", extension=".html"
|
||||||
extension=".html"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Use io.BytesIO to create binary stream
|
# Use io.BytesIO to create binary stream
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
|
||||||
return self._html_converter.convert(
|
return self._html_converter.convert(
|
||||||
file_stream=BytesIO(html_content.encode("utf-8")),
|
file_stream=BytesIO(html_content.encode("utf-8")),
|
||||||
stream_info=html_stream_info,
|
stream_info=html_stream_info,
|
||||||
|
|
|
||||||
|
|
@ -109,14 +109,15 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
||||||
title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
|
title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
|
||||||
|
|
||||||
# If in inline mode and not preserved, return alt text
|
# If in inline mode and not preserved, return alt text
|
||||||
if (
|
if convert_as_inline and el.parent.name not in self.options.get(
|
||||||
convert_as_inline
|
"keep_inline_images_in", []
|
||||||
and el.parent.name not in self.options.get("keep_inline_images_in", [])
|
|
||||||
):
|
):
|
||||||
return alt
|
return alt
|
||||||
|
|
||||||
# Process data URI format images
|
# Process data URI format images
|
||||||
if src.startswith("data:image") and not self.options.get("keep_data_uris", False):
|
if src.startswith("data:image") and not self.options.get(
|
||||||
|
"keep_data_uris", False
|
||||||
|
):
|
||||||
try:
|
try:
|
||||||
# Parse MIME type
|
# Parse MIME type
|
||||||
mime_type = src.split(";")[0].replace("data:", "")
|
mime_type = src.split(";")[0].replace("data:", "")
|
||||||
|
|
@ -126,7 +127,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
||||||
"image/png": ".png",
|
"image/png": ".png",
|
||||||
"image/jpeg": ".jpg",
|
"image/jpeg": ".jpg",
|
||||||
"image/jpg": ".jpg",
|
"image/jpg": ".jpg",
|
||||||
"image/gif": ".gif"
|
"image/gif": ".gif",
|
||||||
}.get(mime_type, ".png")
|
}.get(mime_type, ".png")
|
||||||
|
|
||||||
# Decode base64 data
|
# Decode base64 data
|
||||||
|
|
@ -138,9 +139,11 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
||||||
filename = f"image_{hashname}{ext}"
|
filename = f"image_{hashname}{ext}"
|
||||||
|
|
||||||
# Determine output directory
|
# Determine output directory
|
||||||
if hasattr(self, 'conversion_name') and self.conversion_name:
|
if hasattr(self, "conversion_name") and self.conversion_name:
|
||||||
# If conversion_name exists, create subfolder
|
# If conversion_name exists, create subfolder
|
||||||
output_dir = os.path.join(self.image_output_dir, self.conversion_name)
|
output_dir = os.path.join(
|
||||||
|
self.image_output_dir, self.conversion_name
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
# Otherwise use base directory
|
# Otherwise use base directory
|
||||||
output_dir = self.image_output_dir
|
output_dir = self.image_output_dir
|
||||||
|
|
@ -163,6 +166,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_msg = f"Error saving image: {str(e)}"
|
error_msg = f"Error saving image: {str(e)}"
|
||||||
import traceback
|
import traceback
|
||||||
|
|
||||||
traceback.print_exc(file=sys.stderr)
|
traceback.print_exc(file=sys.stderr)
|
||||||
# If extraction fails, revert to original truncating behavior
|
# If extraction fails, revert to original truncating behavior
|
||||||
src = src.split(",")[0] + "..."
|
src = src.split(",")[0] + "..."
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue