pre-commit

This commit is contained in:
朱昊天 2025-04-30 15:21:41 +12:00
parent 92f427477a
commit d8a8cda4db
2 changed files with 46 additions and 40 deletions

View file

@ -54,34 +54,34 @@ class DocxConverter(HtmlConverter):
return True return True
return False return False
def _sanitize_filename(self, filename: str) -> str: def _sanitize_filename(self, filename: str) -> str:
""" """
Sanitize a filename by removing or replacing problematic characters. Sanitize a filename by removing or replacing problematic characters.
Args: Args:
filename: The original filename filename: The original filename
Returns: Returns:
A sanitized filename safe for filesystem use A sanitized filename safe for filesystem use
""" """
# Step 1: Normalize unicode characters # Step 1: Normalize unicode characters
filename = unicodedata.normalize('NFKD', filename) filename = unicodedata.normalize("NFKD", filename)
# Step 2: Remove invalid characters and replace spaces with underscores # Step 2: Remove invalid characters and replace spaces with underscores
# Keep alphanumeric characters, underscores, hyphens, and periods # Keep alphanumeric characters, underscores, hyphens, and periods
sanitized = re.sub(r'[^\w\-\.]', '_', filename) sanitized = re.sub(r"[^\w\-\.]", "_", filename)
# Step 3: Collapse multiple underscores # Step 3: Collapse multiple underscores
sanitized = re.sub(r'_+', '_', sanitized) sanitized = re.sub(r"_+", "_", sanitized)
# Step 4: Remove leading/trailing underscores # Step 4: Remove leading/trailing underscores
sanitized = sanitized.strip('_') sanitized = sanitized.strip("_")
# Step 5: Ensure we have a valid filename (default if empty) # Step 5: Ensure we have a valid filename (default if empty)
if not sanitized: if not sanitized:
sanitized = "unnamed" sanitized = "unnamed"
return sanitized return sanitized
def _get_document_name(self, stream_info: StreamInfo) -> str: def _get_document_name(self, stream_info: StreamInfo) -> str:
@ -94,21 +94,21 @@ class DocxConverter(HtmlConverter):
name, _ = os.path.splitext(basename) name, _ = os.path.splitext(basename)
if name: if name:
return self._sanitize_filename(name) return self._sanitize_filename(name)
# If local_path exists, try to extract from local path # If local_path exists, try to extract from local path
if stream_info.local_path: if stream_info.local_path:
basename = os.path.basename(stream_info.local_path) basename = os.path.basename(stream_info.local_path)
name, _ = os.path.splitext(basename) name, _ = os.path.splitext(basename)
if name: if name:
return self._sanitize_filename(name) return self._sanitize_filename(name)
# If URL exists, try to extract from URL # If URL exists, try to extract from URL
if stream_info.url: if stream_info.url:
basename = os.path.basename(stream_info.url) basename = os.path.basename(stream_info.url)
name, _ = os.path.splitext(basename) name, _ = os.path.splitext(basename)
if name: if name:
return self._sanitize_filename(name) return self._sanitize_filename(name)
# Default name # Default name
return "docx_document" return "docx_document"
@ -117,7 +117,7 @@ class DocxConverter(HtmlConverter):
file_stream: BinaryIO, file_stream: BinaryIO,
stream_info: StreamInfo, stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter **kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult: ) -> DocumentConverterResult:
# Check dependencies # Check dependencies
if _dependency_exc_info is not None: if _dependency_exc_info is not None:
raise MissingDependencyException( raise MissingDependencyException(
@ -139,20 +139,22 @@ class DocxConverter(HtmlConverter):
style_map = kwargs.get("style_map", None) style_map = kwargs.get("style_map", None)
pre_process_stream = pre_process_docx(file_stream) pre_process_stream = pre_process_docx(file_stream)
# Convert to HTML and pass necessary parameters to HTML converter # Convert to HTML and pass necessary parameters to HTML converter
html_content = mammoth.convert_to_html(pre_process_stream, style_map=style_map).value html_content = mammoth.convert_to_html(
pre_process_stream, style_map=style_map
).value
# Create new StreamInfo to pass to HTML converter # Create new StreamInfo to pass to HTML converter
html_stream_info = stream_info.copy_and_update( html_stream_info = stream_info.copy_and_update(
mimetype="text/html", mimetype="text/html", extension=".html"
extension=".html"
) )
# Use io.BytesIO to create binary stream # Use io.BytesIO to create binary stream
from io import BytesIO from io import BytesIO
return self._html_converter.convert( return self._html_converter.convert(
file_stream=BytesIO(html_content.encode("utf-8")), file_stream=BytesIO(html_content.encode("utf-8")),
stream_info=html_stream_info, stream_info=html_stream_info,
**kwargs, **kwargs,
) )

View file

@ -27,7 +27,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
# Apply basic options # Apply basic options
options["heading_style"] = options.get("heading_style", markdownify.ATX) options["heading_style"] = options.get("heading_style", markdownify.ATX)
options["keep_data_uris"] = options.get("keep_data_uris", False) options["keep_data_uris"] = options.get("keep_data_uris", False)
# Initialize parent class # Initialize parent class
super().__init__(**options) super().__init__(**options)
@ -107,62 +107,66 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
src = el.attrs.get("src", None) or el.attrs.get("data-src", None) or "" src = el.attrs.get("src", None) or el.attrs.get("data-src", None) or ""
title = el.attrs.get("title", None) or "" title = el.attrs.get("title", None) or ""
title_part = ' "%s"' % title.replace('"', r"\"") if title else "" title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
# If in inline mode and not preserved, return alt text # If in inline mode and not preserved, return alt text
if ( if convert_as_inline and el.parent.name not in self.options.get(
convert_as_inline "keep_inline_images_in", []
and el.parent.name not in self.options.get("keep_inline_images_in", [])
): ):
return alt return alt
# Process data URI format images # Process data URI format images
if src.startswith("data:image") and not self.options.get("keep_data_uris", False): if src.startswith("data:image") and not self.options.get(
"keep_data_uris", False
):
try: try:
# Parse MIME type # Parse MIME type
mime_type = src.split(";")[0].replace("data:", "") mime_type = src.split(";")[0].replace("data:", "")
# Get file extension # Get file extension
ext = { ext = {
"image/png": ".png", "image/png": ".png",
"image/jpeg": ".jpg", "image/jpeg": ".jpg",
"image/jpg": ".jpg", "image/jpg": ".jpg",
"image/gif": ".gif" "image/gif": ".gif",
}.get(mime_type, ".png") }.get(mime_type, ".png")
# Decode base64 data # Decode base64 data
encoded = src.split(",")[1] encoded = src.split(",")[1]
image_data = base64.b64decode(encoded) image_data = base64.b64decode(encoded)
# Generate unique filename # Generate unique filename
hashname = hashlib.sha256(image_data).hexdigest()[:8] hashname = hashlib.sha256(image_data).hexdigest()[:8]
filename = f"image_{hashname}{ext}" filename = f"image_{hashname}{ext}"
# Determine output directory # Determine output directory
if hasattr(self, 'conversion_name') and self.conversion_name: if hasattr(self, "conversion_name") and self.conversion_name:
# If conversion_name exists, create subfolder # If conversion_name exists, create subfolder
output_dir = os.path.join(self.image_output_dir, self.conversion_name) output_dir = os.path.join(
self.image_output_dir, self.conversion_name
)
else: else:
# Otherwise use base directory # Otherwise use base directory
output_dir = self.image_output_dir output_dir = self.image_output_dir
# Ensure directory exists # Ensure directory exists
os.makedirs(output_dir, exist_ok=True) os.makedirs(output_dir, exist_ok=True)
# Save image file # Save image file
filepath = os.path.join(output_dir, filename) filepath = os.path.join(output_dir, filename)
with open(filepath, "wb") as f: with open(filepath, "wb") as f:
f.write(image_data) f.write(image_data)
# Update src to relative path # Update src to relative path
src = os.path.join(output_dir, filename).replace("\\", "/") src = os.path.join(output_dir, filename).replace("\\", "/")
# If alt text is empty, use the image filename (without extension) as alt text # If alt text is empty, use the image filename (without extension) as alt text
if not alt: if not alt:
alt = f"image_{hashname}" alt = f"image_{hashname}"
except Exception as e: except Exception as e:
error_msg = f"Error saving image: {str(e)}" error_msg = f"Error saving image: {str(e)}"
import traceback import traceback
traceback.print_exc(file=sys.stderr) traceback.print_exc(file=sys.stderr)
# If extraction fails, revert to original truncating behavior # If extraction fails, revert to original truncating behavior
src = src.split(",")[0] + "..." src = src.split(",")[0] + "..."