Merge pull request #2 from Noah-Zhuhaotian/dev

Convert file name without symbols
This commit is contained in:
Noah Zhu 2025-04-30 17:42:22 +12:00 committed by GitHub
commit 694b8a05b0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -3,6 +3,7 @@ import os
import re import re
import base64 import base64
import hashlib import hashlib
import unicodedata
from typing import BinaryIO, Any, Dict, List, Tuple from typing import BinaryIO, Any, Dict, List, Tuple
from io import BytesIO from io import BytesIO
import json import json
@ -59,6 +60,35 @@ class DocxConverter(HtmlConverter):
return False return False
def _sanitize_filename(self, filename: str) -> str:
"""
Sanitize a filename by removing or replacing problematic characters.
Args:
filename: The original filename
Returns:
A sanitized filename safe for filesystem use
"""
# Step 1: Normalize unicode characters
filename = unicodedata.normalize("NFKD", filename)
# Step 2: Remove invalid characters and replace spaces with underscores
# Keep alphanumeric characters, underscores, hyphens, and periods
sanitized = re.sub(r"[^\w\-\.]", "_", filename)
# Step 3: Collapse multiple underscores
sanitized = re.sub(r"_+", "_", sanitized)
# Step 4: Remove leading/trailing underscores
sanitized = sanitized.strip("_")
# Step 5: Ensure we have a valid filename (default if empty)
if not sanitized:
sanitized = "unnamed"
return sanitized
def _get_document_name(self, stream_info: StreamInfo) -> str: def _get_document_name(self, stream_info: StreamInfo) -> str:
""" """
Extract document name from StreamInfo and sanitize it Extract document name from StreamInfo and sanitize it
@ -68,7 +98,7 @@ class DocxConverter(HtmlConverter):
basename = os.path.basename(stream_info.filename) basename = os.path.basename(stream_info.filename)
name, _ = os.path.splitext(basename) name, _ = os.path.splitext(basename)
if name: if name:
return name return self._sanitize_filename(name)
# If local_path exists, try to extract from local path # If local_path exists, try to extract from local path
if stream_info.local_path: if stream_info.local_path:
@ -77,14 +107,6 @@ class DocxConverter(HtmlConverter):
if name: if name:
return name return name
# If URL exists, try to extract from URL
if stream_info.url:
basename = os.path.basename(stream_info.url)
name, _ = os.path.splitext(basename)
if name:
print(f"[DEBUG] Extracted document name from URL: {name}")
return name
# Default name # Default name
return "docx_document" return "docx_document"