Sanitize document file names by replacing symbols with underscores

This commit is contained in:
朱昊天 2025-04-30 17:41:39 +12:00
parent b7f645ed2c
commit dcf48f042d

View file

@ -3,6 +3,7 @@ import os
import re import re
import base64 import base64
import hashlib import hashlib
import unicodedata
from typing import BinaryIO, Any, Dict, List, Tuple from typing import BinaryIO, Any, Dict, List, Tuple
from io import BytesIO from io import BytesIO
import json import json
@ -59,6 +60,35 @@ class DocxConverter(HtmlConverter):
return False return False
def _sanitize_filename(self, filename: str) -> str:
"""
Sanitize a filename by removing or replacing problematic characters.
Args:
filename: The original filename
Returns:
A sanitized filename safe for filesystem use
"""
# Step 1: Normalize unicode characters
filename = unicodedata.normalize("NFKD", filename)
# Step 2: Remove invalid characters and replace spaces with underscores
# Keep alphanumeric characters, underscores, hyphens, and periods
sanitized = re.sub(r"[^\w\-\.]", "_", filename)
# Step 3: Collapse multiple underscores
sanitized = re.sub(r"_+", "_", sanitized)
# Step 4: Remove leading/trailing underscores
sanitized = sanitized.strip("_")
# Step 5: Ensure we have a valid filename (default if empty)
if not sanitized:
sanitized = "unnamed"
return sanitized
def _get_document_name(self, stream_info: StreamInfo) -> str: def _get_document_name(self, stream_info: StreamInfo) -> str:
""" """
Extract document name from StreamInfo and sanitize it Extract document name from StreamInfo and sanitize it
@ -68,7 +98,7 @@ class DocxConverter(HtmlConverter):
basename = os.path.basename(stream_info.filename) basename = os.path.basename(stream_info.filename)
name, _ = os.path.splitext(basename) name, _ = os.path.splitext(basename)
if name: if name:
return name return self._sanitize_filename(name)
# If local_path exists, try to extract from local path # If local_path exists, try to extract from local path
if stream_info.local_path: if stream_info.local_path:
@ -77,14 +107,6 @@ class DocxConverter(HtmlConverter):
if name: if name:
return name return name
# If URL exists, try to extract from URL
if stream_info.url:
basename = os.path.basename(stream_info.url)
name, _ = os.path.splitext(basename)
if name:
print(f"[DEBUG] Extracted document name from URL: {name}")
return name
# Default name # Default name
return "docx_document" return "docx_document"