Merge pull request #2 from Noah-Zhuhaotian/dev

Convert file name without symbols
This commit is contained in:
Noah Zhu 2025-04-30 17:42:22 +12:00 committed by GitHub
commit 694b8a05b0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -3,6 +3,7 @@ import os
import re
import base64
import hashlib
import unicodedata
from typing import BinaryIO, Any, Dict, List, Tuple
from io import BytesIO
import json
@ -59,6 +60,35 @@ class DocxConverter(HtmlConverter):
return False
def _sanitize_filename(self, filename: str) -> str:
"""
Sanitize a filename by removing or replacing problematic characters.
Args:
filename: The original filename
Returns:
A sanitized filename safe for filesystem use
"""
# Step 1: Normalize unicode characters
filename = unicodedata.normalize("NFKD", filename)
# Step 2: Remove invalid characters and replace spaces with underscores
# Keep alphanumeric characters, underscores, hyphens, and periods
sanitized = re.sub(r"[^\w\-\.]", "_", filename)
# Step 3: Collapse multiple underscores
sanitized = re.sub(r"_+", "_", sanitized)
# Step 4: Remove leading/trailing underscores
sanitized = sanitized.strip("_")
# Step 5: Ensure we have a valid filename (default if empty)
if not sanitized:
sanitized = "unnamed"
return sanitized
def _get_document_name(self, stream_info: StreamInfo) -> str:
"""
Extract document name from StreamInfo and sanitize it
@ -68,7 +98,7 @@ class DocxConverter(HtmlConverter):
basename = os.path.basename(stream_info.filename)
name, _ = os.path.splitext(basename)
if name:
return name
return self._sanitize_filename(name)
# If local_path exists, try to extract from local path
if stream_info.local_path:
@ -77,14 +107,6 @@ class DocxConverter(HtmlConverter):
if name:
return name
# If URL exists, try to extract from URL
if stream_info.url:
basename = os.path.basename(stream_info.url)
name, _ = os.path.splitext(basename)
if name:
print(f"[DEBUG] Extracted document name from URL: {name}")
return name
# Default name
return "docx_document"