Merge pull request #2 from Noah-Zhuhaotian/dev
Convert file name without symbols
This commit is contained in:
commit
694b8a05b0
1 changed file with 31 additions and 9 deletions
|
|
@ -3,6 +3,7 @@ import os
|
||||||
import re
|
import re
|
||||||
import base64
|
import base64
|
||||||
import hashlib
|
import hashlib
|
||||||
|
import unicodedata
|
||||||
from typing import BinaryIO, Any, Dict, List, Tuple
|
from typing import BinaryIO, Any, Dict, List, Tuple
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
import json
|
import json
|
||||||
|
|
@ -59,6 +60,35 @@ class DocxConverter(HtmlConverter):
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def _sanitize_filename(self, filename: str) -> str:
|
||||||
|
"""
|
||||||
|
Sanitize a filename by removing or replacing problematic characters.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
filename: The original filename
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A sanitized filename safe for filesystem use
|
||||||
|
"""
|
||||||
|
# Step 1: Normalize unicode characters
|
||||||
|
filename = unicodedata.normalize("NFKD", filename)
|
||||||
|
|
||||||
|
# Step 2: Remove invalid characters and replace spaces with underscores
|
||||||
|
# Keep alphanumeric characters, underscores, hyphens, and periods
|
||||||
|
sanitized = re.sub(r"[^\w\-\.]", "_", filename)
|
||||||
|
|
||||||
|
# Step 3: Collapse multiple underscores
|
||||||
|
sanitized = re.sub(r"_+", "_", sanitized)
|
||||||
|
|
||||||
|
# Step 4: Remove leading/trailing underscores
|
||||||
|
sanitized = sanitized.strip("_")
|
||||||
|
|
||||||
|
# Step 5: Ensure we have a valid filename (default if empty)
|
||||||
|
if not sanitized:
|
||||||
|
sanitized = "unnamed"
|
||||||
|
|
||||||
|
return sanitized
|
||||||
|
|
||||||
def _get_document_name(self, stream_info: StreamInfo) -> str:
|
def _get_document_name(self, stream_info: StreamInfo) -> str:
|
||||||
"""
|
"""
|
||||||
Extract document name from StreamInfo and sanitize it
|
Extract document name from StreamInfo and sanitize it
|
||||||
|
|
@ -68,7 +98,7 @@ class DocxConverter(HtmlConverter):
|
||||||
basename = os.path.basename(stream_info.filename)
|
basename = os.path.basename(stream_info.filename)
|
||||||
name, _ = os.path.splitext(basename)
|
name, _ = os.path.splitext(basename)
|
||||||
if name:
|
if name:
|
||||||
return name
|
return self._sanitize_filename(name)
|
||||||
|
|
||||||
# If local_path exists, try to extract from local path
|
# If local_path exists, try to extract from local path
|
||||||
if stream_info.local_path:
|
if stream_info.local_path:
|
||||||
|
|
@ -77,14 +107,6 @@ class DocxConverter(HtmlConverter):
|
||||||
if name:
|
if name:
|
||||||
return name
|
return name
|
||||||
|
|
||||||
# If URL exists, try to extract from URL
|
|
||||||
if stream_info.url:
|
|
||||||
basename = os.path.basename(stream_info.url)
|
|
||||||
name, _ = os.path.splitext(basename)
|
|
||||||
if name:
|
|
||||||
print(f"[DEBUG] Extracted document name from URL: {name}")
|
|
||||||
return name
|
|
||||||
|
|
||||||
# Default name
|
# Default name
|
||||||
return "docx_document"
|
return "docx_document"
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue