From dcf48f042d837242a1df1d04d4d6c719ce9d341e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9C=B1=E6=98=8A=E5=A4=A9?=
Date: Wed, 30 Apr 2025 17:41:39 +1200
Subject: [PATCH] Convert file name without symbols

---
 .../markitdown/converters/_docx_converter.py  | 42 ++++++++++++++-----
 1 file changed, 32 insertions(+), 10 deletions(-)

diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py
index 1fcff58..6f8650b 100644
--- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@@ -3,6 +3,7 @@ import os
 import re
 import base64
 import hashlib
+import unicodedata
 from typing import BinaryIO, Any, Dict, List, Tuple
 from io import BytesIO
 import json
@@ -59,6 +60,35 @@ class DocxConverter(HtmlConverter):
 
         return False
 
+    def _sanitize_filename(self, filename: str) -> str:
+        """
+        Sanitize a filename by replacing problematic characters with underscores.
+
+        Args:
+            filename: The original filename
+
+        Returns:
+            A sanitized filename safe for filesystem use ("unnamed" if nothing is left)
+        """
+        # Step 1: Normalize to composed form (NFKC) so accents stay attached to letters
+        filename = unicodedata.normalize("NFKC", filename)
+
+        # Step 2: Replace every character that is not a word character,
+        # hyphen, or period (including spaces) with an underscore
+        sanitized = re.sub(r"[^\w\-\.]", "_", filename)
+
+        # Step 3: Collapse multiple underscores
+        sanitized = re.sub(r"_+", "_", sanitized)
+
+        # Step 4: Remove leading/trailing underscores
+        sanitized = sanitized.strip("_")
+
+        # Step 5: Ensure we have a valid filename (default if empty)
+        if not sanitized:
+            sanitized = "unnamed"
+
+        return sanitized
+
     def _get_document_name(self, stream_info: StreamInfo) -> str:
         """
         Extract document name from StreamInfo and sanitize it
@@ -68,7 +98,7 @@ class DocxConverter(HtmlConverter):
             basename = os.path.basename(stream_info.filename)
             name, _ = os.path.splitext(basename)
             if name:
-                return name
+                return self._sanitize_filename(name)
 
         # If local_path exists, try to extract from local path
         if stream_info.local_path:
@@ -77,14 +107,6 @@ class DocxConverter(HtmlConverter):
             if name:
-                return name
+                return self._sanitize_filename(name)
 
-        # If URL exists, try to extract from URL
-        if stream_info.url:
-            basename = os.path.basename(stream_info.url)
-            name, _ = os.path.splitext(basename)
-            if name:
-                print(f"[DEBUG] Extracted document name from URL: {name}")
-                return name
-
         # Default name
         return "docx_document"
 