Pre-commit
This commit is contained in:
parent
3a6f023f0b
commit
1b0d1491be
2 changed files with 8 additions and 6 deletions
|
|
@ -705,18 +705,18 @@ class DocxConverter(HtmlConverter):
|
||||||
"""Sanitizes a string to make it a valid file name across different operating systems."""
|
"""Sanitizes a string to make it a valid file name across different operating systems."""
|
||||||
# Normalize underscore
|
# Normalize underscore
|
||||||
name = re.sub(r"\s+", "_", name.strip())
|
name = re.sub(r"\s+", "_", name.strip())
|
||||||
|
|
||||||
# Replace invalid characters with underscores
|
# Replace invalid characters with underscores
|
||||||
name = re.sub(r'[\\/*?:"<>|]', "_", name)
|
name = re.sub(r'[\\/*?:"<>|]', "_", name)
|
||||||
|
|
||||||
# Remove leading and trailing dots and spaces
|
# Remove leading and trailing dots and spaces
|
||||||
name = name.strip(" .")
|
name = name.strip(" .")
|
||||||
|
|
||||||
# Limit the length of the filename to a reasonable length (e.g., 251 characters)
|
# Limit the length of the filename to a reasonable length (e.g., 251 characters)
|
||||||
max_length = 251
|
max_length = 251
|
||||||
if len(name) > max_length:
|
if len(name) > max_length:
|
||||||
name = name[:max_length]
|
name = name[:max_length]
|
||||||
|
|
||||||
return name
|
return name
|
||||||
|
|
||||||
def truncate_filename(self, name: str, max_length: int, extension: str = "") -> str:
|
def truncate_filename(self, name: str, max_length: int, extension: str = "") -> str:
|
||||||
|
|
@ -736,7 +736,9 @@ class DocxConverter(HtmlConverter):
|
||||||
while os.path.exists(unique_path):
|
while os.path.exists(unique_path):
|
||||||
suffix = f"_{counter}"
|
suffix = f"_{counter}"
|
||||||
# Ensure base is short enough to add the suffix
|
# Ensure base is short enough to add the suffix
|
||||||
truncated_base = self.truncate_filename(base, max_length - len(suffix) - len(ext))
|
truncated_base = self.truncate_filename(
|
||||||
|
base, max_length - len(suffix) - len(ext)
|
||||||
|
)
|
||||||
unique_path = f"{truncated_base}{suffix}{ext}"
|
unique_path = f"{truncated_base}{suffix}{ext}"
|
||||||
counter += 1
|
counter += 1
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -96,7 +96,7 @@ DOCX_IMAGES_TEST_STRINGS = [
|
||||||
"# Abstract",
|
"# Abstract",
|
||||||
"# Introduction",
|
"# Introduction",
|
||||||
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
|
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
|
||||||
"GitHub_-_microsoft_markitdown__Python_tool_for_converting_files_and_office_documents_to_Markdown.png"
|
"GitHub_-_microsoft_markitdown__Python_tool_for_converting_files_and_office_documents_to_Markdown.png",
|
||||||
]
|
]
|
||||||
|
|
||||||
PPTX_TEST_STRINGS = [
|
PPTX_TEST_STRINGS = [
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue