Added .docx to optional dependencies

This commit is contained in:
Adam Fourney 2025-02-28 17:06:59 -08:00
parent b9487b6b6d
commit 98698a64ce
3 changed files with 31 additions and 5 deletions

View file

@ -26,7 +26,6 @@ classifiers = [
dependencies = [
"beautifulsoup4",
"requests",
"mammoth",
"markdownify~=0.14.1",
"numpy",
"pandas",
@ -46,8 +45,12 @@ dependencies = [
]
[project.optional-dependencies]
all = ["python-pptx"]
all = [
"python-pptx",
"mammoth"
]
pptx = ["python-pptx"]
docx = ["mammoth"]
[project.urls]
Documentation = "https://github.com/microsoft/markitdown#readme"

View file

@ -1,6 +1,6 @@
from typing import Union
import sys
import mammoth
from typing import Union
from ._base import (
DocumentConverterResult,
@ -8,6 +8,16 @@ from ._base import (
from ._base import DocumentConverter
from ._html_converter import HtmlConverter
from .._exceptions import MissingDependencyException
# Try loading the optional dependency. It is optional for the package as a
# whole, but required by this converter.
# An import failure is not raised here; it is saved so it can be reported later.
# None means the import succeeded; otherwise this holds the sys.exc_info() tuple.
_dependency_exc_info = None
try:
import mammoth
except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()
class DocxConverter(HtmlConverter):
@ -26,6 +36,19 @@ class DocxConverter(HtmlConverter):
if extension.lower() != ".docx":
return None
# Load the dependencies
if _dependency_exc_info is not None:
raise MissingDependencyException(
f"""{type(self).__name__} recognized the input as a potential .docx file, but the dependencies needed to read .docx files have not been installed. To resolve this error, include the optional dependency [docx] or [all] when installing MarkItDown. For example:
* pip install markitdown[docx]
* pip install markitdown[all]
* pip install markitdown[pptx, docx, ...]
* etc."""
) from _dependency_exc_info[1].with_traceback(
_dependency_exc_info[2]
) # Restore the original traceback
result = None
with open(local_path, "rb") as docx_file:
style_map = kwargs.get("style_map", None)

View file

@ -67,7 +67,7 @@ class PptxConverter(HtmlConverter):
# Load the dependencies
if _dependency_exc_info is not None:
raise MissingDependencyException(
f"""{type(self).__name__} recognized the input as a .pptx file, but the dependencies needed to read .pptx files have not been installed. To resolve this error, include the optional dependency [pptx] or [all] when installing MarkItDown. For example:
f"""{type(self).__name__} recognized the input as a potential .pptx file, but the dependencies needed to read .pptx files have not been installed. To resolve this error, include the optional dependency [pptx] or [all] when installing MarkItDown. For example:
* pip install markitdown[pptx]
* pip install markitdown[all]