From 98698a64ce24a1d74957a4b59aef1797f2f239bf Mon Sep 17 00:00:00 2001 From: Adam Fourney Date: Fri, 28 Feb 2025 17:06:59 -0800 Subject: [PATCH] Added .docx to optional dependencies --- packages/markitdown/pyproject.toml | 7 +++-- .../markitdown/converters/_docx_converter.py | 27 +++++++++++++++++-- .../markitdown/converters/_pptx_converter.py | 2 +- 3 files changed, 31 insertions(+), 5 deletions(-) diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml index 3343eae..5e56642 100644 --- a/packages/markitdown/pyproject.toml +++ b/packages/markitdown/pyproject.toml @@ -26,7 +26,6 @@ classifiers = [ dependencies = [ "beautifulsoup4", "requests", - "mammoth", "markdownify~=0.14.1", "numpy", "pandas", @@ -46,8 +45,12 @@ dependencies = [ ] [project.optional-dependencies] -all = ["python-pptx"] +all = [ + "python-pptx", + "mammoth" +] pptx = ["python-pptx"] +docx = ["mammoth"] [project.urls] Documentation = "https://github.com/microsoft/markitdown#readme" diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py index 8515f6d..b68500f 100644 --- a/packages/markitdown/src/markitdown/converters/_docx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py @@ -1,6 +1,6 @@ -from typing import Union +import sys -import mammoth +from typing import Union from ._base import ( DocumentConverterResult, @@ -8,6 +8,16 @@ from ._base import ( from ._base import DocumentConverter from ._html_converter import HtmlConverter +from .._exceptions import MissingDependencyException + +# Try loading optional (but in this case, required) dependencies +# Save reporting of any exceptions for later +_dependency_exc_info = None +try: + import mammoth +except ImportError: + # Preserve the error and stack trace for later + _dependency_exc_info = sys.exc_info() class DocxConverter(HtmlConverter): @@ -26,6 +36,19 @@ class DocxConverter(HtmlConverter): if extension.lower() != ".docx": return None + # Load the dependencies + if _dependency_exc_info is not None: + raise MissingDependencyException( + f"""{type(self).__name__} recognized the input as a potential .docx file, but the dependencies needed to read .docx files have not been installed. To resolve this error, include the optional dependency [docx] or [all] when installing MarkItDown. For example: + +* pip install markitdown[docx] +* pip install markitdown[all] +* pip install markitdown[pptx, docx, ...] +* etc.""" + ) from _dependency_exc_info[1].with_traceback( + _dependency_exc_info[2] + ) # Restore the original traceback + result = None with open(local_path, "rb") as docx_file: style_map = kwargs.get("style_map", None) diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py index 34d4fbd..7d72c1b 100644 --- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py @@ -67,7 +67,7 @@ class PptxConverter(HtmlConverter): # Load the dependencies if _dependency_exc_info is not None: raise MissingDependencyException( - f"""{type(self).__name__} recognized the input as a .pptx file, but the dependencies needed to read .pptx files have not been installed. To resolve this error, include the optional dependency [pptx] or [all] when installing MarkItDown. For example: + f"""{type(self).__name__} recognized the input as a potential .pptx file, but the dependencies needed to read .pptx files have not been installed. To resolve this error, include the optional dependency [pptx] or [all] when installing MarkItDown. For example: * pip install markitdown[pptx] * pip install markitdown[all]