From 8362df8e60e09711fe475d2554c8b137c14ef7a8 Mon Sep 17 00:00:00 2001 From: Adam Fourney Date: Fri, 28 Feb 2025 21:21:17 -0800 Subject: [PATCH] Added xlsx and xls --- packages/markitdown/pyproject.toml | 10 +++-- .../converters/_plain_text_converter.py | 11 +++++ .../markitdown/converters/_xlsx_converter.py | 45 ++++++++++++++++++- 3 files changed, 60 insertions(+), 6 deletions(-) diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml index 5e56642..254fccb 100644 --- a/packages/markitdown/pyproject.toml +++ b/packages/markitdown/pyproject.toml @@ -28,9 +28,6 @@ dependencies = [ "requests", "markdownify~=0.14.1", "numpy", - "pandas", - "openpyxl", - "xlrd", "pdfminer.six", "puremagic", "pydub", @@ -47,10 +44,15 @@ dependencies = [ [project.optional-dependencies] all = [ "python-pptx", - "mammoth" + "mammoth", + "pandas", + "openpyxl", + "xlrd" ] pptx = ["python-pptx"] docx = ["mammoth"] +xlsx = ["pandas", "openpyxl"] +xls = ["pandas", "xlrd"] [project.urls] Documentation = "https://github.com/microsoft/markitdown#readme" diff --git a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py index 75f74a8..b4c9282 100644 --- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py +++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py @@ -6,6 +6,13 @@ from typing import Any, Union from ._base import DocumentConverter, DocumentConverterResult +# Mimetypes to ignore (commonly confused extensions) +IGNORE_MIMETYPES = [ + "text/vnd.in3d.spot", # .spo which is confused with xls, doc, etc. + "text/vnd.graphviz", # .dot which is confused with xls, doc, etc. 
+] + + class PlainTextConverter(DocumentConverter): """Anything with content type text/plain""" @@ -22,6 +29,10 @@ class PlainTextConverter(DocumentConverter): "__placeholder" + kwargs.get("file_extension", "") ) + # Ignore common false positives + if content_type in IGNORE_MIMETYPES: + content_type = None + # Only accept text files if content_type is None: return None diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py index 2bdfd5d..61c7e2c 100644 --- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py @@ -1,9 +1,26 @@ -from typing import Union +import sys -import pandas as pd +from typing import Union from ._base import DocumentConverter, DocumentConverterResult from ._html_converter import HtmlConverter +from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE + +# Try loading optional (but in this case, required) dependencies +# Save reporting of any exceptions for later +_xlsx_dependency_exc_info = None +try: + import pandas as pd + import openpyxl +except ImportError: + _xlsx_dependency_exc_info = sys.exc_info() + +_xls_dependency_exc_info = None +try: + import pandas as pd + import xlrd +except ImportError: + _xls_dependency_exc_info = sys.exc_info() class XlsxConverter(HtmlConverter): @@ -22,6 +39,18 @@ class XlsxConverter(HtmlConverter): if extension.lower() != ".xlsx": return None + # Load the dependencies + if _xlsx_dependency_exc_info is not None: + raise MissingDependencyException( + MISSING_DEPENDENCY_MESSAGE.format( + converter=type(self).__name__, + extension=".xlsx", + feature="xlsx", + ) + ) from _xlsx_dependency_exc_info[1].with_traceback( + _xlsx_dependency_exc_info[2] + ) # Restore the original traceback + sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl") md_content = "" for s in sheets: @@ -46,6 +75,18 @@ class 
XlsConverter(HtmlConverter): if extension.lower() != ".xls": return None + # Load the dependencies + if _xls_dependency_exc_info is not None: + raise MissingDependencyException( + MISSING_DEPENDENCY_MESSAGE.format( + converter=type(self).__name__, + extension=".xls", + feature="xls", + ) + ) from _xls_dependency_exc_info[1].with_traceback( + _xls_dependency_exc_info[2] + ) # Restore the original traceback + sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd") md_content = "" for s in sheets: