Added xlsx and xls

2025-02-28 21:21:17 -08:00 · 2025-02-28 21:21:17 -08:00 · 8362df8e60
commit 8362df8e60
parent e5dc512948
3 changed files with 60 additions and 6 deletions
--- a/packages/markitdown/pyproject.toml
+++ b/packages/markitdown/pyproject.toml
@ -28,9 +28,6 @@ dependencies = [
  "requests",
  "markdownify~=0.14.1",
  "numpy",
  "pandas",
  "openpyxl",
  "xlrd",
  "pdfminer.six",
  "puremagic",
  "pydub",
@ -47,10 +44,15 @@ dependencies = [
 [project.optional-dependencies]
 all = [
  "python-pptx",
-  "mammoth"
+  "mammoth",
  "pandas",
  "openpyxl",
  "xlrd"
 ]
 pptx = ["python-pptx"]
 docx = ["mammoth"]
 xlsx = ["pandas", "openpyxl"]
 xls = ["pandas", "xlrd"]
 [project.urls]
 Documentation = "https://github.com/microsoft/markitdown#readme"
--- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
@ -6,6 +6,13 @@ from typing import Any, Union
 from ._base import DocumentConverter, DocumentConverterResult
 # Mimetypes to ignore (commonly confused extensions).
 # A guessed content type found in this list is reset to None by the
 # converter, so the file is declined instead of being read as plain text.
 IGNORE_MIMETYPES = [
    "text/vnd.in3d.spot",  # .spo which is confused with xls, doc, etc.
    "text/vnd.graphviz",  # .dot which is confused with xls, doc, etc.
 ]
 class PlainTextConverter(DocumentConverter):
    """Anything with content type text/plain"""
@ -22,6 +29,10 @@ class PlainTextConverter(DocumentConverter):
            "__placeholder" + kwargs.get("file_extension", "")
        )
        # Ignore common false positives
        if content_type in IGNORE_MIMETYPES:
            content_type = None
        # Only accept text files
        if content_type is None:
            return None
--- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@ -1,9 +1,26 @@
-from typing import Union
+import sys
-import pandas as pd
+from typing import Union
 from ._base import DocumentConverter, DocumentConverterResult
 from ._html_converter import HtmlConverter
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 # Try loading optional (but in this case, required) dependencies
 # Save reporting of any exceptions for later: the import failure is not
 # raised here, so this module can still be imported without the extras.
 # The converters check these variables and raise
 # MissingDependencyException, chained to the saved ImportError, only when
 # a matching file is actually converted.

 # Dependencies for .xlsx files (pandas with the openpyxl engine).
 _xlsx_dependency_exc_info = None
 try:
    import pandas as pd
    import openpyxl
 except ImportError:
    # Keep the (type, value, traceback) triple so the original traceback
    # can be restored when the error is reported.
    _xlsx_dependency_exc_info = sys.exc_info()

 # Dependencies for .xls files (pandas with the xlrd engine). Tracked
 # separately so a missing xlrd does not disable .xlsx conversion, and
 # vice versa.
 _xls_dependency_exc_info = None
 try:
    import pandas as pd
    import xlrd
 except ImportError:
    _xls_dependency_exc_info = sys.exc_info()
 class XlsxConverter(HtmlConverter):
@ -22,6 +39,18 @@ class XlsxConverter(HtmlConverter):
        if extension.lower() != ".xlsx":
            return None
        # Load the dependencies
        if _xlsx_dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".xlsx",
                    feature="xlsx",
                )
            ) from _xlsx_dependency_exc_info[1].with_traceback(
                _xlsx_dependency_exc_info[2]
            )  # Restore the original traceback
        sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl")
        md_content = ""
        for s in sheets:
@ -46,6 +75,18 @@ class XlsConverter(HtmlConverter):
        if extension.lower() != ".xls":
            return None
        # Load the dependencies
        if _xls_dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".xls",
                    feature="xls",
                )
            ) from _xls_dependency_exc_info[1].with_traceback(
                _xls_dependency_exc_info[2]
            )  # Restore the original traceback
        sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
        md_content = ""
        for s in sheets: