Added xlsx and xls

This commit is contained in:
Adam Fourney 2025-02-28 21:21:17 -08:00
parent e5dc512948
commit 8362df8e60
3 changed files with 60 additions and 6 deletions

View file

@ -28,9 +28,6 @@ dependencies = [
"requests", "requests",
"markdownify~=0.14.1", "markdownify~=0.14.1",
"numpy", "numpy",
"pandas",
"openpyxl",
"xlrd",
"pdfminer.six", "pdfminer.six",
"puremagic", "puremagic",
"pydub", "pydub",
@ -47,10 +44,15 @@ dependencies = [
[project.optional-dependencies] [project.optional-dependencies]
all = [ all = [
"python-pptx", "python-pptx",
"mammoth" "mammoth",
"pandas",
"openpyxl",
"xlrd"
] ]
pptx = ["python-pptx"] pptx = ["python-pptx"]
docx = ["mammoth"] docx = ["mammoth"]
xlsx = ["pandas", "openpyxl"]
xls = ["pandas", "xlrd"]
[project.urls] [project.urls]
Documentation = "https://github.com/microsoft/markitdown#readme" Documentation = "https://github.com/microsoft/markitdown#readme"

View file

@ -6,6 +6,13 @@ from typing import Any, Union
from ._base import DocumentConverter, DocumentConverterResult from ._base import DocumentConverter, DocumentConverterResult
# Mimetypes to ignore (commonly confused extensions)
IGNORE_MIMETYPES = [
"text/vnd.in3d.spot", # .spo which is confused with xls, doc, etc.
"text/vnd.graphviz", # .dot which is confused with xls, doc, etc.
]
class PlainTextConverter(DocumentConverter): class PlainTextConverter(DocumentConverter):
"""Anything with content type text/plain""" """Anything with content type text/plain"""
@ -22,6 +29,10 @@ class PlainTextConverter(DocumentConverter):
"__placeholder" + kwargs.get("file_extension", "") "__placeholder" + kwargs.get("file_extension", "")
) )
# Ignore common false positives
if content_type in IGNORE_MIMETYPES:
content_type = None
# Only accept text files # Only accept text files
if content_type is None: if content_type is None:
return None return None

View file

@ -1,9 +1,26 @@
from typing import Union import sys
import pandas as pd from typing import Union
from ._base import DocumentConverter, DocumentConverterResult from ._base import DocumentConverter, DocumentConverterResult
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_xlsx_dependency_exc_info = None
try:
import pandas as pd
import openpyxl
except ImportError:
_xlsx_dependency_exc_info = sys.exc_info()
_xls_dependency_exc_info = None
try:
import pandas as pd
import xlrd
except ImportError:
_xls_dependency_exc_info = sys.exc_info()
class XlsxConverter(HtmlConverter): class XlsxConverter(HtmlConverter):
@ -22,6 +39,18 @@ class XlsxConverter(HtmlConverter):
if extension.lower() != ".xlsx": if extension.lower() != ".xlsx":
return None return None
# Check the dependencies: raise if the optional imports failed at module load
if _xlsx_dependency_exc_info is not None:
raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format(
converter=type(self).__name__,
extension=".xlsx",
feature="xlsx",
)
) from _xlsx_dependency_exc_info[1].with_traceback(
_xlsx_dependency_exc_info[2]
) # Restore the original traceback
sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl") sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl")
md_content = "" md_content = ""
for s in sheets: for s in sheets:
@ -46,6 +75,18 @@ class XlsConverter(HtmlConverter):
if extension.lower() != ".xls": if extension.lower() != ".xls":
return None return None
# Check the dependencies: raise if the optional imports failed at module load
if _xls_dependency_exc_info is not None:
raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format(
converter=type(self).__name__,
extension=".xls",
feature="xls",
)
) from _xls_dependency_exc_info[1].with_traceback(
_xls_dependency_exc_info[2]
) # Restore the original traceback
sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd") sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
md_content = "" md_content = ""
for s in sheets: for s in sheets: