Added xlsx and xls

This commit is contained in:
Adam Fourney 2025-02-28 21:21:17 -08:00
parent e5dc512948
commit 8362df8e60
3 changed files with 60 additions and 6 deletions

View file

@ -28,9 +28,6 @@ dependencies = [
"requests",
"markdownify~=0.14.1",
"numpy",
"pandas",
"openpyxl",
"xlrd",
"pdfminer.six",
"puremagic",
"pydub",
@ -47,10 +44,15 @@ dependencies = [
[project.optional-dependencies]
all = [
"python-pptx",
"mammoth"
"mammoth",
"pandas",
"openpyxl",
"xlrd"
]
pptx = ["python-pptx"]
docx = ["mammoth"]
xlsx = ["pandas", "openpyxl"]
xls = ["pandas", "xlrd"]
[project.urls]
Documentation = "https://github.com/microsoft/markitdown#readme"

View file

@ -6,6 +6,13 @@ from typing import Any, Union
from ._base import DocumentConverter, DocumentConverterResult
# Mimetypes to ignore (commonly confused extensions)
IGNORE_MIMETYPES = [
"text/vnd.in3d.spot", # .spo which is confused with xls, doc, etc.
"text/vnd.graphviz", # .dot which is confused with xls, doc, etc.
]
class PlainTextConverter(DocumentConverter):
"""Anything with content type text/plain"""
@ -22,6 +29,10 @@ class PlainTextConverter(DocumentConverter):
"__placeholder" + kwargs.get("file_extension", "")
)
# Ignore common false positives
if content_type in IGNORE_MIMETYPES:
content_type = None
# Only accept text files
if content_type is None:
return None

View file

@ -1,9 +1,26 @@
from typing import Union
import sys
import pandas as pd
from typing import Union
from ._base import DocumentConverter, DocumentConverterResult
from ._html_converter import HtmlConverter
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_xlsx_dependency_exc_info = None
try:
import pandas as pd
import openpyxl
except ImportError:
_xlsx_dependency_exc_info = sys.exc_info()
_xls_dependency_exc_info = None
try:
import pandas as pd
import xlrd
except ImportError:
_xls_dependency_exc_info = sys.exc_info()
class XlsxConverter(HtmlConverter):
@ -22,6 +39,18 @@ class XlsxConverter(HtmlConverter):
if extension.lower() != ".xlsx":
return None
# Load the dependencies
if _xlsx_dependency_exc_info is not None:
raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format(
converter=type(self).__name__,
extension=".xlsx",
feature="xlsx",
)
) from _xlsx_dependency_exc_info[1].with_traceback(
_xlsx_dependency_exc_info[2]
) # Restore the original traceback
sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl")
md_content = ""
for s in sheets:
@ -46,6 +75,18 @@ class XlsConverter(HtmlConverter):
if extension.lower() != ".xls":
return None
# Load the dependencies
if _xls_dependency_exc_info is not None:
raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format(
converter=type(self).__name__,
extension=".xls",
feature="xls",
)
) from _xls_dependency_exc_info[1].with_traceback(
_xls_dependency_exc_info[2]
) # Restore the original traceback
sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
md_content = ""
for s in sheets: