Added xlsx and xls
This commit is contained in:
parent
e5dc512948
commit
8362df8e60
3 changed files with 60 additions and 6 deletions
|
|
@ -28,9 +28,6 @@ dependencies = [
|
|||
"requests",
|
||||
"markdownify~=0.14.1",
|
||||
"numpy",
|
||||
"pandas",
|
||||
"openpyxl",
|
||||
"xlrd",
|
||||
"pdfminer.six",
|
||||
"puremagic",
|
||||
"pydub",
|
||||
|
|
@ -47,10 +44,15 @@ dependencies = [
|
|||
[project.optional-dependencies]
|
||||
all = [
|
||||
"python-pptx",
|
||||
"mammoth"
|
||||
"mammoth",
|
||||
"pandas",
|
||||
"openpyxl",
|
||||
"xlrd"
|
||||
]
|
||||
pptx = ["python-pptx"]
|
||||
docx = ["mammoth"]
|
||||
xlsx = ["pandas", "openpyxl"]
|
||||
xls = ["pandas", "xlrd"]
|
||||
|
||||
[project.urls]
|
||||
Documentation = "https://github.com/microsoft/markitdown#readme"
|
||||
|
|
|
|||
|
|
@ -6,6 +6,13 @@ from typing import Any, Union
|
|||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
|
||||
|
||||
# Mimetypes to ignore (commonly confused extensions)
|
||||
IGNORE_MIMETYPES = [
|
||||
"text/vnd.in3d.spot", # .spo which is confused with xls, doc, etc.
|
||||
"text/vnd.graphviz", # .dot which is confused with xls, doc, etc.
|
||||
]
|
||||
|
||||
|
||||
class PlainTextConverter(DocumentConverter):
|
||||
"""Anything with content type text/plain"""
|
||||
|
||||
|
|
@ -22,6 +29,10 @@ class PlainTextConverter(DocumentConverter):
|
|||
"__placeholder" + kwargs.get("file_extension", "")
|
||||
)
|
||||
|
||||
# Ignore common false positives
|
||||
if content_type in IGNORE_MIMETYPES:
|
||||
content_type = None
|
||||
|
||||
# Only accept text files
|
||||
if content_type is None:
|
||||
return None
|
||||
|
|
|
|||
|
|
@ -1,9 +1,26 @@
|
|||
from typing import Union
|
||||
import sys
|
||||
|
||||
import pandas as pd
|
||||
from typing import Union
|
||||
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
from ._html_converter import HtmlConverter
|
||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
|
||||
# Try loading optional (but in this case, required) dependencies
|
||||
# Save reporting of any exceptions for later
|
||||
_xlsx_dependency_exc_info = None
|
||||
try:
|
||||
import pandas as pd
|
||||
import openpyxl
|
||||
except ImportError:
|
||||
_xlsx_dependency_exc_info = sys.exc_info()
|
||||
|
||||
_xls_dependency_exc_info = None
|
||||
try:
|
||||
import pandas as pd
|
||||
import xlrd
|
||||
except ImportError:
|
||||
_xls_dependency_exc_info = sys.exc_info()
|
||||
|
||||
|
||||
class XlsxConverter(HtmlConverter):
|
||||
|
|
@ -22,6 +39,18 @@ class XlsxConverter(HtmlConverter):
|
|||
if extension.lower() != ".xlsx":
|
||||
return None
|
||||
|
||||
# Load the dependencies
|
||||
if _xlsx_dependency_exc_info is not None:
|
||||
raise MissingDependencyException(
|
||||
MISSING_DEPENDENCY_MESSAGE.format(
|
||||
converter=type(self).__name__,
|
||||
extension=".xlsx",
|
||||
feature="xlsx",
|
||||
)
|
||||
) from _xlsx_dependency_exc_info[1].with_traceback(
|
||||
_xlsx_dependency_exc_info[2]
|
||||
) # Restore the original traceback
|
||||
|
||||
sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl")
|
||||
md_content = ""
|
||||
for s in sheets:
|
||||
|
|
@ -46,6 +75,18 @@ class XlsConverter(HtmlConverter):
|
|||
if extension.lower() != ".xls":
|
||||
return None
|
||||
|
||||
# Load the dependencies
|
||||
if _xls_dependency_exc_info is not None:
|
||||
raise MissingDependencyException(
|
||||
MISSING_DEPENDENCY_MESSAGE.format(
|
||||
converter=type(self).__name__,
|
||||
extension=".xls",
|
||||
feature="xls",
|
||||
)
|
||||
) from _xls_dependency_exc_info[1].with_traceback(
|
||||
_xls_dependency_exc_info[2]
|
||||
) # Restore the original traceback
|
||||
|
||||
sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
|
||||
md_content = ""
|
||||
for s in sheets:
|
||||
|
|
|
|||
Loading…
Reference in a new issue