Added xlsx and xls
This commit is contained in:
parent
e5dc512948
commit
8362df8e60
3 changed files with 60 additions and 6 deletions
|
|
@ -28,9 +28,6 @@ dependencies = [
|
||||||
"requests",
|
"requests",
|
||||||
"markdownify~=0.14.1",
|
"markdownify~=0.14.1",
|
||||||
"numpy",
|
"numpy",
|
||||||
"pandas",
|
|
||||||
"openpyxl",
|
|
||||||
"xlrd",
|
|
||||||
"pdfminer.six",
|
"pdfminer.six",
|
||||||
"puremagic",
|
"puremagic",
|
||||||
"pydub",
|
"pydub",
|
||||||
|
|
@ -47,10 +44,15 @@ dependencies = [
|
||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
all = [
|
all = [
|
||||||
"python-pptx",
|
"python-pptx",
|
||||||
"mammoth"
|
"mammoth",
|
||||||
|
"pandas",
|
||||||
|
"openpyxl",
|
||||||
|
"xlrd"
|
||||||
]
|
]
|
||||||
pptx = ["python-pptx"]
|
pptx = ["python-pptx"]
|
||||||
docx = ["mammoth"]
|
docx = ["mammoth"]
|
||||||
|
xlsx = ["pandas", "openpyxl"]
|
||||||
|
xls = ["pandas", "xlrd"]
|
||||||
|
|
||||||
[project.urls]
|
[project.urls]
|
||||||
Documentation = "https://github.com/microsoft/markitdown#readme"
|
Documentation = "https://github.com/microsoft/markitdown#readme"
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,13 @@ from typing import Any, Union
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
|
|
||||||
|
|
||||||
|
# Mimetypes to ignore (commonly confused extensions)
|
||||||
|
IGNORE_MIMETYPES = [
|
||||||
|
"text/vnd.in3d.spot", # .spo which is confused with xls, doc, etc.
|
||||||
|
"text/vnd.graphviz", # .dot which is confused with xls, doc, etc.
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
class PlainTextConverter(DocumentConverter):
|
class PlainTextConverter(DocumentConverter):
|
||||||
"""Anything with content type text/plain"""
|
"""Anything with content type text/plain"""
|
||||||
|
|
||||||
|
|
@ -22,6 +29,10 @@ class PlainTextConverter(DocumentConverter):
|
||||||
"__placeholder" + kwargs.get("file_extension", "")
|
"__placeholder" + kwargs.get("file_extension", "")
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Ignore common false positives
|
||||||
|
if content_type in IGNORE_MIMETYPES:
|
||||||
|
content_type = None
|
||||||
|
|
||||||
# Only accept text files
|
# Only accept text files
|
||||||
if content_type is None:
|
if content_type is None:
|
||||||
return None
|
return None
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,26 @@
|
||||||
from typing import Union
|
import sys
|
||||||
|
|
||||||
import pandas as pd
|
from typing import Union
|
||||||
|
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
|
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||||
|
|
||||||
|
# Try loading optional (but in this case, required) dependencies
|
||||||
|
# Save reporting of any exceptions for later
|
||||||
|
_xlsx_dependency_exc_info = None
|
||||||
|
try:
|
||||||
|
import pandas as pd
|
||||||
|
import openpyxl
|
||||||
|
except ImportError:
|
||||||
|
_xlsx_dependency_exc_info = sys.exc_info()
|
||||||
|
|
||||||
|
_xls_dependency_exc_info = None
|
||||||
|
try:
|
||||||
|
import pandas as pd
|
||||||
|
import xlrd
|
||||||
|
except ImportError:
|
||||||
|
_xls_dependency_exc_info = sys.exc_info()
|
||||||
|
|
||||||
|
|
||||||
class XlsxConverter(HtmlConverter):
|
class XlsxConverter(HtmlConverter):
|
||||||
|
|
@ -22,6 +39,18 @@ class XlsxConverter(HtmlConverter):
|
||||||
if extension.lower() != ".xlsx":
|
if extension.lower() != ".xlsx":
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
# Load the dependencies
|
||||||
|
if _xlsx_dependency_exc_info is not None:
|
||||||
|
raise MissingDependencyException(
|
||||||
|
MISSING_DEPENDENCY_MESSAGE.format(
|
||||||
|
converter=type(self).__name__,
|
||||||
|
extension=".xlsx",
|
||||||
|
feature="xlsx",
|
||||||
|
)
|
||||||
|
) from _xlsx_dependency_exc_info[1].with_traceback(
|
||||||
|
_xlsx_dependency_exc_info[2]
|
||||||
|
) # Restore the original traceback
|
||||||
|
|
||||||
sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl")
|
sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl")
|
||||||
md_content = ""
|
md_content = ""
|
||||||
for s in sheets:
|
for s in sheets:
|
||||||
|
|
@ -46,6 +75,18 @@ class XlsConverter(HtmlConverter):
|
||||||
if extension.lower() != ".xls":
|
if extension.lower() != ".xls":
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
# Load the dependencies
|
||||||
|
if _xls_dependency_exc_info is not None:
|
||||||
|
raise MissingDependencyException(
|
||||||
|
MISSING_DEPENDENCY_MESSAGE.format(
|
||||||
|
converter=type(self).__name__,
|
||||||
|
extension=".xls",
|
||||||
|
feature="xls",
|
||||||
|
)
|
||||||
|
) from _xls_dependency_exc_info[1].with_traceback(
|
||||||
|
_xls_dependency_exc_info[2]
|
||||||
|
) # Restore the original traceback
|
||||||
|
|
||||||
sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
|
sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
|
||||||
md_content = ""
|
md_content = ""
|
||||||
for s in sheets:
|
for s in sheets:
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue