support xlsx and xls
This commit is contained in:
parent
f33a0ed922
commit
c47cd0deec
5 changed files with 18 additions and 136 deletions
|
|
@ -32,7 +32,6 @@ dependencies = [
|
|||
"python-pptx",
|
||||
"mammoth",
|
||||
"pandas",
|
||||
"openpyxl",
|
||||
"xlrd",
|
||||
"lxml",
|
||||
"olefile",
|
||||
|
|
|
|||
|
|
@ -58,6 +58,10 @@ class MarkItUp:
|
|||
return PdfConverter(config=self.config).convert(stream, stream_info), stream_info
|
||||
case "audio":
|
||||
return AudioConverter(config=self.config).convert(stream, stream_info), stream_info
|
||||
case "xlsx":
|
||||
return XlsxConverter().convert(stream, stream_info), stream_info
|
||||
case "xls":
|
||||
return XlsConverter().convert(stream, stream_info), stream_info
|
||||
except FailedConversionAttempt:
|
||||
raise FileConversionException(
|
||||
f"Failed to convert file of type {stream_info.magic_type}")
|
||||
|
|
|
|||
|
|
@ -75,21 +75,18 @@ def detect_file_types(file_dict):
|
|||
category = "audio"
|
||||
elif magic_type.startswith("video/"):
|
||||
category = "video"
|
||||
elif (
|
||||
magic_type.startswith("application/vnd.ms-excel")
|
||||
or magic_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||
):
|
||||
category = "xls"
|
||||
elif (
|
||||
magic_type.startswith("application/vnd.ms-powerpoint")
|
||||
or magic_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
||||
):
|
||||
category = "ppt"
|
||||
elif (
|
||||
magic_type.startswith("application/msword")
|
||||
or magic_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
):
|
||||
category = "doc"
|
||||
elif magic_type.startswith("application/vnd.ms-excel"):
|
||||
category = 'xls'
|
||||
elif magic_type.startswith("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"):
|
||||
category = "xlsx"
|
||||
elif magic_type.startswith("application/vnd.ms-powerpoint"):
|
||||
category = 'ppt'
|
||||
elif magic_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
|
||||
category = "pptx"
|
||||
elif magic_type.startswith("application/msword"):
|
||||
category = 'doc'
|
||||
elif magic_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
|
||||
category = "docx"
|
||||
elif magic_type == "application/pdf":
|
||||
category = "pdf"
|
||||
elif magic_type.startswith("text/"):
|
||||
|
|
@ -97,11 +94,8 @@ def detect_file_types(file_dict):
|
|||
else:
|
||||
category = "other"
|
||||
|
||||
# Store the results
|
||||
result[filename] = StreamInfo(magic_type=magic_type, category=category)
|
||||
|
||||
# Reset stream position
|
||||
byte_stream.seek(original_position)
|
||||
result[filename] = StreamInfo(magic_type=magic_type, category=category)
|
||||
|
||||
return result
|
||||
|
||||
|
|
|
|||
|
|
@ -1,36 +1,8 @@
|
|||
import sys
|
||||
from typing import BinaryIO, Any
|
||||
from ._html_converter import HtmlConverter
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
from .._schemas import StreamInfo
|
||||
|
||||
# Try loading optional (but in this case, required) dependencies
|
||||
# Save reporting of any exceptions for later
|
||||
_xlsx_dependency_exc_info = None
|
||||
try:
|
||||
import pandas as pd
|
||||
import openpyxl
|
||||
except ImportError:
|
||||
_xlsx_dependency_exc_info = sys.exc_info()
|
||||
|
||||
_xls_dependency_exc_info = None
|
||||
try:
|
||||
import pandas as pd
|
||||
import xlrd
|
||||
except ImportError:
|
||||
_xls_dependency_exc_info = sys.exc_info()
|
||||
|
||||
ACCEPTED_XLSX_MIME_TYPE_PREFIXES = [
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||
]
|
||||
ACCEPTED_XLSX_FILE_EXTENSIONS = [".xlsx"]
|
||||
|
||||
ACCEPTED_XLS_MIME_TYPE_PREFIXES = [
|
||||
"application/vnd.ms-excel",
|
||||
"application/excel",
|
||||
]
|
||||
ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]
|
||||
|
||||
|
||||
class XlsxConverter(DocumentConverter):
|
||||
|
|
@ -39,47 +11,14 @@ class XlsxConverter(DocumentConverter):
|
|||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self._html_converter = HtmlConverter()
|
||||
|
||||
def accepts(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
stream_info: StreamInfo,
|
||||
**kwargs: Any, # Options to pass to the converter
|
||||
) -> bool:
|
||||
mimetype = (stream_info.mimetype or "").lower()
|
||||
extension = (stream_info.extension or "").lower()
|
||||
|
||||
if extension in ACCEPTED_XLSX_FILE_EXTENSIONS:
|
||||
return True
|
||||
|
||||
for prefix in ACCEPTED_XLSX_MIME_TYPE_PREFIXES:
|
||||
if mimetype.startswith(prefix):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def convert(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
stream_info: StreamInfo,
|
||||
**kwargs: Any, # Options to pass to the converter
|
||||
) -> DocumentConverterResult:
|
||||
# Check the dependencies
|
||||
if _xlsx_dependency_exc_info is not None:
|
||||
raise MissingDependencyException(
|
||||
MISSING_DEPENDENCY_MESSAGE.format(
|
||||
converter=type(self).__name__,
|
||||
extension=".xlsx",
|
||||
feature="xlsx",
|
||||
)
|
||||
) from _xlsx_dependency_exc_info[
|
||||
1
|
||||
].with_traceback( # type: ignore[union-attr]
|
||||
_xlsx_dependency_exc_info[2]
|
||||
)
|
||||
|
||||
sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
|
||||
md_content = ""
|
||||
for s in sheets:
|
||||
|
|
@ -104,43 +43,12 @@ class XlsConverter(DocumentConverter):
|
|||
super().__init__()
|
||||
self._html_converter = HtmlConverter()
|
||||
|
||||
def accepts(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
stream_info: StreamInfo,
|
||||
**kwargs: Any, # Options to pass to the converter
|
||||
) -> bool:
|
||||
mimetype = (stream_info.mimetype or "").lower()
|
||||
extension = (stream_info.extension or "").lower()
|
||||
|
||||
if extension in ACCEPTED_XLS_FILE_EXTENSIONS:
|
||||
return True
|
||||
|
||||
for prefix in ACCEPTED_XLS_MIME_TYPE_PREFIXES:
|
||||
if mimetype.startswith(prefix):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def convert(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
stream_info: StreamInfo,
|
||||
**kwargs: Any, # Options to pass to the converter
|
||||
) -> DocumentConverterResult:
|
||||
# Load the dependencies
|
||||
if _xls_dependency_exc_info is not None:
|
||||
raise MissingDependencyException(
|
||||
MISSING_DEPENDENCY_MESSAGE.format(
|
||||
converter=type(self).__name__,
|
||||
extension=".xls",
|
||||
feature="xls",
|
||||
)
|
||||
) from _xls_dependency_exc_info[
|
||||
1
|
||||
].with_traceback( # type: ignore[union-attr]
|
||||
_xls_dependency_exc_info[2]
|
||||
)
|
||||
|
||||
sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd")
|
||||
md_content = ""
|
||||
|
|
|
|||
|
|
@ -173,15 +173,6 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934", size = 46018 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "et-xmlfile"
|
||||
version = "2.0.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "flatbuffers"
|
||||
version = "25.2.10"
|
||||
|
|
@ -348,7 +339,6 @@ dependencies = [
|
|||
{ name = "mammoth" },
|
||||
{ name = "markdownify" },
|
||||
{ name = "olefile" },
|
||||
{ name = "openpyxl" },
|
||||
{ name = "pandas" },
|
||||
{ name = "pydub" },
|
||||
{ name = "pymupdf" },
|
||||
|
|
@ -368,7 +358,6 @@ requires-dist = [
|
|||
{ name = "mammoth" },
|
||||
{ name = "markdownify" },
|
||||
{ name = "olefile" },
|
||||
{ name = "openpyxl" },
|
||||
{ name = "pandas" },
|
||||
{ name = "pydub" },
|
||||
{ name = "pymupdf", specifier = ">=1.25.5" },
|
||||
|
|
@ -492,18 +481,6 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/3a/72/5ff85c540fd6a465610ce47e4cee8fccb472952fc1d589112f51ae2520a5/onnxruntime-1.21.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5c9e4571ff5b2a5d377d414bc85cd9450ba233a9a92f766493874f1093976453", size = 15990556 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "openpyxl"
|
||||
version = "3.1.5"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "et-xmlfile" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "packaging"
|
||||
version = "25.0"
|
||||
|
|
|
|||
Loading…
Reference in a new issue