support xlsx and xls
This commit is contained in:
parent
f33a0ed922
commit
c47cd0deec
5 changed files with 18 additions and 136 deletions
|
|
@ -32,7 +32,6 @@ dependencies = [
|
||||||
"python-pptx",
|
"python-pptx",
|
||||||
"mammoth",
|
"mammoth",
|
||||||
"pandas",
|
"pandas",
|
||||||
"openpyxl",
|
|
||||||
"xlrd",
|
"xlrd",
|
||||||
"lxml",
|
"lxml",
|
||||||
"olefile",
|
"olefile",
|
||||||
|
|
|
||||||
|
|
@ -58,6 +58,10 @@ class MarkItUp:
|
||||||
return PdfConverter(config=self.config).convert(stream, stream_info), stream_info
|
return PdfConverter(config=self.config).convert(stream, stream_info), stream_info
|
||||||
case "audio":
|
case "audio":
|
||||||
return AudioConverter(config=self.config).convert(stream, stream_info), stream_info
|
return AudioConverter(config=self.config).convert(stream, stream_info), stream_info
|
||||||
|
case "xlsx":
|
||||||
|
return XlsxConverter().convert(stream, stream_info), stream_info
|
||||||
|
case "xls":
|
||||||
|
return XlsConverter().convert(stream, stream_info), stream_info
|
||||||
except FailedConversionAttempt:
|
except FailedConversionAttempt:
|
||||||
raise FileConversionException(
|
raise FileConversionException(
|
||||||
f"Failed to convert file of type {stream_info.magic_type}")
|
f"Failed to convert file of type {stream_info.magic_type}")
|
||||||
|
|
|
||||||
|
|
@ -75,21 +75,18 @@ def detect_file_types(file_dict):
|
||||||
category = "audio"
|
category = "audio"
|
||||||
elif magic_type.startswith("video/"):
|
elif magic_type.startswith("video/"):
|
||||||
category = "video"
|
category = "video"
|
||||||
elif (
|
elif magic_type.startswith("application/vnd.ms-excel"):
|
||||||
magic_type.startswith("application/vnd.ms-excel")
|
category = 'xls'
|
||||||
or magic_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
elif magic_type.startswith("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"):
|
||||||
):
|
category = "xlsx"
|
||||||
category = "xls"
|
elif magic_type.startswith("application/vnd.ms-powerpoint"):
|
||||||
elif (
|
category = 'ppt'
|
||||||
magic_type.startswith("application/vnd.ms-powerpoint")
|
elif magic_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
|
||||||
or magic_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
category = "pptx"
|
||||||
):
|
elif magic_type.startswith("application/msword"):
|
||||||
category = "ppt"
|
category = 'doc'
|
||||||
elif (
|
elif magic_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
|
||||||
magic_type.startswith("application/msword")
|
category = "docx"
|
||||||
or magic_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
|
||||||
):
|
|
||||||
category = "doc"
|
|
||||||
elif magic_type == "application/pdf":
|
elif magic_type == "application/pdf":
|
||||||
category = "pdf"
|
category = "pdf"
|
||||||
elif magic_type.startswith("text/"):
|
elif magic_type.startswith("text/"):
|
||||||
|
|
@ -97,11 +94,8 @@ def detect_file_types(file_dict):
|
||||||
else:
|
else:
|
||||||
category = "other"
|
category = "other"
|
||||||
|
|
||||||
# Store the results
|
|
||||||
result[filename] = StreamInfo(magic_type=magic_type, category=category)
|
|
||||||
|
|
||||||
# Reset stream position
|
|
||||||
byte_stream.seek(original_position)
|
byte_stream.seek(original_position)
|
||||||
|
result[filename] = StreamInfo(magic_type=magic_type, category=category)
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,36 +1,8 @@
|
||||||
import sys
|
|
||||||
from typing import BinaryIO, Any
|
from typing import BinaryIO, Any
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
|
||||||
from .._schemas import StreamInfo
|
from .._schemas import StreamInfo
|
||||||
|
import pandas as pd
|
||||||
# Try loading optional (but in this case, required) dependencies
|
|
||||||
# Save reporting of any exceptions for later
|
|
||||||
_xlsx_dependency_exc_info = None
|
|
||||||
try:
|
|
||||||
import pandas as pd
|
|
||||||
import openpyxl
|
|
||||||
except ImportError:
|
|
||||||
_xlsx_dependency_exc_info = sys.exc_info()
|
|
||||||
|
|
||||||
_xls_dependency_exc_info = None
|
|
||||||
try:
|
|
||||||
import pandas as pd
|
|
||||||
import xlrd
|
|
||||||
except ImportError:
|
|
||||||
_xls_dependency_exc_info = sys.exc_info()
|
|
||||||
|
|
||||||
ACCEPTED_XLSX_MIME_TYPE_PREFIXES = [
|
|
||||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
|
||||||
]
|
|
||||||
ACCEPTED_XLSX_FILE_EXTENSIONS = [".xlsx"]
|
|
||||||
|
|
||||||
ACCEPTED_XLS_MIME_TYPE_PREFIXES = [
|
|
||||||
"application/vnd.ms-excel",
|
|
||||||
"application/excel",
|
|
||||||
]
|
|
||||||
ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]
|
|
||||||
|
|
||||||
|
|
||||||
class XlsxConverter(DocumentConverter):
|
class XlsxConverter(DocumentConverter):
|
||||||
|
|
@ -39,47 +11,14 @@ class XlsxConverter(DocumentConverter):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
|
||||||
self._html_converter = HtmlConverter()
|
self._html_converter = HtmlConverter()
|
||||||
|
|
||||||
def accepts(
|
|
||||||
self,
|
|
||||||
file_stream: BinaryIO,
|
|
||||||
stream_info: StreamInfo,
|
|
||||||
**kwargs: Any, # Options to pass to the converter
|
|
||||||
) -> bool:
|
|
||||||
mimetype = (stream_info.mimetype or "").lower()
|
|
||||||
extension = (stream_info.extension or "").lower()
|
|
||||||
|
|
||||||
if extension in ACCEPTED_XLSX_FILE_EXTENSIONS:
|
|
||||||
return True
|
|
||||||
|
|
||||||
for prefix in ACCEPTED_XLSX_MIME_TYPE_PREFIXES:
|
|
||||||
if mimetype.startswith(prefix):
|
|
||||||
return True
|
|
||||||
|
|
||||||
return False
|
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self,
|
self,
|
||||||
file_stream: BinaryIO,
|
file_stream: BinaryIO,
|
||||||
stream_info: StreamInfo,
|
stream_info: StreamInfo,
|
||||||
**kwargs: Any, # Options to pass to the converter
|
**kwargs: Any, # Options to pass to the converter
|
||||||
) -> DocumentConverterResult:
|
) -> DocumentConverterResult:
|
||||||
# Check the dependencies
|
|
||||||
if _xlsx_dependency_exc_info is not None:
|
|
||||||
raise MissingDependencyException(
|
|
||||||
MISSING_DEPENDENCY_MESSAGE.format(
|
|
||||||
converter=type(self).__name__,
|
|
||||||
extension=".xlsx",
|
|
||||||
feature="xlsx",
|
|
||||||
)
|
|
||||||
) from _xlsx_dependency_exc_info[
|
|
||||||
1
|
|
||||||
].with_traceback( # type: ignore[union-attr]
|
|
||||||
_xlsx_dependency_exc_info[2]
|
|
||||||
)
|
|
||||||
|
|
||||||
sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
|
sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
|
||||||
md_content = ""
|
md_content = ""
|
||||||
for s in sheets:
|
for s in sheets:
|
||||||
|
|
@ -104,43 +43,12 @@ class XlsConverter(DocumentConverter):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self._html_converter = HtmlConverter()
|
self._html_converter = HtmlConverter()
|
||||||
|
|
||||||
def accepts(
|
|
||||||
self,
|
|
||||||
file_stream: BinaryIO,
|
|
||||||
stream_info: StreamInfo,
|
|
||||||
**kwargs: Any, # Options to pass to the converter
|
|
||||||
) -> bool:
|
|
||||||
mimetype = (stream_info.mimetype or "").lower()
|
|
||||||
extension = (stream_info.extension or "").lower()
|
|
||||||
|
|
||||||
if extension in ACCEPTED_XLS_FILE_EXTENSIONS:
|
|
||||||
return True
|
|
||||||
|
|
||||||
for prefix in ACCEPTED_XLS_MIME_TYPE_PREFIXES:
|
|
||||||
if mimetype.startswith(prefix):
|
|
||||||
return True
|
|
||||||
|
|
||||||
return False
|
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self,
|
self,
|
||||||
file_stream: BinaryIO,
|
file_stream: BinaryIO,
|
||||||
stream_info: StreamInfo,
|
stream_info: StreamInfo,
|
||||||
**kwargs: Any, # Options to pass to the converter
|
**kwargs: Any, # Options to pass to the converter
|
||||||
) -> DocumentConverterResult:
|
) -> DocumentConverterResult:
|
||||||
# Load the dependencies
|
|
||||||
if _xls_dependency_exc_info is not None:
|
|
||||||
raise MissingDependencyException(
|
|
||||||
MISSING_DEPENDENCY_MESSAGE.format(
|
|
||||||
converter=type(self).__name__,
|
|
||||||
extension=".xls",
|
|
||||||
feature="xls",
|
|
||||||
)
|
|
||||||
) from _xls_dependency_exc_info[
|
|
||||||
1
|
|
||||||
].with_traceback( # type: ignore[union-attr]
|
|
||||||
_xls_dependency_exc_info[2]
|
|
||||||
)
|
|
||||||
|
|
||||||
sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd")
|
sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd")
|
||||||
md_content = ""
|
md_content = ""
|
||||||
|
|
|
||||||
|
|
@ -173,15 +173,6 @@ wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934", size = 46018 },
|
{ url = "https://files.pythonhosted.org/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934", size = 46018 },
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "et-xmlfile"
|
|
||||||
version = "2.0.0"
|
|
||||||
source = { registry = "https://pypi.org/simple" }
|
|
||||||
sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234 }
|
|
||||||
wheels = [
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059 },
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "flatbuffers"
|
name = "flatbuffers"
|
||||||
version = "25.2.10"
|
version = "25.2.10"
|
||||||
|
|
@ -348,7 +339,6 @@ dependencies = [
|
||||||
{ name = "mammoth" },
|
{ name = "mammoth" },
|
||||||
{ name = "markdownify" },
|
{ name = "markdownify" },
|
||||||
{ name = "olefile" },
|
{ name = "olefile" },
|
||||||
{ name = "openpyxl" },
|
|
||||||
{ name = "pandas" },
|
{ name = "pandas" },
|
||||||
{ name = "pydub" },
|
{ name = "pydub" },
|
||||||
{ name = "pymupdf" },
|
{ name = "pymupdf" },
|
||||||
|
|
@ -368,7 +358,6 @@ requires-dist = [
|
||||||
{ name = "mammoth" },
|
{ name = "mammoth" },
|
||||||
{ name = "markdownify" },
|
{ name = "markdownify" },
|
||||||
{ name = "olefile" },
|
{ name = "olefile" },
|
||||||
{ name = "openpyxl" },
|
|
||||||
{ name = "pandas" },
|
{ name = "pandas" },
|
||||||
{ name = "pydub" },
|
{ name = "pydub" },
|
||||||
{ name = "pymupdf", specifier = ">=1.25.5" },
|
{ name = "pymupdf", specifier = ">=1.25.5" },
|
||||||
|
|
@ -492,18 +481,6 @@ wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/3a/72/5ff85c540fd6a465610ce47e4cee8fccb472952fc1d589112f51ae2520a5/onnxruntime-1.21.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5c9e4571ff5b2a5d377d414bc85cd9450ba233a9a92f766493874f1093976453", size = 15990556 },
|
{ url = "https://files.pythonhosted.org/packages/3a/72/5ff85c540fd6a465610ce47e4cee8fccb472952fc1d589112f51ae2520a5/onnxruntime-1.21.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5c9e4571ff5b2a5d377d414bc85cd9450ba233a9a92f766493874f1093976453", size = 15990556 },
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "openpyxl"
|
|
||||||
version = "3.1.5"
|
|
||||||
source = { registry = "https://pypi.org/simple" }
|
|
||||||
dependencies = [
|
|
||||||
{ name = "et-xmlfile" },
|
|
||||||
]
|
|
||||||
sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464 }
|
|
||||||
wheels = [
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910 },
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "packaging"
|
name = "packaging"
|
||||||
version = "25.0"
|
version = "25.0"
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue