support xlsx and xls

This commit is contained in:
rong-xyz 2025-04-22 10:17:22 +00:00
parent f33a0ed922
commit c47cd0deec
5 changed files with 18 additions and 136 deletions

View file

@ -32,7 +32,6 @@ dependencies = [
"python-pptx", "python-pptx",
"mammoth", "mammoth",
"pandas", "pandas",
"openpyxl",
"xlrd", "xlrd",
"lxml", "lxml",
"olefile", "olefile",

View file

@ -58,6 +58,10 @@ class MarkItUp:
return PdfConverter(config=self.config).convert(stream, stream_info), stream_info return PdfConverter(config=self.config).convert(stream, stream_info), stream_info
case "audio": case "audio":
return AudioConverter(config=self.config).convert(stream, stream_info), stream_info return AudioConverter(config=self.config).convert(stream, stream_info), stream_info
case "xlsx":
return XlsxConverter().convert(stream, stream_info), stream_info
case "xls":
return XlsConverter().convert(stream, stream_info), stream_info
except FailedConversionAttempt: except FailedConversionAttempt:
raise FileConversionException( raise FileConversionException(
f"Failed to convert file of type {stream_info.magic_type}") f"Failed to convert file of type {stream_info.magic_type}")

View file

@ -75,21 +75,18 @@ def detect_file_types(file_dict):
category = "audio" category = "audio"
elif magic_type.startswith("video/"): elif magic_type.startswith("video/"):
category = "video" category = "video"
elif ( elif magic_type.startswith("application/vnd.ms-excel"):
magic_type.startswith("application/vnd.ms-excel") category = 'xls'
or magic_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" elif magic_type.startswith("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"):
): category = "xlsx"
category = "xls" elif magic_type.startswith("application/vnd.ms-powerpoint"):
elif ( category = 'ppt'
magic_type.startswith("application/vnd.ms-powerpoint") elif magic_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
or magic_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation" category = "pptx"
): elif magic_type.startswith("application/msword"):
category = "ppt" category = 'doc'
elif ( elif magic_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
magic_type.startswith("application/msword") category = "docx"
or magic_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
):
category = "doc"
elif magic_type == "application/pdf": elif magic_type == "application/pdf":
category = "pdf" category = "pdf"
elif magic_type.startswith("text/"): elif magic_type.startswith("text/"):
@ -97,11 +94,8 @@ def detect_file_types(file_dict):
else: else:
category = "other" category = "other"
# Store the results
result[filename] = StreamInfo(magic_type=magic_type, category=category)
# Reset stream position
byte_stream.seek(original_position) byte_stream.seek(original_position)
result[filename] = StreamInfo(magic_type=magic_type, category=category)
return result return result

View file

@ -1,36 +1,8 @@
import sys
from typing import BinaryIO, Any from typing import BinaryIO, Any
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
from .._schemas import StreamInfo from .._schemas import StreamInfo
import pandas as pd
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_xlsx_dependency_exc_info = None
try:
import pandas as pd
import openpyxl
except ImportError:
_xlsx_dependency_exc_info = sys.exc_info()
_xls_dependency_exc_info = None
try:
import pandas as pd
import xlrd
except ImportError:
_xls_dependency_exc_info = sys.exc_info()
ACCEPTED_XLSX_MIME_TYPE_PREFIXES = [
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
]
ACCEPTED_XLSX_FILE_EXTENSIONS = [".xlsx"]
ACCEPTED_XLS_MIME_TYPE_PREFIXES = [
"application/vnd.ms-excel",
"application/excel",
]
ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]
class XlsxConverter(DocumentConverter): class XlsxConverter(DocumentConverter):
@ -39,47 +11,14 @@ class XlsxConverter(DocumentConverter):
""" """
def __init__(self): def __init__(self):
super().__init__()
self._html_converter = HtmlConverter() self._html_converter = HtmlConverter()
def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
if extension in ACCEPTED_XLSX_FILE_EXTENSIONS:
return True
for prefix in ACCEPTED_XLSX_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return True
return False
def convert( def convert(
self, self,
file_stream: BinaryIO, file_stream: BinaryIO,
stream_info: StreamInfo, stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter **kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult: ) -> DocumentConverterResult:
# Check the dependencies
if _xlsx_dependency_exc_info is not None:
raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format(
converter=type(self).__name__,
extension=".xlsx",
feature="xlsx",
)
) from _xlsx_dependency_exc_info[
1
].with_traceback( # type: ignore[union-attr]
_xlsx_dependency_exc_info[2]
)
sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl") sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
md_content = "" md_content = ""
for s in sheets: for s in sheets:
@ -104,43 +43,12 @@ class XlsConverter(DocumentConverter):
super().__init__() super().__init__()
self._html_converter = HtmlConverter() self._html_converter = HtmlConverter()
def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
if extension in ACCEPTED_XLS_FILE_EXTENSIONS:
return True
for prefix in ACCEPTED_XLS_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return True
return False
def convert( def convert(
self, self,
file_stream: BinaryIO, file_stream: BinaryIO,
stream_info: StreamInfo, stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter **kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult: ) -> DocumentConverterResult:
# Load the dependencies
if _xls_dependency_exc_info is not None:
raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format(
converter=type(self).__name__,
extension=".xls",
feature="xls",
)
) from _xls_dependency_exc_info[
1
].with_traceback( # type: ignore[union-attr]
_xls_dependency_exc_info[2]
)
sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd") sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd")
md_content = "" md_content = ""

View file

@ -173,15 +173,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934", size = 46018 }, { url = "https://files.pythonhosted.org/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934", size = 46018 },
] ]
[[package]]
name = "et-xmlfile"
version = "2.0.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059 },
]
[[package]] [[package]]
name = "flatbuffers" name = "flatbuffers"
version = "25.2.10" version = "25.2.10"
@ -348,7 +339,6 @@ dependencies = [
{ name = "mammoth" }, { name = "mammoth" },
{ name = "markdownify" }, { name = "markdownify" },
{ name = "olefile" }, { name = "olefile" },
{ name = "openpyxl" },
{ name = "pandas" }, { name = "pandas" },
{ name = "pydub" }, { name = "pydub" },
{ name = "pymupdf" }, { name = "pymupdf" },
@ -368,7 +358,6 @@ requires-dist = [
{ name = "mammoth" }, { name = "mammoth" },
{ name = "markdownify" }, { name = "markdownify" },
{ name = "olefile" }, { name = "olefile" },
{ name = "openpyxl" },
{ name = "pandas" }, { name = "pandas" },
{ name = "pydub" }, { name = "pydub" },
{ name = "pymupdf", specifier = ">=1.25.5" }, { name = "pymupdf", specifier = ">=1.25.5" },
@ -492,18 +481,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/3a/72/5ff85c540fd6a465610ce47e4cee8fccb472952fc1d589112f51ae2520a5/onnxruntime-1.21.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5c9e4571ff5b2a5d377d414bc85cd9450ba233a9a92f766493874f1093976453", size = 15990556 }, { url = "https://files.pythonhosted.org/packages/3a/72/5ff85c540fd6a465610ce47e4cee8fccb472952fc1d589112f51ae2520a5/onnxruntime-1.21.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5c9e4571ff5b2a5d377d414bc85cd9450ba233a9a92f766493874f1093976453", size = 15990556 },
] ]
[[package]]
name = "openpyxl"
version = "3.1.5"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "et-xmlfile" },
]
sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910 },
]
[[package]] [[package]]
name = "packaging" name = "packaging"
version = "25.0" version = "25.0"