diff --git a/packages/markitup/pyproject.toml b/packages/markitup/pyproject.toml index cc2034f..6c132ba 100644 --- a/packages/markitup/pyproject.toml +++ b/packages/markitup/pyproject.toml @@ -32,7 +32,6 @@ dependencies = [ "python-pptx", "mammoth", "pandas", - "openpyxl", "xlrd", "lxml", "olefile", diff --git a/packages/markitup/src/markitup/_markitup.py b/packages/markitup/src/markitup/_markitup.py index 2cfb67a..ee1abf1 100644 --- a/packages/markitup/src/markitup/_markitup.py +++ b/packages/markitup/src/markitup/_markitup.py @@ -58,6 +58,10 @@ class MarkItUp: return PdfConverter(config=self.config).convert(stream, stream_info), stream_info case "audio": return AudioConverter(config=self.config).convert(stream, stream_info), stream_info + case "xlsx": + return XlsxConverter().convert(stream, stream_info), stream_info + case "xls": + return XlsConverter().convert(stream, stream_info), stream_info except FailedConversionAttempt: raise FileConversionException( f"Failed to convert file of type {stream_info.magic_type}") diff --git a/packages/markitup/src/markitup/converter_utils/utils.py b/packages/markitup/src/markitup/converter_utils/utils.py index 3c2cda5..e67c9b5 100644 --- a/packages/markitup/src/markitup/converter_utils/utils.py +++ b/packages/markitup/src/markitup/converter_utils/utils.py @@ -75,21 +75,18 @@ def detect_file_types(file_dict): category = "audio" elif magic_type.startswith("video/"): category = "video" - elif ( - magic_type.startswith("application/vnd.ms-excel") - or magic_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" - ): - category = "xls" - elif ( - magic_type.startswith("application/vnd.ms-powerpoint") - or magic_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation" - ): - category = "ppt" - elif ( - magic_type.startswith("application/msword") - or magic_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" - ): - category = "doc" + elif magic_type.startswith("application/vnd.ms-excel"): + category = 'xls' + elif magic_type.startswith("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"): + category = "xlsx" + elif magic_type.startswith("application/vnd.ms-powerpoint"): + category = 'ppt' + elif magic_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation": + category = "pptx" + elif magic_type.startswith("application/msword"): + category = 'doc' + elif magic_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document": + category = "docx" elif magic_type == "application/pdf": category = "pdf" elif magic_type.startswith("text/"): @@ -97,11 +94,8 @@ def detect_file_types(file_dict): else: category = "other" - # Store the results - result[filename] = StreamInfo(magic_type=magic_type, category=category) - - # Reset stream position byte_stream.seek(original_position) + result[filename] = StreamInfo(magic_type=magic_type, category=category) return result diff --git a/packages/markitup/src/markitup/converters/_xlsx_converter.py b/packages/markitup/src/markitup/converters/_xlsx_converter.py index 8769fe0..f7e9879 100644 --- a/packages/markitup/src/markitup/converters/_xlsx_converter.py +++ b/packages/markitup/src/markitup/converters/_xlsx_converter.py @@ -1,36 +1,8 @@ -import sys from typing import BinaryIO, Any from ._html_converter import HtmlConverter from .._base_converter import DocumentConverter, DocumentConverterResult -from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE from .._schemas import StreamInfo - -# Try loading optional (but in this case, required) dependencies -# Save reporting of any exceptions for later -_xlsx_dependency_exc_info = None -try: - import pandas as pd - import openpyxl -except ImportError: - _xlsx_dependency_exc_info = sys.exc_info() - -_xls_dependency_exc_info = None -try: - import pandas as pd - import xlrd -except ImportError: - _xls_dependency_exc_info = sys.exc_info() - -ACCEPTED_XLSX_MIME_TYPE_PREFIXES = [ - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" -] -ACCEPTED_XLSX_FILE_EXTENSIONS = [".xlsx"] - -ACCEPTED_XLS_MIME_TYPE_PREFIXES = [ - "application/vnd.ms-excel", - "application/excel", -] -ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"] +import pandas as pd class XlsxConverter(DocumentConverter): @@ -39,47 +11,14 @@ class XlsxConverter(DocumentConverter): """ def __init__(self): - super().__init__() self._html_converter = HtmlConverter() - def accepts( - self, - file_stream: BinaryIO, - stream_info: StreamInfo, - **kwargs: Any, # Options to pass to the converter - ) -> bool: - mimetype = (stream_info.mimetype or "").lower() - extension = (stream_info.extension or "").lower() - - if extension in ACCEPTED_XLSX_FILE_EXTENSIONS: - return True - - for prefix in ACCEPTED_XLSX_MIME_TYPE_PREFIXES: - if mimetype.startswith(prefix): - return True - - return False - def convert( self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: - # Check the dependencies - if _xlsx_dependency_exc_info is not None: - raise MissingDependencyException( - MISSING_DEPENDENCY_MESSAGE.format( - converter=type(self).__name__, - extension=".xlsx", - feature="xlsx", - ) - ) from _xlsx_dependency_exc_info[ - 1 - ].with_traceback( # type: ignore[union-attr] - _xlsx_dependency_exc_info[2] - ) - sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl") md_content = "" for s in sheets: @@ -104,43 +43,12 @@ class XlsConverter(DocumentConverter): super().__init__() self._html_converter = HtmlConverter() - def accepts( - self, - file_stream: BinaryIO, - stream_info: StreamInfo, - **kwargs: Any, # Options to pass to the converter - ) -> bool: - mimetype = (stream_info.mimetype or "").lower() - extension = (stream_info.extension or "").lower() - - if extension in ACCEPTED_XLS_FILE_EXTENSIONS: - return True - - for prefix in ACCEPTED_XLS_MIME_TYPE_PREFIXES: - if mimetype.startswith(prefix): - return True - - return False - def convert( self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: - # Load the dependencies - if _xls_dependency_exc_info is not None: - raise MissingDependencyException( - MISSING_DEPENDENCY_MESSAGE.format( - converter=type(self).__name__, - extension=".xls", - feature="xls", - ) - ) from _xls_dependency_exc_info[ - 1 - ].with_traceback( # type: ignore[union-attr] - _xls_dependency_exc_info[2] - ) sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd") md_content = "" diff --git a/packages/markitup/uv.lock b/packages/markitup/uv.lock index 0ed9063..fa5560b 100644 --- a/packages/markitup/uv.lock +++ b/packages/markitup/uv.lock @@ -173,15 +173,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934", size = 46018 }, ] -[[package]] -name = "et-xmlfile" -version = "2.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059 }, -] - [[package]] name = "flatbuffers" version = "25.2.10" @@ -348,7 +339,6 @@ dependencies = [ { name = "mammoth" }, { name = "markdownify" }, { name = "olefile" }, - { name = "openpyxl" }, { name = "pandas" }, { name = "pydub" }, { name = "pymupdf" }, @@ -368,7 +358,6 @@ requires-dist = [ { name = "mammoth" }, { name = "markdownify" }, { name = "olefile" }, - { name = "openpyxl" }, { name = "pandas" }, { name = "pydub" }, { name = "pymupdf", specifier = ">=1.25.5" }, @@ -492,18 +481,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3a/72/5ff85c540fd6a465610ce47e4cee8fccb472952fc1d589112f51ae2520a5/onnxruntime-1.21.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5c9e4571ff5b2a5d377d414bc85cd9450ba233a9a92f766493874f1093976453", size = 15990556 }, ] -[[package]] -name = "openpyxl" -version = "3.1.5" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "et-xmlfile" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910 }, -] - [[package]] name = "packaging" version = "25.0"