diff --git a/pyproject.toml b/pyproject.toml index 3e14cec..f661e52 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,9 +29,9 @@ dependencies = [ "mammoth", "markdownify", "numpy", + "python-calamine", "python-pptx", "pandas", - "openpyxl", "pdfminer.six", "puremagic", "pydub", diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 789c1e5..b0a6c75 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -713,18 +713,18 @@ class DocxConverter(HtmlConverter): return result -class XlsxConverter(HtmlConverter): +class ExcelConverter(HtmlConverter): """ - Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table. + Converts excel files to Markdown, with each sheet presented as a separate Markdown table. """ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: # Bail if not a XLSX extension = kwargs.get("file_extension", "") - if extension.lower() != ".xlsx": + if extension.lower() not in [".xlsx", ".xls", ".xlsm", ".xlsb"]: return None - sheets = pd.read_excel(local_path, sheet_name=None) + sheets = pd.read_excel(local_path, sheet_name=None, engine="calamine") md_content = "" for s in sheets: md_content += f"## {s}\n" @@ -1277,7 +1277,7 @@ class MarkItDown: self.register_page_converter(YouTubeConverter()) self.register_page_converter(BingSerpConverter()) self.register_page_converter(DocxConverter()) - self.register_page_converter(XlsxConverter()) + self.register_page_converter(ExcelConverter()) self.register_page_converter(PptxConverter()) self.register_page_converter(WavConverter()) self.register_page_converter(Mp3Converter()) diff --git a/tests/test_files/test.xls b/tests/test_files/test.xls new file mode 100644 index 0000000..f45bc98 Binary files /dev/null and b/tests/test_files/test.xls differ diff --git a/tests/test_files/test.xlsb b/tests/test_files/test.xlsb new file mode 100644 index 0000000..c002872 Binary files /dev/null and b/tests/test_files/test.xlsb differ diff --git a/tests/test_files/test.xlsm b/tests/test_files/test.xlsm new file mode 100644 index 0000000..01fff3d Binary files /dev/null and b/tests/test_files/test.xlsm differ diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 4a981bd..c6f1406 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -48,7 +48,7 @@ YOUTUBE_TEST_STRINGS = [ "the model we're going to be using today is GPT 3.5 turbo", # From the transcript ] -XLSX_TEST_STRINGS = [ +EXCEL_TEST_STRINGS = [ "## 09060124-b5e7-4717-9d07-3c046eb", "6ff4173b-42a5-4784-9b19-f49caff4d93d", "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0", @@ -174,7 +174,19 @@ def test_markitdown_local() -> None: # Test XLSX processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx")) - validate_strings(result, XLSX_TEST_STRINGS) + validate_strings(result, EXCEL_TEST_STRINGS) + + # Test XLS processing + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xls")) + validate_strings(result, EXCEL_TEST_STRINGS) + + # Test XLSM processing + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsm")) + validate_strings(result, EXCEL_TEST_STRINGS) + + # Test XLSB processing + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsb")) + validate_strings(result, EXCEL_TEST_STRINGS) # Test DOCX processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx")) @@ -206,7 +218,7 @@ def test_markitdown_local() -> None: # Test ZIP file processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip")) - validate_strings(result, XLSX_TEST_STRINGS) + validate_strings(result, EXCEL_TEST_STRINGS) # Test Wikipedia processing result = markitdown.convert(