def _parse_merged_cells(
    self, file_stream: BinaryIO, sheets: dict[str, pd.DataFrame], **kwargs: Any
) -> str:
    """Render every sheet as Markdown with merged cells filled in.

    openpyxl is used to find each merged range in the workbook; the value of
    the range's top-left cell is then copied into every DataFrame cell that
    the range covers. Merged cells that fall on worksheet row 1 are treated
    as header cells and become column names instead of data.

    Args:
        file_stream: Binary stream holding the XLSX workbook.
        sheets: Mapping of sheet name to the DataFrame already parsed for
            that sheet. The frames are copied, never modified in place.
        **kwargs: Forwarded unchanged to the HTML converter.

    Returns:
        One "## <sheet name>" section per sheet, each followed by the
        converted table and a blank line, concatenated into one string.
    """
    # sheet title -> {(row, col): value}, using 1-based worksheet coordinates.
    merged_cells_info: dict[str, dict[tuple[int, int], Any]] = {}

    wb = openpyxl.load_workbook(file_stream)
    try:
        for sheet in wb.worksheets:
            merged_cells: dict[tuple[int, int], Any] = {}
            for merged_range in sheet.merged_cells.ranges:
                min_col, min_row, max_col, max_row = merged_range.bounds
                # The top-left cell of the range supplies the shared value.
                common_value = sheet.cell(row=min_row, column=min_col).value
                for row_idx in range(min_row, max_row + 1):
                    for col_idx in range(min_col, max_col + 1):
                        merged_cells[(row_idx, col_idx)] = common_value
            # Order by (row, col) so header cells (row 1) are applied first
            # and new header columns are appended left to right.
            merged_cells_info[sheet.title] = dict(
                sorted(merged_cells.items(), key=lambda item: item[0])
            )
    finally:
        # Release the workbook even when reading a range fails.
        wb.close()

    sections: list[str] = []
    for sheet_name, parsed_df in sheets.items():
        # Work on a copy so the caller's DataFrames are left untouched.
        df = parsed_df.copy()
        # A sheet without merged-cell information is rendered as-is.
        sheet_merges = merged_cells_info.get(sheet_name, {})
        for (row, col), value in sheet_merges.items():
            if row == 1:
                # Header row merged.
                if col > len(df.columns):
                    # The merged header extends past the parsed frame:
                    # append a column. Appending (rather than inserting at
                    # col - 1) stays valid when the gap is wider than one.
                    df.insert(len(df.columns), f"{value} {col - 1}", "NaN")
                elif str(df.columns[col - 1]).startswith("Unnamed"):
                    # Replace the placeholder name of an empty header cell.
                    df = df.rename(
                        columns={df.columns[col - 1]: f"{value} {col - 1}"}
                    )
            elif col <= len(df.columns):
                # Worksheet row 2 maps to DataFrame index 0; this assumes
                # worksheet row 1 was consumed as the header row.
                df.at[row - 2, df.columns[col - 1]] = value
            # Body cells to the right of the parsed frame have no column to
            # receive the value, so they are skipped instead of raising.

        html_content = df.to_html(index=False)
        rendered = self._html_converter.convert_string(
            html_content, **kwargs
        ).markdown.strip()
        sections.append(f"## {sheet_name}\n{rendered}\n\n")

    return "".join(sections)
def _assert_merged_cells_markdown(markdown, test_vector):
    """Check `markdown` against the vector's include and exclude lists."""
    for expected in test_vector.must_include:
        assert expected in markdown
    for forbidden in test_vector.must_not_include:
        assert forbidden not in markdown


@pytest.mark.parametrize("test_vector", MERGED_CELLS_TEST_VECTORS)
def test_convert_xlsx(test_vector):
    """Convert an XLSX file by path with fill_merged_cells enabled."""
    source_path = os.path.join(TEST_FILES_DIR, test_vector.filename)
    converter = MarkItDown()

    result = converter.convert(
        source_path,
        fill_merged_cells=True,
        url=test_vector.url,
    )

    _assert_merged_cells_markdown(result.markdown, test_vector)


@pytest.mark.parametrize("test_vector", MERGED_CELLS_TEST_VECTORS)
def test_convert_stream_xlsx(test_vector):
    """Convert an XLSX file from an open stream with fill_merged_cells enabled."""
    converter = MarkItDown()

    # Describe the stream explicitly so conversion does not rely on a path.
    _, extension = os.path.splitext(test_vector.filename)
    stream_info = StreamInfo(
        extension=extension,
        mimetype=test_vector.mimetype,
        charset=test_vector.charset,
    )

    source_path = os.path.join(TEST_FILES_DIR, test_vector.filename)
    with open(source_path, "rb") as stream:
        result = converter.convert(
            stream,
            stream_info=stream_info,
            fill_merged_cells=True,
            url=test_vector.url,
        )

    _assert_merged_cells_markdown(result.markdown, test_vector)
{test_vector.filename}...", end="" + ) + test_function(test_vector) + print("OK") + + print("All tests passed!")