This commit is contained in:
Yuzhong Zhang 2025-04-18 22:55:21 -07:00 committed by GitHub
commit 0ebadf7463
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 143 additions and 2 deletions

View file

@ -81,6 +81,10 @@ class XlsxConverter(DocumentConverter):
) )
sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl") sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
if kwargs.get("fill_merged_cells", False):
md_content = self._parse_merged_cells(file_stream, sheets, **kwargs)
return DocumentConverterResult(markdown=md_content.strip())
md_content = "" md_content = ""
for s in sheets: for s in sheets:
md_content += f"## {s}\n" md_content += f"## {s}\n"
@ -94,6 +98,63 @@ class XlsxConverter(DocumentConverter):
return DocumentConverterResult(markdown=md_content.strip()) return DocumentConverterResult(markdown=md_content.strip())
def _parse_merged_cells(
    self, file_stream: BinaryIO, sheets: dict[str, pd.DataFrame], **kwargs: Any
) -> str:
    """Render every sheet as Markdown with merged cells filled in.

    The workbook is re-opened with openpyxl to discover its merged ranges.
    Every cell covered by a range receives the value of the range's
    top-left cell, after which each sheet's DataFrame is converted to HTML
    and then to Markdown.

    Note: the DataFrames in ``sheets`` are modified in place (columns may be
    inserted or renamed and cell values overwritten).

    Args:
        file_stream: Binary stream containing the XLSX workbook.
        sheets: Mapping of sheet name to the DataFrame parsed from that
            sheet.
        **kwargs: Forwarded unchanged to the HTML converter.

    Returns:
        One ``## <sheet name>`` section per sheet, each followed by the
        sheet's table in Markdown and a blank line.
    """
    wb = openpyxl.load_workbook(file_stream)
    try:
        merged_cells_info = {}
        for sheet in wb.worksheets:
            merged_cells = {}
            for merged_range in sheet.merged_cells.ranges:
                min_col, min_row, max_col, max_row = merged_range.bounds
                # The value lives in the top-left cell of the merged range.
                common_value = sheet.cell(row=min_row, column=min_col).value
                for row in range(min_row, max_row + 1):
                    for col in range(min_col, max_col + 1):
                        merged_cells[(row, col)] = common_value
            # Order by (row, col) so header cells (row 1) are applied before
            # any data cell and columns are handled left to right.
            merged_cells_info[sheet.title] = dict(
                sorted(merged_cells.items(), key=lambda item: item[0])
            )
    finally:
        # Release the workbook even if reading the merged ranges failed.
        wb.close()

    sections = []
    for name, df in sheets.items():
        # A sheet without an entry simply has no merged cells to apply.
        for (row, col), value in merged_cells_info.get(name, {}).items():
            if row == 1:
                # Header row merged.
                if col > len(df.columns):
                    # The frame is narrower than the merged header: append a
                    # new column. Appending at the current end keeps the
                    # insert position valid even when columns are missing in
                    # between. The column is filled with the literal string
                    # "NaN".
                    df.insert(len(df.columns), f"{value} {col - 1}", "NaN")
                elif str(df.columns[col - 1]).startswith("Unnamed"):
                    # Placeholder label (presumably generated by pandas for a
                    # blank header cell): rename it after the merged value,
                    # suffixed with the zero-based column position.
                    df.rename(
                        columns={df.columns[col - 1]: f"{value} {col - 1}"},
                        inplace=True,
                    )
            elif col <= len(df.columns):
                # Worksheet rows are 1-based and row 1 is treated as the
                # header, so worksheet row r maps to frame label r - 2
                # (assumes the frame keeps its default integer index).
                df.at[row - 2, df.columns[col - 1]] = value
            # Merged data cells to the right of the frame's last column have
            # no column to land in and are skipped instead of raising.

        html_content = df.to_html(index=False)
        table_md = self._html_converter.convert_string(
            html_content, **kwargs
        ).markdown.strip()
        sections.append(f"## {name}\n{table_md}\n\n")

    return "".join(sections)
class XlsConverter(DocumentConverter): class XlsConverter(DocumentConverter):
""" """

View file

@ -277,3 +277,22 @@ DATA_URI_TEST_VECTORS = [
], ],
), ),
] ]
# Expected conversion results for test.xlsx. Each vector lists substrings that
# must appear in the produced Markdown and substrings that must not.
MERGED_CELLS_TEST_VECTORS = [
FileTestVector(
filename="test.xlsx",
mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
charset=None,
url=None,
# Substrings that have to be present in the converted Markdown.
must_include=[
"722.0 | NaN | NaN",
"NaN | 42.000000 | NaN",
# Presumably the header row after merged header cells were named
# "<value> <column index>" -- verify against the fixture workbook.
"Merged Column | Merged Column 6 | Merged Column 2 | Merged Column 2 8",
"## 09060124-b5e7-4717-9d07-3c046eb",
"6ff4173b-42a5-4784-9b19-f49caff4d93d",
"affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
],
# Substrings that must be absent from the converted Markdown.
must_not_include=["Unnamed"],
),
]

Binary file not shown.

View file

@ -8,9 +8,17 @@ import base64
from pathlib import Path from pathlib import Path
if __name__ == "__main__": if __name__ == "__main__":
from _test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS from _test_vectors import (
GENERAL_TEST_VECTORS,
DATA_URI_TEST_VECTORS,
MERGED_CELLS_TEST_VECTORS,
)
else: else:
from ._test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS from ._test_vectors import (
GENERAL_TEST_VECTORS,
DATA_URI_TEST_VECTORS,
MERGED_CELLS_TEST_VECTORS,
)
from markitdown import ( from markitdown import (
MarkItDown, MarkItDown,
@ -202,6 +210,45 @@ def test_convert_stream_keep_data_uris(test_vector):
assert string not in result.markdown assert string not in result.markdown
@pytest.mark.parametrize("test_vector", MERGED_CELLS_TEST_VECTORS)
def test_convert_xlsx(test_vector):
    """Convert an XLSX file by path with merged cells filled and check the output."""
    converter = MarkItDown()
    source_path = os.path.join(TEST_FILES_DIR, test_vector.filename)

    result = converter.convert(
        source_path,
        fill_merged_cells=True,
        url=test_vector.url,
    )

    # Every expected fragment must be present in the Markdown ...
    for expected in test_vector.must_include:
        assert expected in result.markdown
    # ... and none of the forbidden fragments may appear.
    for forbidden in test_vector.must_not_include:
        assert forbidden not in result.markdown
@pytest.mark.parametrize("test_vector", MERGED_CELLS_TEST_VECTORS)
def test_convert_stream_xlsx(test_vector):
    """Convert an XLSX file from an open binary stream with merged cells filled."""
    converter = MarkItDown()

    # Describe the stream explicitly: extension, MIME type and charset all
    # come from the test vector.
    _, extension = os.path.splitext(test_vector.filename)
    stream_info = StreamInfo(
        extension=extension,
        mimetype=test_vector.mimetype,
        charset=test_vector.charset,
    )

    source_path = os.path.join(TEST_FILES_DIR, test_vector.filename)
    with open(source_path, "rb") as stream:
        result = converter.convert(
            stream,
            stream_info=stream_info,
            fill_merged_cells=True,
            url=test_vector.url,
        )
        # Every expected fragment must be present in the Markdown ...
        for expected in test_vector.must_include:
            assert expected in result.markdown
        # ... and none of the forbidden fragments may appear.
        for forbidden in test_vector.must_not_include:
            assert forbidden not in result.markdown
if __name__ == "__main__": if __name__ == "__main__":
import sys import sys
@ -237,3 +284,17 @@ if __name__ == "__main__":
print("OK") print("OK")
print("All tests passed!") print("All tests passed!")
# Merged-cell XLSX tests: run each test function against every merged-cell
# vector, printing one progress line per combination.
for test_function in (test_convert_xlsx, test_convert_stream_xlsx):
    for test_vector in MERGED_CELLS_TEST_VECTORS:
        print(f"Running {test_function.__name__} on {test_vector.filename}...", end="")
        test_function(test_vector)
        print("OK")
print("All tests passed!")