Merge a0dc566d39 into 041be54471
commit 0ebadf7463
4 changed files with 143 additions and 2 deletions
@@ -81,6 +81,10 @@ class XlsxConverter(DocumentConverter):
            )

        sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
        if kwargs.get("fill_merged_cells", False):
            md_content = self._parse_merged_cells(file_stream, sheets, **kwargs)
            return DocumentConverterResult(markdown=md_content.strip())

        md_content = ""
        for s in sheets:
            md_content += f"## {s}\n"
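The behaviour is opt-in: callers pass fill_merged_cells=True to MarkItDown.convert, which forwards keyword arguments to the converter (exactly what the new tests below do). A minimal usage sketch, with a made-up file name:

    from markitdown import MarkItDown

    md = MarkItDown()
    # "report.xlsx" is a placeholder path; fill_merged_cells=True routes the
    # XLSX conversion through the new _parse_merged_cells() branch.
    result = md.convert("report.xlsx", fill_merged_cells=True)
    print(result.markdown)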
@@ -94,6 +98,63 @@ class XlsxConverter(DocumentConverter):

        return DocumentConverterResult(markdown=md_content.strip())

    def _parse_merged_cells(
        self, file_stream: BinaryIO, sheets: dict[str, pd.DataFrame], **kwargs: Any
    ) -> str:
        """Use openpyxl to parse merged cells.

        Args:
            file_stream: BinaryIO stream of the workbook
            sheets: sheet name to DataFrame mapping, as returned by pd.read_excel
            **kwargs: options forwarded to the HTML-to-Markdown converter

        Returns:
            str: Markdown for all sheets with merged-cell values filled in
        """
        wb = openpyxl.load_workbook(file_stream)

        merged_cells_info = {}
        for sheet in wb.worksheets:
            merged_cells = {}
            for merged_range in sheet.merged_cells.ranges:
                min_col, min_row, max_col, max_row = merged_range.bounds
                common_value = sheet.cell(row=min_row, column=min_col).value
                for row in range(min_row, max_row + 1):
                    for col in range(min_col, max_col + 1):
                        merged_cells[(row, col)] = common_value
            # Merged headers first (sort by row, then column)
            merged_cells = dict(
                sorted(merged_cells.items(), key=lambda x: (x[0][0], x[0][1]))
            )
            merged_cells_info[sheet.title] = merged_cells

        wb.close()

        md_content = ""
        for s in sheets:
            md_content += f"## {s}\n"
            df = sheets[s]
            for (row, col), value in merged_cells_info[s].items():
                if row == 1:
                    # Header row merged.
                    if col > len(df.columns):
                        # Insert new column
                        df.insert(col - 1, f"{value} {col-1}", "NaN")
                    elif str(df.columns[col - 1]).startswith("Unnamed"):
                        # Rename unnamed column
                        df.rename(
                            columns={df.columns[col - 1]: f"{value} {col-1}"},
                            inplace=True,
                        )
                else:
                    df.at[row - 2, df.columns[col - 1]] = value
            html_content = df.to_html(index=False)
            md_content += (
                self._html_converter.convert_string(
                    html_content, **kwargs
                ).markdown.strip()
                + "\n\n"
            )

        return md_content


class XlsConverter(DocumentConverter):
    """
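The helper leans on two openpyxl facts: worksheet.merged_cells.ranges lists each merged region, and range.bounds unpacks to (min_col, min_row, max_col, max_row), with only the top-left cell of a merge holding the value. A small self-contained sketch (the sheet contents here are invented):

    from openpyxl import Workbook

    wb = Workbook()
    ws = wb.active
    ws["A1"] = "Merged Column"   # only the top-left cell of a merge keeps a value
    ws.merge_cells("A1:C1")
    for rng in ws.merged_cells.ranges:
        min_col, min_row, max_col, max_row = rng.bounds
        print(rng, rng.bounds)   # A1:C1 (1, 1, 3, 1)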
@@ -277,3 +277,22 @@ DATA_URI_TEST_VECTORS = [
        ],
    ),
]


MERGED_CELLS_TEST_VECTORS = [
    FileTestVector(
        filename="test.xlsx",
        mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        charset=None,
        url=None,
        must_include=[
            "722.0 | NaN | NaN",
            "NaN | 42.000000 | NaN",
            "Merged Column | Merged Column 6 | Merged Column 2 | Merged Column 2 8",
            "## 09060124-b5e7-4717-9d07-3c046eb",
            "6ff4173b-42a5-4784-9b19-f49caff4d93d",
            "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
        ],
        must_not_include=["Unnamed"],
    ),
]
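The expected header strings follow the renaming scheme in _parse_merged_cells: pandas labels blank header cells "Unnamed: <n>", and the helper replaces them with the merged value plus the zero-based column index (f"{value} {col-1}"). A toy illustration, using an invented frame that stands in for a sheet whose merged header spans three columns:

    import pandas as pd

    # Column labels mimic what pd.read_excel produces for a merged header row.
    df = pd.DataFrame(
        [[722.0, None, None]],
        columns=["Merged Column", "Unnamed: 1", "Unnamed: 2"],
    )
    for col in range(len(df.columns)):
        if str(df.columns[col]).startswith("Unnamed"):
            df.rename(columns={df.columns[col]: f"Merged Column {col}"}, inplace=True)
    print(df.columns.tolist())  # ['Merged Column', 'Merged Column 1', 'Merged Column 2']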
BIN packages/markitdown/tests/test_files/test.xlsx (vendored)
Binary file not shown.
@@ -8,9 +8,17 @@ import base64
from pathlib import Path

if __name__ == "__main__":
-    from _test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS
+    from _test_vectors import (
+        GENERAL_TEST_VECTORS,
+        DATA_URI_TEST_VECTORS,
+        MERGED_CELLS_TEST_VECTORS,
+    )
else:
-    from ._test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS
+    from ._test_vectors import (
+        GENERAL_TEST_VECTORS,
+        DATA_URI_TEST_VECTORS,
+        MERGED_CELLS_TEST_VECTORS,
+    )

from markitdown import (
    MarkItDown,
@@ -202,6 +210,45 @@ def test_convert_stream_keep_data_uris(test_vector):
        assert string not in result.markdown


@pytest.mark.parametrize("test_vector", MERGED_CELLS_TEST_VECTORS)
def test_convert_xlsx(test_vector):
    """Test conversion of an XLSX file with merged cells filled."""
    markitdown = MarkItDown()

    result = markitdown.convert(
        os.path.join(TEST_FILES_DIR, test_vector.filename),
        fill_merged_cells=True,
        url=test_vector.url,
    )

    for string in test_vector.must_include:
        assert string in result.markdown
    for string in test_vector.must_not_include:
        assert string not in result.markdown


@pytest.mark.parametrize("test_vector", MERGED_CELLS_TEST_VECTORS)
def test_convert_stream_xlsx(test_vector):
    """Test stream conversion of an XLSX file with merged cells filled."""
    markitdown = MarkItDown()

    stream_info = StreamInfo(
        extension=os.path.splitext(test_vector.filename)[1],
        mimetype=test_vector.mimetype,
        charset=test_vector.charset,
    )

    with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
        result = markitdown.convert(
            stream, stream_info=stream_info, fill_merged_cells=True, url=test_vector.url
        )

    for string in test_vector.must_include:
        assert string in result.markdown
    for string in test_vector.must_not_include:
        assert string not in result.markdown


if __name__ == "__main__":
    import sys

@@ -237,3 +284,17 @@ if __name__ == "__main__":
            print("OK")

    print("All tests passed!")

    # XLSX parse merged cells tests
    for test_function in [
        test_convert_xlsx,
        test_convert_stream_xlsx,
    ]:
        for test_vector in MERGED_CELLS_TEST_VECTORS:
            print(
                f"Running {test_function.__name__} on {test_vector.filename}...", end=""
            )
            test_function(test_vector)
            print("OK")

    print("All tests passed!")