Merge a0dc566d39 into 041be54471
This commit is contained in:
commit
0ebadf7463
4 changed files with 143 additions and 2 deletions
|
|
@ -81,6 +81,10 @@ class XlsxConverter(DocumentConverter):
|
||||||
)
|
)
|
||||||
|
|
||||||
sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
|
sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
|
||||||
|
if kwargs.get("fill_merged_cells", False):
|
||||||
|
md_content = self._parse_merged_cells(file_stream, sheets, **kwargs)
|
||||||
|
return DocumentConverterResult(markdown=md_content.strip())
|
||||||
|
|
||||||
md_content = ""
|
md_content = ""
|
||||||
for s in sheets:
|
for s in sheets:
|
||||||
md_content += f"## {s}\n"
|
md_content += f"## {s}\n"
|
||||||
|
|
@ -94,6 +98,63 @@ class XlsxConverter(DocumentConverter):
|
||||||
|
|
||||||
return DocumentConverterResult(markdown=md_content.strip())
|
return DocumentConverterResult(markdown=md_content.strip())
|
||||||
|
|
||||||
|
def _parse_merged_cells(
    self, file_stream: BinaryIO, sheets: dict[str, pd.DataFrame], **kwargs: Any
) -> str:
    """Render each sheet as Markdown with merged-cell values filled in.

    The workbook is re-read with openpyxl to discover merged ranges. Every
    cell covered by a merged range is given the value stored in the range's
    top-left cell, and the corresponding pandas frame is patched before it
    is converted to HTML and then to Markdown.

    Args:
        file_stream: Binary stream containing the XLSX workbook.
        sheets: Mapping of sheet name to the DataFrame pandas parsed for
            that sheet. The frames are modified in place.
        **kwargs: Forwarded unchanged to the HTML converter.

    Returns:
        The concatenated Markdown of all sheets: a ``## <sheet name>``
        heading followed by the sheet's table, with a blank line after
        each sheet.
    """
    # sheet title -> {(row, col): value}, using openpyxl's 1-based coordinates.
    merged_cells_info: dict[str, dict[tuple[int, int], Any]] = {}

    wb = openpyxl.load_workbook(file_stream)
    try:
        for sheet in wb.worksheets:
            merged_cells: dict[tuple[int, int], Any] = {}
            for merged_range in sheet.merged_cells.ranges:
                min_col, min_row, max_col, max_row = merged_range.bounds
                # The fill value is read from the range's top-left cell.
                common_value = sheet.cell(row=min_row, column=min_col).value
                for row in range(min_row, max_row + 1):
                    for col in range(min_col, max_col + 1):
                        merged_cells[(row, col)] = common_value
            # Visit cells in (row, col) order so header cells (row 1) are
            # handled first and new columns are added left to right.
            merged_cells_info[sheet.title] = dict(
                sorted(merged_cells.items(), key=lambda item: item[0])
            )
    finally:
        # Release the workbook even if scanning a sheet fails.
        wb.close()

    parts: list[str] = []
    for s in sheets:
        parts.append(f"## {s}\n")
        df = sheets[s]
        # A sheet pandas returned but openpyxl did not list simply has no
        # merged cells to apply.
        for (row, col), value in merged_cells_info.get(s, {}).items():
            if row == 1:
                # Header row merged.
                header_name = f"{value} {col - 1}"
                if col > len(df.columns):
                    # The merged header extends past the columns pandas
                    # produced: append a placeholder column. Appending at
                    # the end (rather than at col - 1) stays valid even
                    # when there is a gap between the frame and the range.
                    df.insert(len(df.columns), header_name, "NaN")
                elif str(df.columns[col - 1]).startswith("Unnamed"):
                    # Rename the unnamed column pandas generated.
                    df.rename(
                        columns={df.columns[col - 1]: header_name},
                        inplace=True,
                    )
                continue

            if col > len(df.columns):
                # The merged cell lies outside the frame's columns; there
                # is no column to write into, so skip it instead of
                # raising IndexError.
                continue

            # Assumes sheet row 1 is the header pandas consumed, so sheet
            # row r maps to frame index r - 2.
            df.at[row - 2, df.columns[col - 1]] = value

        html_content = df.to_html(index=False)
        parts.append(
            self._html_converter.convert_string(
                html_content, **kwargs
            ).markdown.strip()
            + "\n\n"
        )

    return "".join(parts)
|
||||||
|
|
||||||
|
|
||||||
class XlsConverter(DocumentConverter):
|
class XlsConverter(DocumentConverter):
|
||||||
"""
|
"""
|
||||||
|
|
|
||||||
|
|
@ -277,3 +277,22 @@ DATA_URI_TEST_VECTORS = [
|
||||||
],
|
],
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
# Expectations for converting test.xlsx with fill_merged_cells=True.
MERGED_CELLS_TEST_VECTORS = [
    FileTestVector(
        filename="test.xlsx",
        mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        charset=None,
        url=None,
        must_include=[
            # Table fragments expected in the rendered Markdown.
            "722.0 | NaN | NaN",
            "NaN | 42.000000 | NaN",
            # Header row after merged header cells are filled/renamed.
            "Merged Column | Merged Column 6 | Merged Column 2 | Merged Column 2 8",
            # Presumably a sheet heading and cell values from the fixture
            # workbook -- verify against test.xlsx.
            "## 09060124-b5e7-4717-9d07-3c046eb",
            "6ff4173b-42a5-4784-9b19-f49caff4d93d",
            "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
        ],
        # No pandas placeholder header ("Unnamed: N") may remain.
        must_not_include=["Unnamed"],
    ),
]
|
||||||
|
|
|
||||||
BIN
packages/markitdown/tests/test_files/test.xlsx
vendored
BIN
packages/markitdown/tests/test_files/test.xlsx
vendored
Binary file not shown.
|
|
@ -8,9 +8,17 @@ import base64
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
from _test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS
|
from _test_vectors import (
|
||||||
|
GENERAL_TEST_VECTORS,
|
||||||
|
DATA_URI_TEST_VECTORS,
|
||||||
|
MERGED_CELLS_TEST_VECTORS,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
from ._test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS
|
from ._test_vectors import (
|
||||||
|
GENERAL_TEST_VECTORS,
|
||||||
|
DATA_URI_TEST_VECTORS,
|
||||||
|
MERGED_CELLS_TEST_VECTORS,
|
||||||
|
)
|
||||||
|
|
||||||
from markitdown import (
|
from markitdown import (
|
||||||
MarkItDown,
|
MarkItDown,
|
||||||
|
|
@ -202,6 +210,45 @@ def test_convert_stream_keep_data_uris(test_vector):
|
||||||
assert string not in result.markdown
|
assert string not in result.markdown
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("test_vector", MERGED_CELLS_TEST_VECTORS)
def test_convert_xlsx(test_vector):
    """Convert an XLSX file from a local path with merged cells filled."""
    converter = MarkItDown()
    source_path = os.path.join(TEST_FILES_DIR, test_vector.filename)

    result = converter.convert(
        source_path,
        fill_merged_cells=True,
        url=test_vector.url,
    )

    # Every expected fragment must be present in the output...
    for expected in test_vector.must_include:
        assert expected in result.markdown
    # ...and none of the forbidden ones.
    for forbidden in test_vector.must_not_include:
        assert forbidden not in result.markdown
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("test_vector", MERGED_CELLS_TEST_VECTORS)
def test_convert_stream_xlsx(test_vector):
    """Convert an XLSX file from a binary stream with merged cells filled."""
    converter = MarkItDown()

    # Describe the stream explicitly; the vector's filename supplies the
    # extension.
    _, extension = os.path.splitext(test_vector.filename)
    stream_info = StreamInfo(
        extension=extension,
        mimetype=test_vector.mimetype,
        charset=test_vector.charset,
    )

    source_path = os.path.join(TEST_FILES_DIR, test_vector.filename)
    with open(source_path, "rb") as stream:
        result = converter.convert(
            stream,
            stream_info=stream_info,
            fill_merged_cells=True,
            url=test_vector.url,
        )

    # Every expected fragment must be present in the output...
    for expected in test_vector.must_include:
        assert expected in result.markdown
    # ...and none of the forbidden ones.
    for forbidden in test_vector.must_not_include:
        assert forbidden not in result.markdown
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
|
@ -237,3 +284,17 @@ if __name__ == "__main__":
|
||||||
print("OK")
|
print("OK")
|
||||||
|
|
||||||
print("All tests passed!")
|
print("All tests passed!")
|
||||||
|
|
||||||
|
# XLSX parse merged cells tests
|
||||||
|
for test_function in [
|
||||||
|
test_convert_xlsx,
|
||||||
|
test_convert_stream_xlsx,
|
||||||
|
]:
|
||||||
|
for test_vector in MERGED_CELLS_TEST_VECTORS:
|
||||||
|
print(
|
||||||
|
f"Running {test_function.__name__} on {test_vector.filename}...", end=""
|
||||||
|
)
|
||||||
|
test_function(test_vector)
|
||||||
|
print("OK")
|
||||||
|
|
||||||
|
print("All tests passed!")
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue