chore: finer flags, forward na_rep

This commit is contained in:
Hew Li Yang 2024-12-17 21:17:40 +08:00
parent c2aae4ddda
commit 5c60d8ca12
2 changed files with 22 additions and 22 deletions

View file

@ -523,19 +523,18 @@ class XlsxConverter(HtmlConverter):
"""
def _clean_colname(self, colname: Any) -> Any:
# Remove Pandas header placeholders
if isinstance(colname, str) and colname.startswith("Unnamed:"):
return ""
return colname
def _clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
    """Return a tidied version of ``df``.

    Column labels are first passed through ``self._clean_colname``; then
    columns whose values are all NA are dropped, followed by rows whose
    values are all NA.

    Args:
        df: The frame to tidy.

    Returns:
        The frame produced by the rename and the two ``dropna`` calls.
    """
    relabelled = df.rename(columns=self._clean_colname)
    # Columns are pruned before rows, matching the original call order.
    without_empty_cols = relabelled.dropna(axis=1, how="all")
    return without_empty_cols.dropna(axis=0, how="all")
def convert(
self, local_path, beautify: bool = True, **kwargs
self,
local_path,
na_rep: Any = "",
drop_empty_cols: bool = False,
drop_empty_rows: bool = False,
**kwargs,
) -> Union[None, DocumentConverterResult]:
# Bail if not a XLSX
extension = kwargs.get("file_extension", "")
@ -546,12 +545,15 @@ class XlsxConverter(HtmlConverter):
md_content = ""
for name, sheet in sheets.items():
md_content += f"## {name}\n"
df = self._clean_dataframe(sheet) if beautify else sheet
html_content = (
df.to_html(index=False, na_rep="")
if beautify
else df.to_html(index=False)
)
sheet = sheet.rename(columns=lambda col: self._clean_colname(col))
if drop_empty_cols:
sheet = sheet.dropna(axis=1, how="all")
if drop_empty_rows:
sheet = sheet.dropna(axis=0, how="all")
html_content = sheet.to_html(index=False, na_rep=na_rep)
md_content += self._convert(html_content).text_content.strip() + "\n\n"
return DocumentConverterResult(

View file

@ -140,18 +140,16 @@ def test_markitdown_local() -> None:
markitdown = MarkItDown()
# Test XLSX processing
# XlsxConverter has an additional kwarg `beautify`, which defaults to True
result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "test.xlsx"), beautify=False
)
result_cleaned = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
text_content = result.text_content.replace("\\", "")
# Check assertions
for test_string in XLSX_TEST_STRINGS:
text_content = result.text_content.replace("\\", "")
assert test_string in text_content
# Check negations
assert "Unnamed:" not in result_cleaned.text_content
assert "NaN" not in result_cleaned.text_content
assert "Unnamed:" not in result.text_content
assert "NaN" not in result.text_content
# Test DOCX processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))