diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 67f31af..a576196 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -525,7 +525,7 @@ class XlsxConverter(HtmlConverter): def _clean_colname(self, colname: Any) -> Any: # Remove Pandas header placeholders if isinstance(colname, str) and colname.startswith("Unnamed:"): - return "" + return None return colname def convert( @@ -548,11 +548,16 @@ class XlsxConverter(HtmlConverter): sheet = sheet.rename(columns=lambda col: self._clean_colname(col)) if drop_empty_cols: - sheet = sheet.dropna(axis=1, how="all") + # also consider headers to be part of the column + sheet = sheet.loc[:, sheet.notna().any() | sheet.columns.notna()] if drop_empty_rows: sheet = sheet.dropna(axis=0, how="all") + # convert remaining NaN's to empty string + # because .to_html(na_rep="") does not apply to headers + sheet.columns = sheet.columns.fillna(na_rep) + html_content = sheet.to_html(index=False, na_rep=na_rep) md_content += self._convert(html_content).text_content.strip() + "\n\n" diff --git a/tests/test_files/test.xlsx b/tests/test_files/test.xlsx index 0dcbeb9..9153d52 100755 Binary files a/tests/test_files/test.xlsx and b/tests/test_files/test.xlsx differ