diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index a72a963..67f31af 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -523,19 +523,18 @@ class XlsxConverter(HtmlConverter): """ def _clean_colname(self, colname: Any) -> Any: + # Remove Pandas header placeholders if isinstance(colname, str) and colname.startswith("Unnamed:"): return "" return colname - def _clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame: - return ( - df.rename(columns=lambda col: self._clean_colname(col)) - .dropna(how="all", axis=1) - .dropna(how="all", axis=0) - ) - def convert( - self, local_path, beautify: bool = True, **kwargs + self, + local_path, + na_rep: Any = "", + drop_empty_cols: bool = False, + drop_empty_rows: bool = False, + **kwargs, ) -> Union[None, DocumentConverterResult]: # Bail if not a XLSX extension = kwargs.get("file_extension", "") @@ -546,12 +545,15 @@ class XlsxConverter(HtmlConverter): md_content = "" for name, sheet in sheets.items(): md_content += f"## {name}\n" - df = self._clean_dataframe(sheet) if beautify else sheet - html_content = ( - df.to_html(index=False, na_rep="") - if beautify - else df.to_html(index=False) - ) + sheet = sheet.rename(columns=lambda col: self._clean_colname(col)) + + if drop_empty_cols: + sheet = sheet.dropna(axis=1, how="all") + + if drop_empty_rows: + sheet = sheet.dropna(axis=0, how="all") + + html_content = sheet.to_html(index=False, na_rep=na_rep) md_content += self._convert(html_content).text_content.strip() + "\n\n" return DocumentConverterResult( diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index bb666e9..aeba9b4 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -140,18 +140,16 @@ def test_markitdown_local() -> None: markitdown = MarkItDown() # Test XLSX processing - # XlsxConverter has an additional kwarg `beautify`, which defaults to True - result = markitdown.convert( - os.path.join(TEST_FILES_DIR, "test.xlsx"), beautify=False - ) - result_cleaned = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx")) + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx")) + text_content = result.text_content.replace("\\", "") + # Check assertions for test_string in XLSX_TEST_STRINGS: - text_content = result.text_content.replace("\\", "") assert test_string in text_content + # Check negations - assert "Unnamed:" not in result_cleaned.text_content - assert "NaN" not in result_cleaned.text_content + assert "Unnamed:" not in result.text_content + assert "NaN" not in result.text_content # Test DOCX processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))