Add an opt-out `beautify` keyword to XlsxConverter.convert (defaults to True).

* src/markitdown/_markitdown.py: the column-name cleanup and the dropping of
  all-empty columns/rows move into XlsxConverter._clean_dataframe; that
  cleanup and na_rep="" are applied only when `beautify` is true.
* tests/test_markitdown.py: test.xlsx is converted with beautify=False for the
  string checks and with the default for the "Unnamed:" / "NaN" negations.

diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index 376c75c..a72a963 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -527,7 +527,16 @@ class XlsxConverter(HtmlConverter):
             return ""
         return colname
 
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+    def _clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
+        return (
+            df.rename(columns=lambda col: self._clean_colname(col))
+            .dropna(how="all", axis=1)
+            .dropna(how="all", axis=0)
+        )
+
+    def convert(
+        self, local_path, beautify: bool = True, **kwargs
+    ) -> Union[None, DocumentConverterResult]:
         # Bail if not a XLSX
         extension = kwargs.get("file_extension", "")
         if extension.lower() != ".xlsx":
@@ -535,14 +544,13 @@ class XlsxConverter(HtmlConverter):
 
         sheets = pd.read_excel(local_path, sheet_name=None)
         md_content = ""
-        for s in sheets:
-            md_content += f"## {s}\n"
-            sheet = sheets[s]
-            sheet.columns = list(map(self._clean_colname, sheet.columns))
+        for name, sheet in sheets.items():
+            md_content += f"## {name}\n"
+            df = self._clean_dataframe(sheet) if beautify else sheet
             html_content = (
-                sheet.dropna(how="all", axis=1)
-                .dropna(how="all", axis=0)
-                .to_html(index=False, na_rep="")
+                df.to_html(index=False, na_rep="")
+                if beautify
+                else df.to_html(index=False)
             )
             md_content += self._convert(html_content).text_content.strip() + "\n\n"
 
diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
index 2f061dc..bb666e9 100644
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@@ -42,6 +42,7 @@ XLSX_TEST_STRINGS = [
     "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
 ]
 
+
 DOCX_TEST_STRINGS = [
     "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
     "49e168b7-d2ae-407f-a055-2167576f39a1",
@@ -139,14 +140,18 @@ def test_markitdown_local() -> None:
     markitdown = MarkItDown()
 
     # Test XLSX processing
-    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
+    # XlsxConverter has an additional kwarg `beautify`, which defaults to True
+    result = markitdown.convert(
+        os.path.join(TEST_FILES_DIR, "test.xlsx"), beautify=False
+    )
+    result_cleaned = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
     # Check assertions
     for test_string in XLSX_TEST_STRINGS:
         text_content = result.text_content.replace("\\", "")
         assert test_string in text_content
     # Check negations
-    assert "Unnamed:" not in text_content
-    assert "NaN" not in text_content
+    assert "Unnamed:" not in result_cleaned.text_content
+    assert "NaN" not in result_cleaned.text_content
 
     # Test DOCX processing
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))