diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
index 28f73a0..0ddff7c 100644
--- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@@ -33,15 +33,91 @@ ACCEPTED_XLS_MIME_TYPE_PREFIXES = [
 ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]
 
 
-class XlsxConverter(DocumentConverter):
-    """
-    Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
-    """
+class ExcelConverterBase(DocumentConverter):
+    """
+    Shared implementation for converters that read Excel workbooks with pandas
+    and render each sheet as a separate Markdown table.
+    """
 
     def __init__(self):
         super().__init__()
         self._html_converter = HtmlConverter()
 
+    def _clean_colname(self, colname: Any) -> Any:
+        """
+        Return None for the "Unnamed: <n>" placeholders that pandas generates for
+        blank header cells; return every other column label unchanged.
+        """
+        prefix = "Unnamed: "
+        if isinstance(colname, str) and colname.startswith(prefix):
+            suffix = colname[len(prefix) :]
+            # Only strip genuine placeholders, so that a real header such as
+            # "Unnamed: draft" is preserved.
+            if suffix.isascii() and suffix.isdigit():
+                return None
+        return colname
+
+    def _convert_excel(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        engine: str,
+        na_rep: Any = "",
+        remove_header_placeholders: bool = True,
+        drop_empty_cols: bool = False,
+        drop_empty_rows: bool = False,
+        **kwargs: Any,
+    ) -> DocumentConverterResult:
+        """
+        Read every sheet of the workbook and render it as a Markdown table under
+        a "## <sheet name>" heading.
+
+        - na_rep: replacement for missing cells and missing column labels.
+        - remove_header_placeholders: blank out pandas' "Unnamed: <n>" headers.
+        - drop_empty_cols: drop columns that have neither a header nor a value.
+        - drop_empty_rows: drop rows in which every cell is missing.
+        - kwargs: forwarded to the HTML converter.
+        """
+        sheets = pd.read_excel(file_stream, sheet_name=None, engine=engine)
+        sections = []
+        for name, sheet in sheets.items():
+            if remove_header_placeholders:
+                sheet = sheet.rename(columns=self._clean_colname)
+
+            if drop_empty_cols:
+                # A real header counts as column content; placeholder headers do
+                # not, even when they were not blanked out above.
+                has_header = sheet.columns.map(self._clean_colname).notna()
+                # Positional mask, so duplicate column labels cannot misalign it
+                has_value = sheet.notna().any().to_numpy()
+                sheet = sheet.loc[:, has_value | has_header]
+
+            if drop_empty_rows:
+                sheet = sheet.dropna(axis=0, how="all")
+
+            # Coerce any cell that evaluates to `pd.isna(c) == True` to `na_rep`
+            # More reliable than using `.to_html(na_rep=...)`: https://github.com/pandas-dev/pandas/issues/11953
+            # Because the latter does not replace NaT's
+            # NOTE(review): older pandas releases do not register this option and
+            # raise OptionError here; confirm the minimum supported pandas version.
+            with pd.option_context("future.no_silent_downcasting", True):
+                sheet = sheet.fillna(na_rep, axis=1).infer_objects(copy=False)
+                sheet.columns = sheet.columns.fillna(na_rep).infer_objects(copy=False)
+
+            html_content = sheet.to_html(index=False)
+            table_md = self._html_converter.convert_string(
+                html_content, **kwargs
+            ).markdown.strip()
+            sections.append(f"## {name}\n{table_md}")
+
+        return DocumentConverterResult(markdown="\n\n".join(sections).strip())
+
+
+class XlsxConverter(ExcelConverterBase):
+    """
+    Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
+    """
+
     def accepts(
         self,
         file_stream: BinaryIO,
@@ -80,30 +156,19 @@ class XlsxConverter(DocumentConverter):
                 _xlsx_dependency_exc_info[2]
             )
 
-        sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
-        md_content = ""
-        for s in sheets:
-            md_content += f"## {s}\n"
-            html_content = sheets[s].to_html(index=False)
-            md_content += (
-                self._html_converter.convert_string(
-                    html_content, **kwargs
-                ).markdown.strip()
-                + "\n\n"
-            )
-
-        return DocumentConverterResult(markdown=md_content.strip())
+        return self._convert_excel(
+            file_stream=file_stream,
+            stream_info=stream_info,
+            engine="openpyxl",
+            **kwargs,
+        )
 
 
-class XlsConverter(DocumentConverter):
+class XlsConverter(ExcelConverterBase):
     """
     Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
     """
 
-    def __init__(self):
-        super().__init__()
-        self._html_converter = HtmlConverter()
-
     def accepts(
         self,
         file_stream: BinaryIO,
@@ -142,16 +207,9 @@ class XlsConverter(DocumentConverter):
                 _xls_dependency_exc_info[2]
             )
 
-        sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd")
-        md_content = ""
-        for s in sheets:
-            md_content += f"## {s}\n"
-            html_content = sheets[s].to_html(index=False)
-            md_content += (
-                self._html_converter.convert_string(
-                    html_content, **kwargs
-                ).markdown.strip()
-                + "\n\n"
-            )
-
-        return DocumentConverterResult(markdown=md_content.strip())
+        return self._convert_excel(
+            file_stream=file_stream,
+            stream_info=stream_info,
+            engine="xlrd",
+            **kwargs,
+        )
diff --git a/packages/markitdown/tests/_test_vectors.py b/packages/markitdown/tests/_test_vectors.py
index 74fa9bd..4878e51 100644
--- a/packages/markitdown/tests/_test_vectors.py
+++ b/packages/markitdown/tests/_test_vectors.py
@@ -41,7 +41,7 @@ GENERAL_TEST_VECTORS = [
             "6ff4173b-42a5-4784-9b19-f49caff4d93d",
             "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
         ],
-        must_not_include=[],
+        must_not_include=["Unnamed:", "NaN"],
     ),
     FileTestVector(
         filename="test.xls",
@@ -53,7 +53,7 @@ GENERAL_TEST_VECTORS = [
             "6ff4173b-42a5-4784-9b19-f49caff4d93d",
             "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
         ],
-        must_not_include=[],
+        must_not_include=["Unnamed:", "NaN"],
     ),
     FileTestVector(
         filename="test.pptx",
diff --git a/packages/markitdown/tests/test_files/test.xlsx b/packages/markitdown/tests/test_files/test.xlsx
index 3a41e17..9153d52 100644
Binary files a/packages/markitdown/tests/test_files/test.xlsx and b/packages/markitdown/tests/test_files/test.xlsx differ