Merge b1748afa4d into 041be54471

2025-04-20 17:01:26 -04:00 · 2025-04-20 17:01:26 -04:00 · 6fdb65a1ab
commit 6fdb65a1ab
parent 041be54471 b1748afa4d
3 changed files with 72 additions and 37 deletions
--- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@@ -33,15 +33,68 @@ ACCEPTED_XLS_MIME_TYPE_PREFIXES = [
 ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]


-class XlsxConverter(DocumentConverter):
-    """
-    Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
-    """
+class ExcelConverterBase(DocumentConverter):
+    """Base class for Excel-like converters"""

    def __init__(self):
        super().__init__()
        self._html_converter = HtmlConverter()

+    def _clean_colname(self, colname: Any) -> Any:
+        # Map pandas header placeholders (names starting with "Unnamed:") to None
+        if isinstance(colname, str) and colname.startswith("Unnamed:"):
+            return None
+        return colname
+
+    def _convert_excel(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        engine: str,
+        na_rep: Any = "",
+        remove_header_placeholders: bool = True,
+        drop_empty_cols: bool = False,
+        drop_empty_rows: bool = False,
+        **kwargs: Any,
+    ) -> DocumentConverterResult:
+        sheets = pd.read_excel(file_stream, sheet_name=None, engine=engine)
+        md_content = ""
+        for name, sheet in sheets.items():
+            md_content += f"## {name}\n"
+
+            if remove_header_placeholders:
+                sheet = sheet.rename(columns=lambda col: self._clean_colname(col))
+
+            if drop_empty_cols:
+                # Keep a column if it has any non-NA cell or a non-NA header; drop it only when both are empty
+                sheet = sheet.loc[:, sheet.notna().any() | sheet.columns.notna()]
+
+            if drop_empty_rows:
+                sheet = sheet.dropna(axis=0, how="all")
+
+            # Replace every cell for which `pd.isna(cell)` is true with `na_rep`.
+            # This is more reliable than `.to_html(na_rep=...)`, which does not
+            # replace NaT values: https://github.com/pandas-dev/pandas/issues/11953
+            with pd.option_context("future.no_silent_downcasting", True):
+                sheet = sheet.fillna(na_rep, axis=1).infer_objects(copy=False)
+                sheet.columns = sheet.columns.fillna(na_rep).infer_objects(copy=False)
+
+            html_content = sheet.to_html(index=False)
+            md_content += (
+                self._html_converter.convert_string(
+                    html_content, **kwargs
+                ).markdown.strip()
+                + "\n\n"
+            )
+
+        return DocumentConverterResult(markdown=md_content.strip())
+
+
+class XlsxConverter(ExcelConverterBase):
+    """
+    Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
+    """
+
    def accepts(
        self,
        file_stream: BinaryIO,
@@ -80,30 +133,19 @@ class XlsxConverter(DocumentConverter):
                _xlsx_dependency_exc_info[2]
            )

-        sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
-        md_content = ""
-        for s in sheets:
-            md_content += f"## {s}\n"
-            html_content = sheets[s].to_html(index=False)
-            md_content += (
-                self._html_converter.convert_string(
-                    html_content, **kwargs
-                ).markdown.strip()
-                + "\n\n"
-            )
-
-        return DocumentConverterResult(markdown=md_content.strip())
+        return self._convert_excel(
+            file_stream=file_stream,
+            stream_info=stream_info,
+            engine="openpyxl",
+            **kwargs,
+        )


-class XlsConverter(DocumentConverter):
+class XlsConverter(ExcelConverterBase):
    """
    Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
    """

-    def __init__(self):
-        super().__init__()
-        self._html_converter = HtmlConverter()
-
    def accepts(
        self,
        file_stream: BinaryIO,
@@ -142,16 +184,9 @@ class XlsConverter(DocumentConverter):
                _xls_dependency_exc_info[2]
            )

-        sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd")
-        md_content = ""
-        for s in sheets:
-            md_content += f"## {s}\n"
-            html_content = sheets[s].to_html(index=False)
-            md_content += (
-                self._html_converter.convert_string(
-                    html_content, **kwargs
-                ).markdown.strip()
-                + "\n\n"
-            )
-
-        return DocumentConverterResult(markdown=md_content.strip())
+        return self._convert_excel(
+            file_stream=file_stream,
+            stream_info=stream_info,
+            engine="xlrd",
+            **kwargs,
+        )
--- a/packages/markitdown/tests/_test_vectors.py
+++ b/packages/markitdown/tests/_test_vectors.py
@@ -41,7 +41,7 @@ GENERAL_TEST_VECTORS = [
            "6ff4173b-42a5-4784-9b19-f49caff4d93d",
            "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
        ],
-        must_not_include=[],
+        must_not_include=["Unnamed:", "NaN"],
    ),
    FileTestVector(
        filename="test.xls",
@@ -53,7 +53,7 @@ GENERAL_TEST_VECTORS = [
            "6ff4173b-42a5-4784-9b19-f49caff4d93d",
            "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
        ],
-        must_not_include=[],
+        must_not_include=["Unnamed:", "NaN"],
    ),
    FileTestVector(
        filename="test.pptx",
--- a/packages/markitdown/tests/test_files/test.xlsx
+++ b/packages/markitdown/tests/test_files/test.xlsx