This commit is contained in:
Li Yang 2025-04-20 17:01:26 -04:00 committed by GitHub
commit 6fdb65a1ab
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 72 additions and 37 deletions

View file

@ -33,15 +33,68 @@ ACCEPTED_XLS_MIME_TYPE_PREFIXES = [
ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]
class XlsxConverter(DocumentConverter):
"""
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
"""
class ExcelConverterBase(DocumentConverter):
    """Shared conversion logic for Excel-like converters.

    Subclasses pick the pandas engine and delegate to ``_convert_excel``,
    which renders every sheet of a workbook as its own Markdown section.
    """

    def __init__(self):
        super().__init__()
        # Sheets are rendered DataFrame -> HTML -> Markdown, so keep one
        # HTML converter around for the lifetime of this converter.
        self._html_converter = HtmlConverter()

    def _clean_colname(self, colname: Any) -> Any:
        """Map a Pandas header placeholder to ``None``.

        A placeholder is any string starting with ``"Unnamed:"``; every
        other value (including non-strings) is returned unchanged.
        """
        is_placeholder = isinstance(colname, str) and colname.startswith("Unnamed:")
        return None if is_placeholder else colname

    def _convert_excel(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        engine: str,
        na_rep: Any = "",
        remove_header_placeholders: bool = True,
        drop_empty_cols: bool = False,
        drop_empty_rows: bool = False,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Convert every sheet of a workbook into a Markdown table.

        Args:
            file_stream: Binary stream handed to ``pd.read_excel``.
            stream_info: Accepted for interface parity; not read here.
            engine: Engine name forwarded to ``pd.read_excel``.
            na_rep: Replacement for every missing cell and missing header.
            remove_header_placeholders: When true, "Unnamed:" headers are
                replaced by ``None`` (and later by ``na_rep``).
            drop_empty_cols: When true, drop columns that have neither a
                header nor any non-missing cell.
            drop_empty_rows: When true, drop rows whose cells are all missing.
            **kwargs: Forwarded to the HTML converter's ``convert_string``.

        Returns:
            A ``DocumentConverterResult`` whose markdown holds one
            ``## <sheet name>`` section per sheet.
        """
        workbook = pd.read_excel(file_stream, sheet_name=None, engine=engine)

        sections = []
        for title, frame in workbook.items():
            if remove_header_placeholders:
                frame = frame.rename(columns=self._clean_colname)

            if drop_empty_cols:
                # A column survives if it holds data OR carries a real header:
                # the header counts as part of the column.
                has_data = frame.notna().any()
                has_header = frame.columns.notna()
                frame = frame.loc[:, has_data | has_header]

            if drop_empty_rows:
                frame = frame.dropna(how="all", axis=0)

            # Replace everything for which `pd.isna(c)` holds with `na_rep`.
            # Done via fillna rather than `.to_html(na_rep=...)` because the
            # latter leaves NaT values untouched:
            # https://github.com/pandas-dev/pandas/issues/11953
            with pd.option_context("future.no_silent_downcasting", True):
                filled = frame.fillna(na_rep, axis=1)
                frame = filled.infer_objects(copy=False)
                header = frame.columns.fillna(na_rep)
                frame.columns = header.infer_objects(copy=False)

            table_html = frame.to_html(index=False)
            converted = self._html_converter.convert_string(table_html, **kwargs)
            table_md = converted.markdown.strip()
            sections.append(f"## {title}\n{table_md}\n\n")

        return DocumentConverterResult(markdown="".join(sections).strip())
class XlsxConverter(ExcelConverterBase):
"""
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
"""
def accepts(
self,
file_stream: BinaryIO,
@ -80,30 +133,19 @@ class XlsxConverter(DocumentConverter):
_xlsx_dependency_exc_info[2]
)
sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
md_content = ""
for s in sheets:
md_content += f"## {s}\n"
html_content = sheets[s].to_html(index=False)
md_content += (
self._html_converter.convert_string(
html_content, **kwargs
).markdown.strip()
+ "\n\n"
)
return DocumentConverterResult(markdown=md_content.strip())
return self._convert_excel(
file_stream=file_stream,
stream_info=stream_info,
engine="openpyxl",
**kwargs,
)
class XlsConverter(DocumentConverter):
class XlsConverter(ExcelConverterBase):
"""
Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
"""
def __init__(self):
super().__init__()
self._html_converter = HtmlConverter()
def accepts(
self,
file_stream: BinaryIO,
@ -142,16 +184,9 @@ class XlsConverter(DocumentConverter):
_xls_dependency_exc_info[2]
)
sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd")
md_content = ""
for s in sheets:
md_content += f"## {s}\n"
html_content = sheets[s].to_html(index=False)
md_content += (
self._html_converter.convert_string(
html_content, **kwargs
).markdown.strip()
+ "\n\n"
)
return DocumentConverterResult(markdown=md_content.strip())
return self._convert_excel(
file_stream=file_stream,
stream_info=stream_info,
engine="xlrd",
**kwargs,
)

View file

@ -41,7 +41,7 @@ GENERAL_TEST_VECTORS = [
"6ff4173b-42a5-4784-9b19-f49caff4d93d",
"affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
],
must_not_include=[],
must_not_include=["Unnamed:", "NaN"],
),
FileTestVector(
filename="test.xls",
@ -53,7 +53,7 @@ GENERAL_TEST_VECTORS = [
"6ff4173b-42a5-4784-9b19-f49caff4d93d",
"affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
],
must_not_include=[],
must_not_include=["Unnamed:", "NaN"],
),
FileTestVector(
filename="test.pptx",

Binary file not shown.