Merge b1748afa4d into 041be54471
This commit is contained in:
commit
6fdb65a1ab
3 changed files with 72 additions and 37 deletions
|
|
@ -33,15 +33,68 @@ ACCEPTED_XLS_MIME_TYPE_PREFIXES = [
|
|||
ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]
|
||||
|
||||
|
||||
class XlsxConverter(DocumentConverter):
|
||||
"""
|
||||
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
|
||||
"""
|
||||
class ExcelConverterBase(DocumentConverter):
    """Shared implementation for converters that read Excel-like workbooks.

    Holds an ``HtmlConverter`` used to turn each sheet's HTML table into
    Markdown, and provides ``_convert_excel`` for subclasses to call with
    the pandas engine of their choice.
    """

    def __init__(self):
        super().__init__()
        self._html_converter = HtmlConverter()

    def _clean_colname(self, colname: Any) -> Any:
        """Return ``None`` for pandas ``Unnamed:`` header placeholders.

        Any other column label (including non-string labels) is returned
        unchanged.
        """
        is_placeholder = isinstance(colname, str) and colname.startswith("Unnamed:")
        return None if is_placeholder else colname

    def _convert_excel(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        engine: str,
        na_rep: Any = "",
        remove_header_placeholders: bool = True,
        drop_empty_cols: bool = False,
        drop_empty_rows: bool = False,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Render every sheet of a workbook as a Markdown section with a table.

        Args:
            file_stream: Binary stream handed to ``pd.read_excel``.
            stream_info: Stream metadata; not read by this method.
            engine: Engine name forwarded to ``pd.read_excel``.
            na_rep: Replacement for missing cells and missing header labels.
            remove_header_placeholders: When true, ``Unnamed:`` column labels
                are replaced via ``_clean_colname`` before rendering.
            drop_empty_cols: When true, drop columns that have neither a
                header label nor any non-missing cell.
            drop_empty_rows: When true, drop rows whose cells are all missing.
            **kwargs: Forwarded to ``HtmlConverter.convert_string``.

        Returns:
            A ``DocumentConverterResult`` whose markdown contains one
            ``## <sheet name>`` heading followed by the sheet's table, for
            each sheet in the workbook.
        """
        workbook = pd.read_excel(file_stream, sheet_name=None, engine=engine)

        sections = []
        for title, frame in workbook.items():
            if remove_header_placeholders:
                frame = frame.rename(columns=self._clean_colname)

            if drop_empty_cols:
                # A column survives if it holds data OR carries a header label.
                has_data = frame.notna().any()
                has_header = frame.columns.notna()
                frame = frame.loc[:, has_data | has_header]

            if drop_empty_rows:
                frame = frame.dropna(axis=0, how="all")

            # Replace every missing value (anything `pd.isna` flags) with
            # `na_rep` up front rather than relying on `.to_html(na_rep=...)`,
            # which does not replace NaT's:
            # https://github.com/pandas-dev/pandas/issues/11953
            with pd.option_context("future.no_silent_downcasting", True):
                frame = frame.fillna(na_rep, axis=1).infer_objects(copy=False)
                frame.columns = frame.columns.fillna(na_rep).infer_objects(copy=False)

            table_html = frame.to_html(index=False)
            table_md = self._html_converter.convert_string(
                table_html, **kwargs
            ).markdown.strip()
            sections.append(f"## {title}\n" + table_md + "\n\n")

        return DocumentConverterResult(markdown="".join(sections).strip())
|
||||
|
||||
|
||||
class XlsxConverter(ExcelConverterBase):
|
||||
"""
|
||||
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
|
||||
"""
|
||||
|
||||
def accepts(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
|
|
@ -80,30 +133,19 @@ class XlsxConverter(DocumentConverter):
|
|||
_xlsx_dependency_exc_info[2]
|
||||
)
|
||||
|
||||
sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
|
||||
md_content = ""
|
||||
for s in sheets:
|
||||
md_content += f"## {s}\n"
|
||||
html_content = sheets[s].to_html(index=False)
|
||||
md_content += (
|
||||
self._html_converter.convert_string(
|
||||
html_content, **kwargs
|
||||
).markdown.strip()
|
||||
+ "\n\n"
|
||||
)
|
||||
|
||||
return DocumentConverterResult(markdown=md_content.strip())
|
||||
return self._convert_excel(
|
||||
file_stream=file_stream,
|
||||
stream_info=stream_info,
|
||||
engine="openpyxl",
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
class XlsConverter(DocumentConverter):
|
||||
class XlsConverter(ExcelConverterBase):
|
||||
"""
|
||||
Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self._html_converter = HtmlConverter()
|
||||
|
||||
def accepts(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
|
|
@ -142,16 +184,9 @@ class XlsConverter(DocumentConverter):
|
|||
_xls_dependency_exc_info[2]
|
||||
)
|
||||
|
||||
sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd")
|
||||
md_content = ""
|
||||
for s in sheets:
|
||||
md_content += f"## {s}\n"
|
||||
html_content = sheets[s].to_html(index=False)
|
||||
md_content += (
|
||||
self._html_converter.convert_string(
|
||||
html_content, **kwargs
|
||||
).markdown.strip()
|
||||
+ "\n\n"
|
||||
)
|
||||
|
||||
return DocumentConverterResult(markdown=md_content.strip())
|
||||
return self._convert_excel(
|
||||
file_stream=file_stream,
|
||||
stream_info=stream_info,
|
||||
engine="xlrd",
|
||||
**kwargs,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -41,7 +41,7 @@ GENERAL_TEST_VECTORS = [
|
|||
"6ff4173b-42a5-4784-9b19-f49caff4d93d",
|
||||
"affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
|
||||
],
|
||||
must_not_include=[],
|
||||
must_not_include=["Unnamed:", "NaN"],
|
||||
),
|
||||
FileTestVector(
|
||||
filename="test.xls",
|
||||
|
|
@ -53,7 +53,7 @@ GENERAL_TEST_VECTORS = [
|
|||
"6ff4173b-42a5-4784-9b19-f49caff4d93d",
|
||||
"affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
|
||||
],
|
||||
must_not_include=[],
|
||||
must_not_include=["Unnamed:", "NaN"],
|
||||
),
|
||||
FileTestVector(
|
||||
filename="test.pptx",
|
||||
|
|
|
|||
BIN
packages/markitdown/tests/test_files/test.xlsx
vendored
BIN
packages/markitdown/tests/test_files/test.xlsx
vendored
Binary file not shown.
Loading…
Reference in a new issue