Merge b1748afa4d into 041be54471
This commit is contained in:
commit
6fdb65a1ab
3 changed files with 72 additions and 37 deletions
|
|
@ -33,15 +33,68 @@ ACCEPTED_XLS_MIME_TYPE_PREFIXES = [
|
||||||
ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]
|
ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]
|
||||||
|
|
||||||
|
|
||||||
class XlsxConverter(DocumentConverter):
|
class ExcelConverterBase(DocumentConverter):
|
||||||
"""
|
"""Base class for Excel-like converters"""
|
||||||
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self._html_converter = HtmlConverter()
|
self._html_converter = HtmlConverter()
|
||||||
|
|
||||||
|
def _clean_colname(self, colname: Any) -> Any:
|
||||||
|
# Remove Pandas header placeholders
|
||||||
|
if isinstance(colname, str) and colname.startswith("Unnamed:"):
|
||||||
|
return None
|
||||||
|
return colname
|
||||||
|
|
||||||
|
def _convert_excel(
|
||||||
|
self,
|
||||||
|
file_stream: BinaryIO,
|
||||||
|
stream_info: StreamInfo,
|
||||||
|
engine: str,
|
||||||
|
na_rep: Any = "",
|
||||||
|
remove_header_placeholders: bool = True,
|
||||||
|
drop_empty_cols: bool = False,
|
||||||
|
drop_empty_rows: bool = False,
|
||||||
|
**kwargs: Any,
|
||||||
|
) -> DocumentConverterResult:
|
||||||
|
sheets = pd.read_excel(file_stream, sheet_name=None, engine=engine)
|
||||||
|
md_content = ""
|
||||||
|
for name, sheet in sheets.items():
|
||||||
|
md_content += f"## {name}\n"
|
||||||
|
|
||||||
|
if remove_header_placeholders:
|
||||||
|
sheet = sheet.rename(columns=lambda col: self._clean_colname(col))
|
||||||
|
|
||||||
|
if drop_empty_cols:
|
||||||
|
# Also consider headers to be part of the column
|
||||||
|
sheet = sheet.loc[:, sheet.notna().any() | sheet.columns.notna()]
|
||||||
|
|
||||||
|
if drop_empty_rows:
|
||||||
|
sheet = sheet.dropna(axis=0, how="all")
|
||||||
|
|
||||||
|
# Coerce any cell that evaluates to `pd.isna(c) == True` to `na_rep`
|
||||||
|
# More reliable than using `.to_html(na_rep=...)`: https://github.com/pandas-dev/pandas/issues/11953
|
||||||
|
# Because the latter does not replace NaT's
|
||||||
|
with pd.option_context("future.no_silent_downcasting", True):
|
||||||
|
sheet = sheet.fillna(na_rep, axis=1).infer_objects(copy=False)
|
||||||
|
sheet.columns = sheet.columns.fillna(na_rep).infer_objects(copy=False)
|
||||||
|
|
||||||
|
html_content = sheet.to_html(index=False)
|
||||||
|
md_content += (
|
||||||
|
self._html_converter.convert_string(
|
||||||
|
html_content, **kwargs
|
||||||
|
).markdown.strip()
|
||||||
|
+ "\n\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
return DocumentConverterResult(markdown=md_content.strip())
|
||||||
|
|
||||||
|
|
||||||
|
class XlsxConverter(ExcelConverterBase):
|
||||||
|
"""
|
||||||
|
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
|
||||||
|
"""
|
||||||
|
|
||||||
def accepts(
|
def accepts(
|
||||||
self,
|
self,
|
||||||
file_stream: BinaryIO,
|
file_stream: BinaryIO,
|
||||||
|
|
@ -80,30 +133,19 @@ class XlsxConverter(DocumentConverter):
|
||||||
_xlsx_dependency_exc_info[2]
|
_xlsx_dependency_exc_info[2]
|
||||||
)
|
)
|
||||||
|
|
||||||
sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
|
return self._convert_excel(
|
||||||
md_content = ""
|
file_stream=file_stream,
|
||||||
for s in sheets:
|
stream_info=stream_info,
|
||||||
md_content += f"## {s}\n"
|
engine="openpyxl",
|
||||||
html_content = sheets[s].to_html(index=False)
|
**kwargs,
|
||||||
md_content += (
|
|
||||||
self._html_converter.convert_string(
|
|
||||||
html_content, **kwargs
|
|
||||||
).markdown.strip()
|
|
||||||
+ "\n\n"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
return DocumentConverterResult(markdown=md_content.strip())
|
|
||||||
|
|
||||||
|
class XlsConverter(ExcelConverterBase):
|
||||||
class XlsConverter(DocumentConverter):
|
|
||||||
"""
|
"""
|
||||||
Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
|
Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
super().__init__()
|
|
||||||
self._html_converter = HtmlConverter()
|
|
||||||
|
|
||||||
def accepts(
|
def accepts(
|
||||||
self,
|
self,
|
||||||
file_stream: BinaryIO,
|
file_stream: BinaryIO,
|
||||||
|
|
@ -142,16 +184,9 @@ class XlsConverter(DocumentConverter):
|
||||||
_xls_dependency_exc_info[2]
|
_xls_dependency_exc_info[2]
|
||||||
)
|
)
|
||||||
|
|
||||||
sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd")
|
return self._convert_excel(
|
||||||
md_content = ""
|
file_stream=file_stream,
|
||||||
for s in sheets:
|
stream_info=stream_info,
|
||||||
md_content += f"## {s}\n"
|
engine="xlrd",
|
||||||
html_content = sheets[s].to_html(index=False)
|
**kwargs,
|
||||||
md_content += (
|
|
||||||
self._html_converter.convert_string(
|
|
||||||
html_content, **kwargs
|
|
||||||
).markdown.strip()
|
|
||||||
+ "\n\n"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
return DocumentConverterResult(markdown=md_content.strip())
|
|
||||||
|
|
|
||||||
|
|
@ -41,7 +41,7 @@ GENERAL_TEST_VECTORS = [
|
||||||
"6ff4173b-42a5-4784-9b19-f49caff4d93d",
|
"6ff4173b-42a5-4784-9b19-f49caff4d93d",
|
||||||
"affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
|
"affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
|
||||||
],
|
],
|
||||||
must_not_include=[],
|
must_not_include=["Unnamed:", "NaN"],
|
||||||
),
|
),
|
||||||
FileTestVector(
|
FileTestVector(
|
||||||
filename="test.xls",
|
filename="test.xls",
|
||||||
|
|
@ -53,7 +53,7 @@ GENERAL_TEST_VECTORS = [
|
||||||
"6ff4173b-42a5-4784-9b19-f49caff4d93d",
|
"6ff4173b-42a5-4784-9b19-f49caff4d93d",
|
||||||
"affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
|
"affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
|
||||||
],
|
],
|
||||||
must_not_include=[],
|
must_not_include=["Unnamed:", "NaN"],
|
||||||
),
|
),
|
||||||
FileTestVector(
|
FileTestVector(
|
||||||
filename="test.pptx",
|
filename="test.pptx",
|
||||||
|
|
|
||||||
BIN
packages/markitdown/tests/test_files/test.xlsx
vendored
BIN
packages/markitdown/tests/test_files/test.xlsx
vendored
Binary file not shown.
Loading…
Reference in a new issue