chore: excel improvements

This commit is contained in:
Hew Li Yang 2024-12-16 15:27:03 +08:00
parent 81e3f24acd
commit 5de769f1bc
3 changed files with 26 additions and 4 deletions

View file

@ -80,9 +80,15 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
if href:
try:
parsed_url = urlparse(href) # type: ignore
if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]: # type: ignore
if parsed_url.scheme and parsed_url.scheme.lower() not in [
"http",
"https",
"file",
]: # type: ignore
return "%s%s%s" % (prefix, text, suffix)
href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path)))) # type: ignore
href = urlunparse(
parsed_url._replace(path=quote(unquote(parsed_url.path)))
) # type: ignore
except ValueError: # It's not clear if this ever gets thrown
return "%s%s%s" % (prefix, text, suffix)
@ -504,6 +510,11 @@ class XlsxConverter(HtmlConverter):
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
"""
def _clean_colname(self, colname: str | Any) -> str | Any:
if isinstance(colname, str) and colname.startswith("Unnamed:"):
return ""
return colname
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a XLSX
extension = kwargs.get("file_extension", "")
@ -514,7 +525,13 @@ class XlsxConverter(HtmlConverter):
md_content = ""
for s in sheets:
md_content += f"## {s}\n"
html_content = sheets[s].to_html(index=False)
sheet = sheets[s]
sheet.columns = list(map(self._clean_colname, sheet.columns))
html_content = (
sheet.dropna(how="all", axis=1)
.dropna(how="all", axis=0)
.to_html(index=False, na_rep="")
)
md_content += self._convert(html_content).text_content.strip() + "\n\n"
return DocumentConverterResult(
@ -629,7 +646,9 @@ class MediaConverter(DocumentConverter):
else:
try:
result = subprocess.run(
[exiftool, "-json", local_path], capture_output=True, text=True
[exiftool, "-json", local_path],
capture_output=True,
text=True,
).stdout
return json.loads(result)[0]
except Exception:

Binary file not shown.

View file

@ -120,9 +120,12 @@ def test_markitdown_local() -> None:
# Test XLSX processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
# Check assertions
for test_string in XLSX_TEST_STRINGS:
text_content = result.text_content.replace("\\", "")
assert test_string in text_content
# Check negations
assert "Unnamed:" not in text_content
# Test DOCX processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))