chore: excel improvements
This commit is contained in:
parent
81e3f24acd
commit
5de769f1bc
3 changed files with 26 additions and 4 deletions
|
|
@ -80,9 +80,15 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
|||
if href:
|
||||
try:
|
||||
parsed_url = urlparse(href) # type: ignore
|
||||
if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]: # type: ignore
|
||||
if parsed_url.scheme and parsed_url.scheme.lower() not in [
|
||||
"http",
|
||||
"https",
|
||||
"file",
|
||||
]: # type: ignore
|
||||
return "%s%s%s" % (prefix, text, suffix)
|
||||
href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path)))) # type: ignore
|
||||
href = urlunparse(
|
||||
parsed_url._replace(path=quote(unquote(parsed_url.path)))
|
||||
) # type: ignore
|
||||
except ValueError: # It's not clear if this ever gets thrown
|
||||
return "%s%s%s" % (prefix, text, suffix)
|
||||
|
||||
|
|
@ -504,6 +510,11 @@ class XlsxConverter(HtmlConverter):
|
|||
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
|
||||
"""
|
||||
|
||||
def _clean_colname(self, colname: str | Any) -> str | Any:
|
||||
if isinstance(colname, str) and colname.startswith("Unnamed:"):
|
||||
return ""
|
||||
return colname
|
||||
|
||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||
# Bail if not a XLSX
|
||||
extension = kwargs.get("file_extension", "")
|
||||
|
|
@ -514,7 +525,13 @@ class XlsxConverter(HtmlConverter):
|
|||
md_content = ""
|
||||
for s in sheets:
|
||||
md_content += f"## {s}\n"
|
||||
html_content = sheets[s].to_html(index=False)
|
||||
sheet = sheets[s]
|
||||
sheet.columns = list(map(self._clean_colname, sheet.columns))
|
||||
html_content = (
|
||||
sheet.dropna(how="all", axis=1)
|
||||
.dropna(how="all", axis=0)
|
||||
.to_html(index=False, na_rep="")
|
||||
)
|
||||
md_content += self._convert(html_content).text_content.strip() + "\n\n"
|
||||
|
||||
return DocumentConverterResult(
|
||||
|
|
@ -629,7 +646,9 @@ class MediaConverter(DocumentConverter):
|
|||
else:
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[exiftool, "-json", local_path], capture_output=True, text=True
|
||||
[exiftool, "-json", local_path],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
).stdout
|
||||
return json.loads(result)[0]
|
||||
except Exception:
|
||||
|
|
|
|||
Binary file not shown.
|
|
@ -120,9 +120,12 @@ def test_markitdown_local() -> None:
|
|||
|
||||
# Test XLSX processing
|
||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
|
||||
# Check assertions
|
||||
for test_string in XLSX_TEST_STRINGS:
|
||||
text_content = result.text_content.replace("\\", "")
|
||||
assert test_string in text_content
|
||||
# Check negations
|
||||
assert "Unnamed:" not in text_content
|
||||
|
||||
# Test DOCX processing
|
||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))
|
||||
|
|
|
|||
Loading…
Reference in a new issue