chore: excel improvements
This commit is contained in:
parent
81e3f24acd
commit
5de769f1bc
3 changed files with 26 additions and 4 deletions
|
|
@ -80,9 +80,15 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
||||||
if href:
|
if href:
|
||||||
try:
|
try:
|
||||||
parsed_url = urlparse(href) # type: ignore
|
parsed_url = urlparse(href) # type: ignore
|
||||||
if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]: # type: ignore
|
if parsed_url.scheme and parsed_url.scheme.lower() not in [
|
||||||
|
"http",
|
||||||
|
"https",
|
||||||
|
"file",
|
||||||
|
]: # type: ignore
|
||||||
return "%s%s%s" % (prefix, text, suffix)
|
return "%s%s%s" % (prefix, text, suffix)
|
||||||
href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path)))) # type: ignore
|
href = urlunparse(
|
||||||
|
parsed_url._replace(path=quote(unquote(parsed_url.path)))
|
||||||
|
) # type: ignore
|
||||||
except ValueError: # It's not clear if this ever gets thrown
|
except ValueError: # It's not clear if this ever gets thrown
|
||||||
return "%s%s%s" % (prefix, text, suffix)
|
return "%s%s%s" % (prefix, text, suffix)
|
||||||
|
|
||||||
|
|
@ -504,6 +510,11 @@ class XlsxConverter(HtmlConverter):
|
||||||
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
|
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def _clean_colname(self, colname: str | Any) -> str | Any:
|
||||||
|
if isinstance(colname, str) and colname.startswith("Unnamed:"):
|
||||||
|
return ""
|
||||||
|
return colname
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not a XLSX
|
# Bail if not a XLSX
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
|
|
@ -514,7 +525,13 @@ class XlsxConverter(HtmlConverter):
|
||||||
md_content = ""
|
md_content = ""
|
||||||
for s in sheets:
|
for s in sheets:
|
||||||
md_content += f"## {s}\n"
|
md_content += f"## {s}\n"
|
||||||
html_content = sheets[s].to_html(index=False)
|
sheet = sheets[s]
|
||||||
|
sheet.columns = list(map(self._clean_colname, sheet.columns))
|
||||||
|
html_content = (
|
||||||
|
sheet.dropna(how="all", axis=1)
|
||||||
|
.dropna(how="all", axis=0)
|
||||||
|
.to_html(index=False, na_rep="")
|
||||||
|
)
|
||||||
md_content += self._convert(html_content).text_content.strip() + "\n\n"
|
md_content += self._convert(html_content).text_content.strip() + "\n\n"
|
||||||
|
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
|
|
@ -629,7 +646,9 @@ class MediaConverter(DocumentConverter):
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
[exiftool, "-json", local_path], capture_output=True, text=True
|
[exiftool, "-json", local_path],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
).stdout
|
).stdout
|
||||||
return json.loads(result)[0]
|
return json.loads(result)[0]
|
||||||
except Exception:
|
except Exception:
|
||||||
|
|
|
||||||
Binary file not shown.
|
|
@ -120,9 +120,12 @@ def test_markitdown_local() -> None:
|
||||||
|
|
||||||
# Test XLSX processing
|
# Test XLSX processing
|
||||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
|
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
|
||||||
|
# Check assertions
|
||||||
for test_string in XLSX_TEST_STRINGS:
|
for test_string in XLSX_TEST_STRINGS:
|
||||||
text_content = result.text_content.replace("\\", "")
|
text_content = result.text_content.replace("\\", "")
|
||||||
assert test_string in text_content
|
assert test_string in text_content
|
||||||
|
# Check negations
|
||||||
|
assert "Unnamed:" not in text_content
|
||||||
|
|
||||||
# Test DOCX processing
|
# Test DOCX processing
|
||||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))
|
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue