diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 96997cf..daf1127 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -80,9 +80,15 @@ class _CustomMarkdownify(markdownify.MarkdownConverter): if href: try: parsed_url = urlparse(href) # type: ignore - if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]: # type: ignore + if parsed_url.scheme and parsed_url.scheme.lower() not in [ + "http", + "https", + "file", + ]: # type: ignore return "%s%s%s" % (prefix, text, suffix) - href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path)))) # type: ignore + href = urlunparse( + parsed_url._replace(path=quote(unquote(parsed_url.path))) + ) # type: ignore except ValueError: # It's not clear if this ever gets thrown return "%s%s%s" % (prefix, text, suffix) @@ -504,6 +510,11 @@ class XlsxConverter(HtmlConverter): Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table. """ + def _clean_colname(self, colname: str | Any) -> str | Any: + if isinstance(colname, str) and colname.startswith("Unnamed:"): + return "" + return colname + def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: # Bail if not a XLSX extension = kwargs.get("file_extension", "") @@ -514,7 +525,13 @@ class XlsxConverter(HtmlConverter): md_content = "" for s in sheets: md_content += f"## {s}\n" - html_content = sheets[s].to_html(index=False) + sheet = sheets[s] + sheet.columns = list(map(self._clean_colname, sheet.columns)) + html_content = ( + sheet.dropna(how="all", axis=1) + .dropna(how="all", axis=0) + .to_html(index=False, na_rep="") + ) md_content += self._convert(html_content).text_content.strip() + "\n\n" return DocumentConverterResult( @@ -629,7 +646,9 @@ class MediaConverter(DocumentConverter): else: try: result = subprocess.run( - [exiftool, "-json", local_path], capture_output=True, text=True + [exiftool, "-json", local_path], + capture_output=True, + text=True, ).stdout return json.loads(result)[0] except Exception: diff --git a/tests/test_files/test.xlsx b/tests/test_files/test.xlsx index 3a41e17..56ec497 100755 Binary files a/tests/test_files/test.xlsx and b/tests/test_files/test.xlsx differ diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 94fd886..edbeefe 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -120,9 +120,12 @@ def test_markitdown_local() -> None: # Test XLSX processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx")) + # Check assertions for test_string in XLSX_TEST_STRINGS: text_content = result.text_content.replace("\\", "") assert test_string in text_content + # Check negations + assert "Unnamed:" not in text_content # Test DOCX processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))