feat: Add CSV to Markdown table converter

- Add new CsvConverter class to convert CSV files to Markdown tables
- Support text/csv and application/csv MIME types
- Handle UTF-8 encoded files with error replacement
- Preserve table structure with headers and data rows
- Handle edge cases like empty cells and mismatched columns
- Fix Azure Document Intelligence dependency handling
- Register CsvConverter in MarkItDown class
2025-04-09 13:53:26 +05:00 · 2025-04-09 13:53:26 +05:00 · 44a77509f3
commit 44a77509f3
parent 3fcd48cdfc
4 changed files with 86 additions and 2 deletions
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -41,6 +41,7 @@ from .converters import (
    ZipConverter,
    EpubConverter,
    DocumentIntelligenceConverter,
+    CsvConverter,
 )

 from ._base_converter import DocumentConverter, DocumentConverterResult
@@ -194,6 +195,7 @@ class MarkItDown:
            self.register_converter(PdfConverter())
            self.register_converter(OutlookMsgConverter())
            self.register_converter(EpubConverter())
+            self.register_converter(CsvConverter())

            # Register Document Intelligence converter at the top of the stack if endpoint is provided
            docintel_endpoint = kwargs.get("docintel_endpoint")
--- a/packages/markitdown/src/markitdown/converters/__init__.py
+++ b/packages/markitdown/src/markitdown/converters/__init__.py
@@ -22,6 +22,7 @@ from ._doc_intel_converter import (
    DocumentIntelligenceFileType,
 )
 from ._epub_converter import EpubConverter
+from ._csv_converter import CsvConverter

 __all__ = [
    "PlainTextConverter",
@@ -43,4 +44,5 @@ __all__ = [
    "DocumentIntelligenceConverter",
    "DocumentIntelligenceFileType",
    "EpubConverter",
+    "CsvConverter",
 ]
--- a/packages/markitdown/src/markitdown/converters/_csv_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_csv_converter.py
@@ -0,0 +1,73 @@
+import sys
+import csv
+import io
+from typing import BinaryIO, Any
+from ._html_converter import HtmlConverter
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
+
+ACCEPTED_MIME_TYPE_PREFIXES = [
+    "text/csv",
+    "application/csv",
+]
+ACCEPTED_FILE_EXTENSIONS = [".csv"]
+
+class CsvConverter(DocumentConverter):
+    """
+    Converts CSV files to Markdown tables.
+
+    The first CSV row becomes the table header; every later row is padded or
+    truncated to the header's width. Pipes and line breaks inside a cell are
+    escaped so that cell content can never break the table layout.
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        """Return True for a ".csv" extension or a CSV MIME type."""
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
+        """Render the CSV stream as a Markdown table (empty if no rows)."""
+        # "utf-8-sig" drops a leading BOM (common in spreadsheet exports) so
+        # it cannot leak into the first header cell; bad bytes become U+FFFD.
+        content = file_stream.read().decode("utf-8-sig", errors="replace")
+        # newline="" leaves line-break handling to the csv module.
+        rows = list(csv.reader(io.StringIO(content, newline="")))
+        if not rows:
+            return DocumentConverterResult(markdown="")
+
+        def _cell(text: str) -> str:
+            # A raw "|" or line break would split the cell and corrupt the table.
+            text = text.replace("|", "\\|").replace("\r\n", "\n")
+            return text.replace("\r", "\n").replace("\n", "<br>")
+
+        def _line(cells: list) -> str:
+            return "| " + " | ".join(cells) + " |"
+
+        width = len(rows[0])
+        markdown_table = [
+            _line([_cell(c) for c in rows[0]]),
+            _line(["---"] * width),
+        ]
+        for row in rows[1:]:
+            # Pad short rows and truncate long ones to the header's width.
+            cells = (row + [""] * width)[:width]
+            markdown_table.append(_line([_cell(c) for c in cells]))
+
+        return DocumentConverterResult(markdown="\n".join(markdown_table))
--- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
@@ -1,8 +1,7 @@
 import sys
 import re
 import os
-
-from typing import BinaryIO, Any, List
+from typing import BinaryIO, Any, List, Optional, Union
 from enum import Enum

 from ._html_converter import HtmlConverter
@@ -25,6 +24,14 @@ try:
 except ImportError:
    # Preserve the error and stack trace for later
    _dependency_exc_info = sys.exc_info()
+    # Define these types for type hinting when the package is not available
+    class AzureKeyCredential: pass
+    class TokenCredential: pass
+    class DocumentIntelligenceClient: pass
+    class AnalyzeDocumentRequest: pass
+    class AnalyzeResult: pass
+    class DocumentAnalysisFeature: pass
+    class DefaultAzureCredential: pass


 # TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.