diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index 54a0dc8..682902b 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -41,6 +41,7 @@ from .converters import ( ZipConverter, EpubConverter, DocumentIntelligenceConverter, + CsvConverter, ) from ._base_converter import DocumentConverter, DocumentConverterResult @@ -194,6 +195,7 @@ class MarkItDown: self.register_converter(PdfConverter()) self.register_converter(OutlookMsgConverter()) self.register_converter(EpubConverter()) + self.register_converter(CsvConverter()) # Register Document Intelligence converter at the top of the stack if endpoint is provided docintel_endpoint = kwargs.get("docintel_endpoint") diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py index c68d0c3..e4437a5 100644 --- a/packages/markitdown/src/markitdown/converters/__init__.py +++ b/packages/markitdown/src/markitdown/converters/__init__.py @@ -22,6 +22,7 @@ from ._doc_intel_converter import ( DocumentIntelligenceFileType, ) from ._epub_converter import EpubConverter +from ._csv_converter import CsvConverter __all__ = [ "PlainTextConverter", @@ -43,4 +44,5 @@ __all__ = [ "DocumentIntelligenceConverter", "DocumentIntelligenceFileType", "EpubConverter", + "CsvConverter", ] diff --git a/packages/markitdown/src/markitdown/converters/_csv_converter.py b/packages/markitdown/src/markitdown/converters/_csv_converter.py new file mode 100644 index 0000000..2064674 --- /dev/null +++ b/packages/markitdown/src/markitdown/converters/_csv_converter.py @@ -0,0 +1,73 @@ +import sys +import csv +import io +from typing import BinaryIO, Any +from ._html_converter import HtmlConverter +from .._base_converter import DocumentConverter, DocumentConverterResult +from .._stream_info import StreamInfo + +ACCEPTED_MIME_TYPE_PREFIXES = [ + "text/csv", + "application/csv", +] +ACCEPTED_FILE_EXTENSIONS = [".csv"] + +class CsvConverter(DocumentConverter): + """ + Converts CSV files to Markdown tables. + """ + def __init__(self): + super().__init__() + + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + if extension in ACCEPTED_FILE_EXTENSIONS: + return True + for prefix in ACCEPTED_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: + # Read the file content + content = file_stream.read().decode('utf-8', errors='replace') + + # Parse CSV content + reader = csv.reader(io.StringIO(content)) + rows = list(reader) + + if not rows: + return DocumentConverterResult(markdown="") + + # Create markdown table + markdown_table = [] + + # Add header row + markdown_table.append("| " + " | ".join(rows[0]) + " |") + + # Add separator row + markdown_table.append("| " + " | ".join(["---"] * len(rows[0])) + " |") + + # Add data rows + for row in rows[1:]: + # Make sure row has the same number of columns as header + while len(row) < len(rows[0]): + row.append("") + # Truncate if row has more columns than header + row = row[:len(rows[0])] + markdown_table.append("| " + " | ".join(row) + " |") + + result = "\n".join(markdown_table) + + return DocumentConverterResult(markdown=result) diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py index 5f4069b..15daa42 100644 --- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py +++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py @@ -1,8 +1,7 @@ import sys import re import os - -from typing import BinaryIO, Any, List +from typing import BinaryIO, Any, List, Optional, Union from enum import Enum from ._html_converter import HtmlConverter @@ -25,6 +24,14 @@ try: except ImportError: # Preserve the error and stack trace for later _dependency_exc_info = sys.exc_info() + # Define these types for type hinting when the package is not available + class AzureKeyCredential: pass + class TokenCredential: pass + class DocumentIntelligenceClient: pass + class AnalyzeDocumentRequest: pass + class AnalyzeResult: pass + class DocumentAnalysisFeature: pass + class DefaultAzureCredential: pass # TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.