From a3cb24a536a4b7a0c227307c73120b8af98f87f7 Mon Sep 17 00:00:00 2001 From: Benny Yen Date: Tue, 8 Apr 2025 21:04:32 +0800 Subject: [PATCH] feat(csv): add CSV conversion support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Introduce a new CsvConverter for converting CSV files into Markdown tables. - Register CsvConverter in MarkItDown’s converter initialization in _markitdown.py. - Update converters/__init__.py to export CsvConverter. - Add _csv_converter.py to implement CSV conversion using pandas and openpyxl. - Revise test vectors to expect the Markdown table format produced by CsvConverter. - Adjust test_cli_vectors header for proper encoding handling. This change enables CSV file support in MarkItDown while keeping existing features intact. --- .../markitdown/src/markitdown/_markitdown.py | 2 + .../src/markitdown/converters/__init__.py | 2 + .../markitdown/converters/_csv_converter.py | 75 +++++++++++++++++++ packages/markitdown/tests/_test_vectors.py | 9 ++- 4 files changed, 84 insertions(+), 4 deletions(-) create mode 100644 packages/markitdown/src/markitdown/converters/_csv_converter.py diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index 54a0dc8..91f752e 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -32,6 +32,7 @@ from .converters import ( BingSerpConverter, PdfConverter, DocxConverter, + CsvConverter, XlsxConverter, XlsConverter, PptxConverter, @@ -185,6 +186,7 @@ class MarkItDown: self.register_converter(YouTubeConverter()) self.register_converter(BingSerpConverter()) self.register_converter(DocxConverter()) + self.register_converter(CsvConverter()) self.register_converter(XlsxConverter()) self.register_converter(XlsConverter()) self.register_converter(PptxConverter()) diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py index c68d0c3..c221e15 100644 --- a/packages/markitdown/src/markitdown/converters/__init__.py +++ b/packages/markitdown/src/markitdown/converters/__init__.py @@ -12,6 +12,7 @@ from ._bing_serp_converter import BingSerpConverter from ._pdf_converter import PdfConverter from ._docx_converter import DocxConverter from ._xlsx_converter import XlsxConverter, XlsConverter +from ._csv_converter import CsvConverter from ._pptx_converter import PptxConverter from ._image_converter import ImageConverter from ._audio_converter import AudioConverter @@ -33,6 +34,7 @@ __all__ = [ "BingSerpConverter", "PdfConverter", "DocxConverter", + "CsvConverter", "XlsxConverter", "XlsConverter", "PptxConverter", diff --git a/packages/markitdown/src/markitdown/converters/_csv_converter.py b/packages/markitdown/src/markitdown/converters/_csv_converter.py new file mode 100644 index 0000000..b051864 --- /dev/null +++ b/packages/markitdown/src/markitdown/converters/_csv_converter.py @@ -0,0 +1,75 @@ +import sys +from typing import BinaryIO, Any +from ._html_converter import HtmlConverter +from .._base_converter import DocumentConverter, DocumentConverterResult +from .._stream_info import StreamInfo +from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE + +# Try loading optional (but in this case, required) dependencies +# Save reporting of any exceptions for later +_dependency_exc_info = None +try: + import pandas as pd +except ImportError: + _dependency_exc_info = sys.exc_info() + + +ACCEPTED_CSV_MIME_TYPE_PREFIXES = [ + "text/csv", + "application/csv" +] +ACCEPTED_CSV_FILE_EXTENSIONS = [".csv"] + +class CsvConverter(DocumentConverter): + """ + Converts CSV files to Markdown. + """ + + def __init__(self): + super().__init__() + self._html_converter = HtmlConverter() + + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + if extension in ACCEPTED_CSV_FILE_EXTENSIONS: + return True + + for prefix in ACCEPTED_CSV_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ): + # Check: the dependencies + if _dependency_exc_info is not None: + raise MissingDependencyException( + MISSING_DEPENDENCY_MESSAGE.format( + converter=type(self).__name__, + extension=".csv", + feature="csv", + ) + ) from _dependency_exc_info[ + 1 + ].with_traceback( # type: ignore[union-attr] + _dependency_exc_info[2] + ) + + encoding = "utf-8" if stream_info.charset is None else stream_info.charset + content = pd.read_csv(file_stream, encoding=encoding) + md_content = self._html_converter.convert_string( + content.to_html(index=False), **kwargs + ).markdown.strip() + return DocumentConverterResult(markdown=md_content) diff --git a/packages/markitdown/tests/_test_vectors.py b/packages/markitdown/tests/_test_vectors.py index 4a7b54a..74fa9bd 100644 --- a/packages/markitdown/tests/_test_vectors.py +++ b/packages/markitdown/tests/_test_vectors.py @@ -144,10 +144,11 @@ GENERAL_TEST_VECTORS = [ charset="cp932", url=None, must_include=[ - "名前,年齢,住所", - "佐藤太郎,30,東京", - "三木英子,25,大阪", - "髙橋淳,35,名古屋", + "| 名前 | 年齢 | 住所 |", + "| --- | --- | --- |", + "| 佐藤太郎 | 30 | 東京 |", + "| 三木英子 | 25 | 大阪 |", + "| 髙橋淳 | 35 | 名古屋 |", ], must_not_include=[], ),