feat(csv): add CSV conversion support

- Introduce a new CsvConverter for converting CSV files into Markdown tables.
- Register CsvConverter in MarkItDown’s converter initialization in _markitdown.py.
- Update converters/__init__.py to export CsvConverter.
- Add _csv_converter.py to implement CSV conversion: the file is parsed with pandas and the resulting table is rendered to Markdown via the existing HtmlConverter.
- Revise test vectors to expect the Markdown table format produced by CsvConverter.
- Adjust test_cli_vectors header for proper encoding handling.

This change enables CSV file support in MarkItDown while keeping existing features intact.
This commit is contained in:
Benny Yen 2025-04-08 21:04:32 +08:00
parent 3fcd48cdfc
commit a3cb24a536
No known key found for this signature in database
GPG key ID: 3304E223FD5FEE40
4 changed files with 84 additions and 4 deletions

View file

@ -32,6 +32,7 @@ from .converters import (
BingSerpConverter, BingSerpConverter,
PdfConverter, PdfConverter,
DocxConverter, DocxConverter,
CsvConverter,
XlsxConverter, XlsxConverter,
XlsConverter, XlsConverter,
PptxConverter, PptxConverter,
@ -185,6 +186,7 @@ class MarkItDown:
self.register_converter(YouTubeConverter()) self.register_converter(YouTubeConverter())
self.register_converter(BingSerpConverter()) self.register_converter(BingSerpConverter())
self.register_converter(DocxConverter()) self.register_converter(DocxConverter())
self.register_converter(CsvConverter())
self.register_converter(XlsxConverter()) self.register_converter(XlsxConverter())
self.register_converter(XlsConverter()) self.register_converter(XlsConverter())
self.register_converter(PptxConverter()) self.register_converter(PptxConverter())

View file

@ -12,6 +12,7 @@ from ._bing_serp_converter import BingSerpConverter
from ._pdf_converter import PdfConverter from ._pdf_converter import PdfConverter
from ._docx_converter import DocxConverter from ._docx_converter import DocxConverter
from ._xlsx_converter import XlsxConverter, XlsConverter from ._xlsx_converter import XlsxConverter, XlsConverter
from ._csv_converter import CsvConverter
from ._pptx_converter import PptxConverter from ._pptx_converter import PptxConverter
from ._image_converter import ImageConverter from ._image_converter import ImageConverter
from ._audio_converter import AudioConverter from ._audio_converter import AudioConverter
@ -33,6 +34,7 @@ __all__ = [
"BingSerpConverter", "BingSerpConverter",
"PdfConverter", "PdfConverter",
"DocxConverter", "DocxConverter",
"CsvConverter",
"XlsxConverter", "XlsxConverter",
"XlsConverter", "XlsConverter",
"PptxConverter", "PptxConverter",

View file

@ -0,0 +1,75 @@
import sys
from typing import BinaryIO, Any
from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_dependency_exc_info = None
try:
import pandas as pd
except ImportError:
_dependency_exc_info = sys.exc_info()
# MIME type prefixes (compared against the lower-cased stream mimetype) that
# identify a stream as CSV.
ACCEPTED_CSV_MIME_TYPE_PREFIXES = ["text/csv", "application/csv"]

# File extensions (compared against the lower-cased stream extension) that
# identify a stream as CSV.
ACCEPTED_CSV_FILE_EXTENSIONS = [".csv"]
class CsvConverter(DocumentConverter):
    """
    Converts CSV files to Markdown tables.

    The CSV is parsed with pandas, rendered to an HTML table, and that HTML is
    handed to an ``HtmlConverter`` to produce the final Markdown.
    """

    def __init__(self):
        super().__init__()
        # Reused by every convert() call to turn the intermediate HTML table
        # into Markdown.
        self._html_converter = HtmlConverter()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """
        Return True when the stream looks like CSV.

        A stream is accepted if its extension is listed in
        ``ACCEPTED_CSV_FILE_EXTENSIONS`` or its mimetype starts with one of
        ``ACCEPTED_CSV_MIME_TYPE_PREFIXES``. Both comparisons are
        case-insensitive, and a missing extension/mimetype is treated as "".
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_CSV_FILE_EXTENSIONS:
            return True

        # str.startswith accepts a tuple of candidate prefixes.
        return mimetype.startswith(tuple(ACCEPTED_CSV_MIME_TYPE_PREFIXES))

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Convert a CSV stream into a Markdown table.

        The stream is decoded with ``stream_info.charset`` (UTF-8 when no
        charset is given). Extra keyword arguments are forwarded to
        ``HtmlConverter.convert_string``.

        Raises:
            MissingDependencyException: if pandas could not be imported.
        """
        # Check: the dependencies
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".csv",
                    feature="csv",
                )
            ) from _dependency_exc_info[
                1
            ].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
            )

        encoding = "utf-8" if stream_info.charset is None else stream_info.charset

        # Read every cell as text. Letting pandas infer dtypes would rewrite
        # the document's content: "007" -> 7, empty cells -> NaN, and integer
        # columns containing gaps -> floats such as "30.0".
        table = pd.read_csv(
            file_stream,
            encoding=encoding,
            dtype=str,
            keep_default_na=False,
        )

        # DataFrame.to_html honours the "display.max_colwidth" option
        # (default 50) and would cut long cells short with "..."; lift the
        # limit for this call only. na_rep="" keeps any remaining missing
        # value (e.g. from a short row) from being rendered as "NaN".
        with pd.option_context("display.max_colwidth", None):
            html_table = table.to_html(index=False, na_rep="")

        md_content = self._html_converter.convert_string(
            html_table, **kwargs
        ).markdown.strip()

        return DocumentConverterResult(markdown=md_content)

View file

@ -144,10 +144,11 @@ GENERAL_TEST_VECTORS = [
charset="cp932", charset="cp932",
url=None, url=None,
must_include=[ must_include=[
"名前,年齢,住所", "| 名前 | 年齢 | 住所 |",
"佐藤太郎,30,東京", "| --- | --- | --- |",
"三木英子,25,大阪", "| 佐藤太郎 | 30 | 東京 |",
"髙橋淳,35,名古屋", "| 三木英子 | 25 | 大阪 |",
"| 髙橋淳 | 35 | 名古屋 |",
], ],
must_not_include=[], must_not_include=[],
), ),