feat(csv): add CSV conversion support
- Introduce a new CsvConverter for converting CSV files into Markdown tables.
- Register CsvConverter in MarkItDown's converter initialization in _markitdown.py.
- Update converters/__init__.py to export CsvConverter.
- Add _csv_converter.py to implement CSV conversion using pandas and openpyxl.
- Revise test vectors to expect the Markdown table format produced by CsvConverter.
- Adjust test_cli_vectors header for proper encoding handling.

This change enables CSV file support in MarkItDown while keeping existing features intact.
This commit is contained in:
parent
3fcd48cdfc
commit
a3cb24a536
4 changed files with 84 additions and 4 deletions
|
|
@ -32,6 +32,7 @@ from .converters import (
|
|||
BingSerpConverter,
|
||||
PdfConverter,
|
||||
DocxConverter,
|
||||
CsvConverter,
|
||||
XlsxConverter,
|
||||
XlsConverter,
|
||||
PptxConverter,
|
||||
|
|
@ -185,6 +186,7 @@ class MarkItDown:
|
|||
self.register_converter(YouTubeConverter())
|
||||
self.register_converter(BingSerpConverter())
|
||||
self.register_converter(DocxConverter())
|
||||
self.register_converter(CsvConverter())
|
||||
self.register_converter(XlsxConverter())
|
||||
self.register_converter(XlsConverter())
|
||||
self.register_converter(PptxConverter())
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@ from ._bing_serp_converter import BingSerpConverter
|
|||
from ._pdf_converter import PdfConverter
|
||||
from ._docx_converter import DocxConverter
|
||||
from ._xlsx_converter import XlsxConverter, XlsConverter
|
||||
from ._csv_converter import CsvConverter
|
||||
from ._pptx_converter import PptxConverter
|
||||
from ._image_converter import ImageConverter
|
||||
from ._audio_converter import AudioConverter
|
||||
|
|
@ -33,6 +34,7 @@ __all__ = [
|
|||
"BingSerpConverter",
|
||||
"PdfConverter",
|
||||
"DocxConverter",
|
||||
"CsvConverter",
|
||||
"XlsxConverter",
|
||||
"XlsConverter",
|
||||
"PptxConverter",
|
||||
|
|
|
|||
|
|
@ -0,0 +1,75 @@
|
|||
import sys
|
||||
from typing import BinaryIO, Any
|
||||
from ._html_converter import HtmlConverter
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
|
||||
# pandas is an optional dependency, but this converter requires it.
# If the import fails, keep the exception info instead of failing at import
# time, so the failure can be reported later.
_dependency_exc_info = None
try:
    import pandas as pd
except ImportError:
    _dependency_exc_info = sys.exc_info()


# MIME type prefixes and file extensions that identify CSV input.
ACCEPTED_CSV_MIME_TYPE_PREFIXES = [
    "text/csv",
    "application/csv",
]
ACCEPTED_CSV_FILE_EXTENSIONS = [".csv"]
|
||||
|
||||
class CsvConverter(DocumentConverter):
    """
    Converts CSV files to Markdown tables.

    The CSV is parsed with pandas, rendered to an HTML table, and that HTML
    is passed to HtmlConverter to produce the Markdown output.
    """

    def __init__(self):
        super().__init__()
        # Reused for every conversion to turn the intermediate HTML table
        # into Markdown.
        self._html_converter = HtmlConverter()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """
        Return True when the stream looks like CSV.

        A stream is accepted when its extension is listed in
        ACCEPTED_CSV_FILE_EXTENSIONS or its MIME type starts with one of
        ACCEPTED_CSV_MIME_TYPE_PREFIXES. Both comparisons are
        case-insensitive, and a missing value is treated as an empty string.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_CSV_FILE_EXTENSIONS:
            return True

        # str.startswith accepts a tuple of candidates, so no explicit loop
        # over the prefixes is needed.
        return mimetype.startswith(tuple(ACCEPTED_CSV_MIME_TYPE_PREFIXES))

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Convert a CSV stream into a Markdown table.

        The stream is decoded with ``stream_info.charset`` when one is set,
        otherwise as UTF-8. Cell values are kept as the text found in the
        file rather than being re-typed by pandas.

        Raises:
            MissingDependencyException: if pandas could not be imported.
        """
        # Check: the dependencies
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".csv",
                    feature="csv",
                )
            ) from _dependency_exc_info[
                1
            ].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
            )

        # Fall back to UTF-8 when the charset is missing or empty.
        encoding = stream_info.charset or "utf-8"

        try:
            # dtype=str and keep_default_na=False keep every cell as the
            # literal text from the file. With pandas' default inference,
            # leading zeros are dropped, integer columns containing gaps are
            # rendered as floats, and strings such as "NA" turn into NaN.
            content = pd.read_csv(
                file_stream,
                encoding=encoding,
                dtype=str,
                keep_default_na=False,
            )
        except pd.errors.EmptyDataError:
            # A file with no columns has no table to render.
            return DocumentConverterResult(markdown="")

        # na_rep="" renders any remaining missing value as a blank cell
        # instead of the text "NaN".
        html_table = content.to_html(index=False, na_rep="")
        md_content = self._html_converter.convert_string(
            html_table, **kwargs
        ).markdown.strip()
        return DocumentConverterResult(markdown=md_content)
|
||||
|
|
@ -144,10 +144,11 @@ GENERAL_TEST_VECTORS = [
|
|||
charset="cp932",
|
||||
url=None,
|
||||
must_include=[
|
||||
"名前,年齢,住所",
|
||||
"佐藤太郎,30,東京",
|
||||
"三木英子,25,大阪",
|
||||
"髙橋淳,35,名古屋",
|
||||
"| 名前 | 年齢 | 住所 |",
|
||||
"| --- | --- | --- |",
|
||||
"| 佐藤太郎 | 30 | 東京 |",
|
||||
"| 三木英子 | 25 | 大阪 |",
|
||||
"| 髙橋淳 | 35 | 名古屋 |",
|
||||
],
|
||||
must_not_include=[],
|
||||
),
|
||||
|
|
|
|||
Loading…
Reference in a new issue