feat(csv): add CSV conversion support
- Introduce a new CsvConverter for converting CSV files into Markdown tables.
- Register CsvConverter in MarkItDown's converter initialization in _markitdown.py.
- Update converters/__init__.py to export CsvConverter.
- Add _csv_converter.py to implement CSV conversion using pandas.
- Revise test vectors to expect the Markdown table format produced by CsvConverter.
- Adjust the test_cli_vectors header for proper encoding handling.

This change enables CSV file support in MarkItDown while keeping existing features intact.
This commit is contained in:
parent
3fcd48cdfc
commit
a3cb24a536
4 changed files with 84 additions and 4 deletions
|
|
@ -32,6 +32,7 @@ from .converters import (
|
||||||
BingSerpConverter,
|
BingSerpConverter,
|
||||||
PdfConverter,
|
PdfConverter,
|
||||||
DocxConverter,
|
DocxConverter,
|
||||||
|
CsvConverter,
|
||||||
XlsxConverter,
|
XlsxConverter,
|
||||||
XlsConverter,
|
XlsConverter,
|
||||||
PptxConverter,
|
PptxConverter,
|
||||||
|
|
@ -185,6 +186,7 @@ class MarkItDown:
|
||||||
self.register_converter(YouTubeConverter())
|
self.register_converter(YouTubeConverter())
|
||||||
self.register_converter(BingSerpConverter())
|
self.register_converter(BingSerpConverter())
|
||||||
self.register_converter(DocxConverter())
|
self.register_converter(DocxConverter())
|
||||||
|
self.register_converter(CsvConverter())
|
||||||
self.register_converter(XlsxConverter())
|
self.register_converter(XlsxConverter())
|
||||||
self.register_converter(XlsConverter())
|
self.register_converter(XlsConverter())
|
||||||
self.register_converter(PptxConverter())
|
self.register_converter(PptxConverter())
|
||||||
|
|
|
||||||
|
|
@ -12,6 +12,7 @@ from ._bing_serp_converter import BingSerpConverter
|
||||||
from ._pdf_converter import PdfConverter
|
from ._pdf_converter import PdfConverter
|
||||||
from ._docx_converter import DocxConverter
|
from ._docx_converter import DocxConverter
|
||||||
from ._xlsx_converter import XlsxConverter, XlsConverter
|
from ._xlsx_converter import XlsxConverter, XlsConverter
|
||||||
|
from ._csv_converter import CsvConverter
|
||||||
from ._pptx_converter import PptxConverter
|
from ._pptx_converter import PptxConverter
|
||||||
from ._image_converter import ImageConverter
|
from ._image_converter import ImageConverter
|
||||||
from ._audio_converter import AudioConverter
|
from ._audio_converter import AudioConverter
|
||||||
|
|
@ -33,6 +34,7 @@ __all__ = [
|
||||||
"BingSerpConverter",
|
"BingSerpConverter",
|
||||||
"PdfConverter",
|
"PdfConverter",
|
||||||
"DocxConverter",
|
"DocxConverter",
|
||||||
|
"CsvConverter",
|
||||||
"XlsxConverter",
|
"XlsxConverter",
|
||||||
"XlsConverter",
|
"XlsConverter",
|
||||||
"PptxConverter",
|
"PptxConverter",
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,75 @@
|
||||||
|
import sys
|
||||||
|
from typing import BinaryIO, Any
|
||||||
|
from ._html_converter import HtmlConverter
|
||||||
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
|
from .._stream_info import StreamInfo
|
||||||
|
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||||
|
|
||||||
|
# Try loading optional (but in this case, required) dependencies
|
||||||
|
# Save reporting of any exceptions for later
|
||||||
|
_dependency_exc_info = None
|
||||||
|
try:
|
||||||
|
import pandas as pd
|
||||||
|
except ImportError:
|
||||||
|
_dependency_exc_info = sys.exc_info()
|
||||||
|
|
||||||
|
|
||||||
|
# MIME type prefixes that mark a stream as CSV. They are compared with
# str.startswith against the lower-cased MIME type of the stream.
ACCEPTED_CSV_MIME_TYPE_PREFIXES = [
    "text/csv",
    "application/csv",
]

# File extensions (with leading dot) that mark a stream as CSV. They are
# compared against the lower-cased extension of the stream.
ACCEPTED_CSV_FILE_EXTENSIONS = [".csv"]
|
||||||
|
|
||||||
|
class CsvConverter(DocumentConverter):
    """
    Converts CSV files to Markdown.

    The CSV is parsed with pandas, rendered to an HTML table, and that HTML is
    turned into a Markdown table by an ``HtmlConverter``. Cell text is kept as
    written in the file: no numeric inference and no ``NaN`` placeholders.
    """

    def __init__(self):
        super().__init__()
        # One converter instance is reused for every CSV handled by this object.
        self._html_converter = HtmlConverter()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """
        Return True when the stream looks like CSV.

        A stream is accepted if its extension is one of
        ``ACCEPTED_CSV_FILE_EXTENSIONS`` or its MIME type starts with one of
        ``ACCEPTED_CSV_MIME_TYPE_PREFIXES``. Both checks are case-insensitive,
        and a missing extension or MIME type is treated as an empty string.
        ``file_stream`` itself is not read.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_CSV_FILE_EXTENSIONS:
            return True

        return any(
            mimetype.startswith(prefix) for prefix in ACCEPTED_CSV_MIME_TYPE_PREFIXES
        )

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Convert a CSV stream into a Markdown table.

        Args:
            file_stream: Binary stream holding the CSV data.
            stream_info: Stream metadata; ``charset`` selects the text
                encoding, falling back to UTF-8 when it is missing or empty.
            **kwargs: Forwarded unchanged to ``HtmlConverter.convert_string``.

        Returns:
            A ``DocumentConverterResult`` whose ``markdown`` is the rendered
            table, or an empty string when the CSV contains no data at all.

        Raises:
            MissingDependencyException: If pandas could not be imported.
        """
        # Check: the dependencies
        if _dependency_exc_info is not None:
            # NOTE(review): feature="csv" presumably names an install extra;
            # confirm that such an extra is actually declared by the package.
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".csv",
                    feature="csv",
                )
            ) from _dependency_exc_info[
                1
            ].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
            )

        # `or` also covers an empty-string charset, which is not a valid codec name.
        encoding = stream_info.charset or "utf-8"

        try:
            # dtype=str + keep_default_na=False keep every cell exactly as
            # written: "007" stays "007", "1.50" stays "1.50", and strings
            # such as "NA" or "" are not turned into NaN.
            frame = pd.read_csv(
                file_stream,
                encoding=encoding,
                dtype=str,
                keep_default_na=False,
            )
        except pd.errors.EmptyDataError:
            # pandas raises this when there is nothing to parse; an empty
            # document is a more useful result than a conversion failure.
            return DocumentConverterResult(markdown="")

        # Rows shorter than the header can still yield missing values; render
        # them as empty cells rather than the text "NaN".
        frame = frame.fillna("")

        html_table = frame.to_html(index=False)
        md_content = self._html_converter.convert_string(
            html_table, **kwargs
        ).markdown.strip()
        return DocumentConverterResult(markdown=md_content)
|
||||||
|
|
@ -144,10 +144,11 @@ GENERAL_TEST_VECTORS = [
|
||||||
charset="cp932",
|
charset="cp932",
|
||||||
url=None,
|
url=None,
|
||||||
must_include=[
|
must_include=[
|
||||||
"名前,年齢,住所",
|
"| 名前 | 年齢 | 住所 |",
|
||||||
"佐藤太郎,30,東京",
|
"| --- | --- | --- |",
|
||||||
"三木英子,25,大阪",
|
"| 佐藤太郎 | 30 | 東京 |",
|
||||||
"髙橋淳,35,名古屋",
|
"| 三木英子 | 25 | 大阪 |",
|
||||||
|
"| 髙橋淳 | 35 | 名古屋 |",
|
||||||
],
|
],
|
||||||
must_not_include=[],
|
must_not_include=[],
|
||||||
),
|
),
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue