diff --git a/packages/markitdown/src/markitdown/converters/_csv_converter.py b/packages/markitdown/src/markitdown/converters/_csv_converter.py index 2064674..7162889 100644 --- a/packages/markitdown/src/markitdown/converters/_csv_converter.py +++ b/packages/markitdown/src/markitdown/converters/_csv_converter.py @@ -2,6 +2,7 @@ import sys import csv import io from typing import BinaryIO, Any +from charset_normalizer import from_bytes from ._html_converter import HtmlConverter from .._base_converter import DocumentConverter, DocumentConverterResult from .._stream_info import StreamInfo @@ -12,10 +13,12 @@ ACCEPTED_MIME_TYPE_PREFIXES = [ ] ACCEPTED_FILE_EXTENSIONS = [".csv"] + class CsvConverter(DocumentConverter): """ Converts CSV files to Markdown tables. """ + def __init__(self): super().__init__() @@ -41,33 +44,36 @@ class CsvConverter(DocumentConverter): **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: # Read the file content - content = file_stream.read().decode('utf-8', errors='replace') - + if stream_info.charset: + content = file_stream.read().decode(stream_info.charset) + else: + content = str(from_bytes(file_stream.read()).best()) + # Parse CSV content reader = csv.reader(io.StringIO(content)) rows = list(reader) - + if not rows: return DocumentConverterResult(markdown="") - + # Create markdown table markdown_table = [] - + # Add header row markdown_table.append("| " + " | ".join(rows[0]) + " |") - + # Add separator row markdown_table.append("| " + " | ".join(["---"] * len(rows[0])) + " |") - + # Add data rows for row in rows[1:]: # Make sure row has the same number of columns as header while len(row) < len(rows[0]): row.append("") # Truncate if row has more columns than header - row = row[:len(rows[0])] + row = row[: len(rows[0])] markdown_table.append("| " + " | ".join(row) + " |") - + result = "\n".join(markdown_table) - + return DocumentConverterResult(markdown=result) diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py index 15daa42..d2dce91 100644 --- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py +++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py @@ -24,14 +24,28 @@ try: except ImportError: # Preserve the error and stack trace for later _dependency_exc_info = sys.exc_info() + # Define these types for type hinting when the package is not available - class AzureKeyCredential: pass - class TokenCredential: pass - class DocumentIntelligenceClient: pass - class AnalyzeDocumentRequest: pass - class AnalyzeResult: pass - class DocumentAnalysisFeature: pass - class DefaultAzureCredential: pass + class AzureKeyCredential: + pass + + class TokenCredential: + pass + + class DocumentIntelligenceClient: + pass + + class AnalyzeDocumentRequest: + pass + + class AnalyzeResult: + pass + + class DocumentAnalysisFeature: + pass + + class DefaultAzureCredential: + pass # TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.