Fixed formatting errors and tests; the CSV converter now decodes with the stream's charset when available, falling back to charset detection otherwise.

2025-04-13 09:10:50 -07:00 · 2025-04-13 09:10:50 -07:00 · 81f8a69a75
commit 81f8a69a75
parent 44a77509f3
2 changed files with 37 additions and 17 deletions
--- a/packages/markitdown/src/markitdown/converters/_csv_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_csv_converter.py
@ -2,6 +2,7 @@ import sys
 import csv
 import io
 from typing import BinaryIO, Any
 from charset_normalizer import from_bytes
 from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
@ -12,10 +13,12 @@ ACCEPTED_MIME_TYPE_PREFIXES = [
 ]
 ACCEPTED_FILE_EXTENSIONS = [".csv"]
 class CsvConverter(DocumentConverter):
    """
    Converts CSV files to Markdown tables.
    """
    def __init__(self):
        super().__init__()
@ -41,33 +44,36 @@ class CsvConverter(DocumentConverter):
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        # Read the file content
-        content = file_stream.read().decode('utf-8', errors='replace')
+        if stream_info.charset:
-        
+            content = file_stream.read().decode(stream_info.charset)
        else:
            content = str(from_bytes(file_stream.read()).best())
        # Parse CSV content
        reader = csv.reader(io.StringIO(content))
        rows = list(reader)
-        
+
        if not rows:
            return DocumentConverterResult(markdown="")
-        
+
        # Create markdown table
        markdown_table = []
-        
+
        # Add header row
        markdown_table.append("| " + " | ".join(rows[0]) + " |")
-        
+
        # Add separator row
        markdown_table.append("| " + " | ".join(["---"] * len(rows[0])) + " |")
-        
+
        # Add data rows
        for row in rows[1:]:
            # Make sure row has the same number of columns as header
            while len(row) < len(rows[0]):
                row.append("")
            # Truncate if row has more columns than header
-            row = row[:len(rows[0])]
+            row = row[: len(rows[0])]
            markdown_table.append("| " + " | ".join(row) + " |")
-        
+
        result = "\n".join(markdown_table)
-        
+
        return DocumentConverterResult(markdown=result)
--- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
@ -24,14 +24,28 @@ try:
 except ImportError:
    # Preserve the error and stack trace for later
    _dependency_exc_info = sys.exc_info()
    # Define these types for type hinting when the package is not available
-    class AzureKeyCredential: pass
+    class AzureKeyCredential:
-    class TokenCredential: pass
+        pass
-    class DocumentIntelligenceClient: pass
+
-    class AnalyzeDocumentRequest: pass
+    class TokenCredential:
-    class AnalyzeResult: pass
+        pass
-    class DocumentAnalysisFeature: pass
+
-    class DefaultAzureCredential: pass
+    class DocumentIntelligenceClient:
        pass
    class AnalyzeDocumentRequest:
        pass
    class AnalyzeResult:
        pass
    class DocumentAnalysisFeature:
        pass
    class DefaultAzureCredential:
        pass
 # TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.