Fixed format errors, tests, and read charset when available.

This commit is contained in:
Adam Fourney 2025-04-13 09:10:50 -07:00
parent 44a77509f3
commit 81f8a69a75
2 changed files with 37 additions and 17 deletions

View file

@ -2,6 +2,7 @@ import sys
import csv
import io
from typing import BinaryIO, Any
from charset_normalizer import from_bytes
from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
@ -12,10 +13,12 @@ ACCEPTED_MIME_TYPE_PREFIXES = [
]
ACCEPTED_FILE_EXTENSIONS = [".csv"]
class CsvConverter(DocumentConverter):
"""
Converts CSV files to Markdown tables.
"""
def __init__(self):
super().__init__()
@ -41,7 +44,10 @@ class CsvConverter(DocumentConverter):
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
# Read the file content
content = file_stream.read().decode('utf-8', errors='replace')
if stream_info.charset:
content = file_stream.read().decode(stream_info.charset)
else:
content = str(from_bytes(file_stream.read()).best())
# Parse CSV content
reader = csv.reader(io.StringIO(content))
@ -65,7 +71,7 @@ class CsvConverter(DocumentConverter):
while len(row) < len(rows[0]):
row.append("")
# Truncate if row has more columns than header
row = row[:len(rows[0])]
row = row[: len(rows[0])]
markdown_table.append("| " + " | ".join(row) + " |")
result = "\n".join(markdown_table)

View file

@ -24,14 +24,28 @@ try:
except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()
# Define these types for type hinting when the package is not available
class AzureKeyCredential: pass
class TokenCredential: pass
class DocumentIntelligenceClient: pass
class AnalyzeDocumentRequest: pass
class AnalyzeResult: pass
class DocumentAnalysisFeature: pass
class DefaultAzureCredential: pass
class AzureKeyCredential:
pass
class TokenCredential:
pass
class DocumentIntelligenceClient:
pass
class AnalyzeDocumentRequest:
pass
class AnalyzeResult:
pass
class DocumentAnalysisFeature:
pass
class DefaultAzureCredential:
pass
# TODO: The Document Intelligence SDK currently has a bug that prevents importing the "ContentFormat" enum.