Fixed formatting errors and tests; the CSV converter now reads the stream's charset when available.

This commit is contained in:
Adam Fourney 2025-04-13 09:10:50 -07:00
parent 44a77509f3
commit 81f8a69a75
2 changed files with 37 additions and 17 deletions

View file

@@ -2,6 +2,7 @@ import sys
import csv import csv
import io import io
from typing import BinaryIO, Any from typing import BinaryIO, Any
from charset_normalizer import from_bytes
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo from .._stream_info import StreamInfo
@@ -12,10 +13,12 @@ ACCEPTED_MIME_TYPE_PREFIXES = [
] ]
ACCEPTED_FILE_EXTENSIONS = [".csv"] ACCEPTED_FILE_EXTENSIONS = [".csv"]
class CsvConverter(DocumentConverter): class CsvConverter(DocumentConverter):
""" """
Converts CSV files to Markdown tables. Converts CSV files to Markdown tables.
""" """
def __init__(self): def __init__(self):
super().__init__() super().__init__()
@@ -41,33 +44,36 @@ class CsvConverter(DocumentConverter):
**kwargs: Any, # Options to pass to the converter **kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult: ) -> DocumentConverterResult:
# Read the file content # Read the file content
content = file_stream.read().decode('utf-8', errors='replace') if stream_info.charset:
content = file_stream.read().decode(stream_info.charset)
else:
content = str(from_bytes(file_stream.read()).best())
# Parse CSV content # Parse CSV content
reader = csv.reader(io.StringIO(content)) reader = csv.reader(io.StringIO(content))
rows = list(reader) rows = list(reader)
if not rows: if not rows:
return DocumentConverterResult(markdown="") return DocumentConverterResult(markdown="")
# Create markdown table # Create markdown table
markdown_table = [] markdown_table = []
# Add header row # Add header row
markdown_table.append("| " + " | ".join(rows[0]) + " |") markdown_table.append("| " + " | ".join(rows[0]) + " |")
# Add separator row # Add separator row
markdown_table.append("| " + " | ".join(["---"] * len(rows[0])) + " |") markdown_table.append("| " + " | ".join(["---"] * len(rows[0])) + " |")
# Add data rows # Add data rows
for row in rows[1:]: for row in rows[1:]:
# Make sure row has the same number of columns as header # Make sure row has the same number of columns as header
while len(row) < len(rows[0]): while len(row) < len(rows[0]):
row.append("") row.append("")
# Truncate if row has more columns than header # Truncate if row has more columns than header
row = row[:len(rows[0])] row = row[: len(rows[0])]
markdown_table.append("| " + " | ".join(row) + " |") markdown_table.append("| " + " | ".join(row) + " |")
result = "\n".join(markdown_table) result = "\n".join(markdown_table)
return DocumentConverterResult(markdown=result) return DocumentConverterResult(markdown=result)

View file

@@ -24,14 +24,28 @@ try:
except ImportError: except ImportError:
# Preserve the error and stack trace for later # Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info() _dependency_exc_info = sys.exc_info()
# Define these types for type hinting when the package is not available # Define these types for type hinting when the package is not available
class AzureKeyCredential: pass class AzureKeyCredential:
class TokenCredential: pass pass
class DocumentIntelligenceClient: pass
class AnalyzeDocumentRequest: pass class TokenCredential:
class AnalyzeResult: pass pass
class DocumentAnalysisFeature: pass
class DefaultAzureCredential: pass class DocumentIntelligenceClient:
pass
class AnalyzeDocumentRequest:
pass
class AnalyzeResult:
pass
class DocumentAnalysisFeature:
pass
class DefaultAzureCredential:
pass
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum. # TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.