Fixed formatting errors and tests; the CSV converter now reads the charset from the stream info when available, and otherwise detects it.
This commit is contained in:
parent
44a77509f3
commit
81f8a69a75
2 changed files with 37 additions and 17 deletions
|
|
@ -2,6 +2,7 @@ import sys
|
|||
import csv
|
||||
import io
|
||||
from typing import BinaryIO, Any
|
||||
from charset_normalizer import from_bytes
|
||||
from ._html_converter import HtmlConverter
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
|
|
@ -12,10 +13,12 @@ ACCEPTED_MIME_TYPE_PREFIXES = [
|
|||
]
|
||||
ACCEPTED_FILE_EXTENSIONS = [".csv"]
|
||||
|
||||
|
||||
class CsvConverter(DocumentConverter):
|
||||
"""
|
||||
Converts CSV files to Markdown tables.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
|
|
@ -41,7 +44,10 @@ class CsvConverter(DocumentConverter):
|
|||
**kwargs: Any, # Options to pass to the converter
|
||||
) -> DocumentConverterResult:
|
||||
# Read the file content
|
||||
content = file_stream.read().decode('utf-8', errors='replace')
|
||||
if stream_info.charset:
|
||||
content = file_stream.read().decode(stream_info.charset)
|
||||
else:
|
||||
content = str(from_bytes(file_stream.read()).best())
|
||||
|
||||
# Parse CSV content
|
||||
reader = csv.reader(io.StringIO(content))
|
||||
|
|
|
|||
|
|
@ -24,14 +24,28 @@ try:
|
|||
except ImportError:
|
||||
# Preserve the error and stack trace for later
|
||||
_dependency_exc_info = sys.exc_info()
|
||||
|
||||
# Define these types for type hinting when the package is not available
|
||||
class AzureKeyCredential: pass
|
||||
class TokenCredential: pass
|
||||
class DocumentIntelligenceClient: pass
|
||||
class AnalyzeDocumentRequest: pass
|
||||
class AnalyzeResult: pass
|
||||
class DocumentAnalysisFeature: pass
|
||||
class DefaultAzureCredential: pass
|
||||
class AzureKeyCredential:
|
||||
pass
|
||||
|
||||
class TokenCredential:
|
||||
pass
|
||||
|
||||
class DocumentIntelligenceClient:
|
||||
pass
|
||||
|
||||
class AnalyzeDocumentRequest:
|
||||
pass
|
||||
|
||||
class AnalyzeResult:
|
||||
pass
|
||||
|
||||
class DocumentAnalysisFeature:
|
||||
pass
|
||||
|
||||
class DefaultAzureCredential:
|
||||
pass
|
||||
|
||||
|
||||
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
|
||||
|
|
|
|||
Loading…
Reference in a new issue