Fixed format errors, tests, and read charset when available.
This commit is contained in:
parent
44a77509f3
commit
81f8a69a75
2 changed files with 37 additions and 17 deletions
|
|
@ -2,6 +2,7 @@ import sys
|
||||||
import csv
|
import csv
|
||||||
import io
|
import io
|
||||||
from typing import BinaryIO, Any
|
from typing import BinaryIO, Any
|
||||||
|
from charset_normalizer import from_bytes
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._stream_info import StreamInfo
|
from .._stream_info import StreamInfo
|
||||||
|
|
@ -12,10 +13,12 @@ ACCEPTED_MIME_TYPE_PREFIXES = [
|
||||||
]
|
]
|
||||||
ACCEPTED_FILE_EXTENSIONS = [".csv"]
|
ACCEPTED_FILE_EXTENSIONS = [".csv"]
|
||||||
|
|
||||||
|
|
||||||
class CsvConverter(DocumentConverter):
|
class CsvConverter(DocumentConverter):
|
||||||
"""
|
"""
|
||||||
Converts CSV files to Markdown tables.
|
Converts CSV files to Markdown tables.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
|
|
@ -41,33 +44,36 @@ class CsvConverter(DocumentConverter):
|
||||||
**kwargs: Any, # Options to pass to the converter
|
**kwargs: Any, # Options to pass to the converter
|
||||||
) -> DocumentConverterResult:
|
) -> DocumentConverterResult:
|
||||||
# Read the file content
|
# Read the file content
|
||||||
content = file_stream.read().decode('utf-8', errors='replace')
|
if stream_info.charset:
|
||||||
|
content = file_stream.read().decode(stream_info.charset)
|
||||||
|
else:
|
||||||
|
content = str(from_bytes(file_stream.read()).best())
|
||||||
|
|
||||||
# Parse CSV content
|
# Parse CSV content
|
||||||
reader = csv.reader(io.StringIO(content))
|
reader = csv.reader(io.StringIO(content))
|
||||||
rows = list(reader)
|
rows = list(reader)
|
||||||
|
|
||||||
if not rows:
|
if not rows:
|
||||||
return DocumentConverterResult(markdown="")
|
return DocumentConverterResult(markdown="")
|
||||||
|
|
||||||
# Create markdown table
|
# Create markdown table
|
||||||
markdown_table = []
|
markdown_table = []
|
||||||
|
|
||||||
# Add header row
|
# Add header row
|
||||||
markdown_table.append("| " + " | ".join(rows[0]) + " |")
|
markdown_table.append("| " + " | ".join(rows[0]) + " |")
|
||||||
|
|
||||||
# Add separator row
|
# Add separator row
|
||||||
markdown_table.append("| " + " | ".join(["---"] * len(rows[0])) + " |")
|
markdown_table.append("| " + " | ".join(["---"] * len(rows[0])) + " |")
|
||||||
|
|
||||||
# Add data rows
|
# Add data rows
|
||||||
for row in rows[1:]:
|
for row in rows[1:]:
|
||||||
# Make sure row has the same number of columns as header
|
# Make sure row has the same number of columns as header
|
||||||
while len(row) < len(rows[0]):
|
while len(row) < len(rows[0]):
|
||||||
row.append("")
|
row.append("")
|
||||||
# Truncate if row has more columns than header
|
# Truncate if row has more columns than header
|
||||||
row = row[:len(rows[0])]
|
row = row[: len(rows[0])]
|
||||||
markdown_table.append("| " + " | ".join(row) + " |")
|
markdown_table.append("| " + " | ".join(row) + " |")
|
||||||
|
|
||||||
result = "\n".join(markdown_table)
|
result = "\n".join(markdown_table)
|
||||||
|
|
||||||
return DocumentConverterResult(markdown=result)
|
return DocumentConverterResult(markdown=result)
|
||||||
|
|
|
||||||
|
|
@ -24,14 +24,28 @@ try:
|
||||||
except ImportError:
|
except ImportError:
|
||||||
# Preserve the error and stack trace for later
|
# Preserve the error and stack trace for later
|
||||||
_dependency_exc_info = sys.exc_info()
|
_dependency_exc_info = sys.exc_info()
|
||||||
|
|
||||||
# Define these types for type hinting when the package is not available
|
# Define these types for type hinting when the package is not available
|
||||||
class AzureKeyCredential: pass
|
class AzureKeyCredential:
|
||||||
class TokenCredential: pass
|
pass
|
||||||
class DocumentIntelligenceClient: pass
|
|
||||||
class AnalyzeDocumentRequest: pass
|
class TokenCredential:
|
||||||
class AnalyzeResult: pass
|
pass
|
||||||
class DocumentAnalysisFeature: pass
|
|
||||||
class DefaultAzureCredential: pass
|
class DocumentIntelligenceClient:
|
||||||
|
pass
|
||||||
|
|
||||||
|
class AnalyzeDocumentRequest:
|
||||||
|
pass
|
||||||
|
|
||||||
|
class AnalyzeResult:
|
||||||
|
pass
|
||||||
|
|
||||||
|
class DocumentAnalysisFeature:
|
||||||
|
pass
|
||||||
|
|
||||||
|
class DefaultAzureCredential:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
|
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue