Added more fine-grained options over types.
This commit is contained in:
parent
f8a974f8a1
commit
d4f0e7f662
2 changed files with 104 additions and 34 deletions
|
|
@ -17,7 +17,10 @@ from ._image_converter import ImageConverter
|
||||||
from ._audio_converter import AudioConverter
|
from ._audio_converter import AudioConverter
|
||||||
from ._outlook_msg_converter import OutlookMsgConverter
|
from ._outlook_msg_converter import OutlookMsgConverter
|
||||||
from ._zip_converter import ZipConverter
|
from ._zip_converter import ZipConverter
|
||||||
from ._doc_intel_converter import DocumentIntelligenceConverter
|
from ._doc_intel_converter import (
|
||||||
|
DocumentIntelligenceConverter,
|
||||||
|
DocumentIntelligenceFileType,
|
||||||
|
)
|
||||||
from ._epub_converter import EpubConverter
|
from ._epub_converter import EpubConverter
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
|
|
@ -38,5 +41,6 @@ __all__ = [
|
||||||
"OutlookMsgConverter",
|
"OutlookMsgConverter",
|
||||||
"ZipConverter",
|
"ZipConverter",
|
||||||
"DocumentIntelligenceConverter",
|
"DocumentIntelligenceConverter",
|
||||||
|
"DocumentIntelligenceFileType",
|
||||||
"EpubConverter",
|
"EpubConverter",
|
||||||
]
|
]
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@ import re
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from typing import BinaryIO, Any, List
|
from typing import BinaryIO, Any, List
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
|
|
@ -31,38 +32,74 @@ except ImportError:
|
||||||
CONTENT_FORMAT = "markdown"
|
CONTENT_FORMAT = "markdown"
|
||||||
|
|
||||||
|
|
||||||
OFFICE_MIME_TYPE_PREFIXES = [
|
class DocumentIntelligenceFileType(str, Enum):
|
||||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
"""Enum of file types supported by the Document Intelligence Converter."""
|
||||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
||||||
"application/vnd.openxmlformats-officedocument.presentationml",
|
|
||||||
"application/xhtml",
|
|
||||||
"text/html",
|
|
||||||
]
|
|
||||||
|
|
||||||
OTHER_MIME_TYPE_PREFIXES = [
|
# No OCR
|
||||||
"application/pdf",
|
DOCX = "docx"
|
||||||
"application/x-pdf",
|
PPTX = "pptx"
|
||||||
"text/html",
|
XLSX = "xlsx"
|
||||||
"image/",
|
HTML = "html"
|
||||||
]
|
# OCR
|
||||||
|
PDF = "pdf"
|
||||||
|
JPEG = "jpeg"
|
||||||
|
PNG = "png"
|
||||||
|
BMP = "bmp"
|
||||||
|
TIFF = "tiff"
|
||||||
|
|
||||||
OFFICE_FILE_EXTENSIONS = [
|
|
||||||
".docx",
|
|
||||||
".xlsx",
|
|
||||||
".pptx",
|
|
||||||
".html",
|
|
||||||
".htm",
|
|
||||||
]
|
|
||||||
|
|
||||||
OTHER_FILE_EXTENSIONS = [
|
def _get_mime_type_prefixes(types: List[DocumentIntelligenceFileType]) -> List[str]:
|
||||||
".pdf",
|
"""Get the MIME type prefixes for the given file types."""
|
||||||
".jpeg",
|
prefixes: List[str] = []
|
||||||
".jpg",
|
for type_ in types:
|
||||||
".png",
|
if type_ == DocumentIntelligenceFileType.DOCX:
|
||||||
".bmp",
|
prefixes.append(
|
||||||
".tiff",
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||||
".heif",
|
)
|
||||||
]
|
elif type_ == DocumentIntelligenceFileType.PPTX:
|
||||||
|
prefixes.append(
|
||||||
|
"application/vnd.openxmlformats-officedocument.presentationml"
|
||||||
|
)
|
||||||
|
elif type_ == DocumentIntelligenceFileType.XLSX:
|
||||||
|
prefixes.append(
|
||||||
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||||
|
)
|
||||||
|
elif type_ == DocumentIntelligenceFileType.PDF:
|
||||||
|
prefixes.append("application/pdf")
|
||||||
|
prefixes.append("application/x-pdf")
|
||||||
|
elif type_ == DocumentIntelligenceFileType.JPEG:
|
||||||
|
prefixes.append("image/jpeg")
|
||||||
|
elif type_ == DocumentIntelligenceFileType.PNG:
|
||||||
|
prefixes.append("image/png")
|
||||||
|
elif type_ == DocumentIntelligenceFileType.BMP:
|
||||||
|
prefixes.append("image/bmp")
|
||||||
|
elif type_ == DocumentIntelligenceFileType.TIFF:
|
||||||
|
prefixes.append("image/tiff")
|
||||||
|
return prefixes
|
||||||
|
|
||||||
|
|
||||||
|
def _get_file_extensions(types: List[DocumentIntelligenceFileType]) -> List[str]:
|
||||||
|
"""Get the file extensions for the given file types."""
|
||||||
|
extensions: List[str] = []
|
||||||
|
for type_ in types:
|
||||||
|
if type_ == DocumentIntelligenceFileType.DOCX:
|
||||||
|
extensions.append(".docx")
|
||||||
|
elif type_ == DocumentIntelligenceFileType.PPTX:
|
||||||
|
extensions.append(".pptx")
|
||||||
|
elif type_ == DocumentIntelligenceFileType.XLSX:
|
||||||
|
extensions.append(".xlsx")
|
||||||
|
elif type_ == DocumentIntelligenceFileType.PDF:
|
||||||
|
extensions.append(".pdf")
|
||||||
|
elif type_ == DocumentIntelligenceFileType.JPEG:
|
||||||
|
extensions.append(".jpg")
|
||||||
|
extensions.append(".jpeg")
|
||||||
|
elif type_ == DocumentIntelligenceFileType.PNG:
|
||||||
|
extensions.append(".png")
|
||||||
|
elif type_ == DocumentIntelligenceFileType.BMP:
|
||||||
|
extensions.append(".bmp")
|
||||||
|
elif type_ == DocumentIntelligenceFileType.TIFF:
|
||||||
|
extensions.append(".tiff")
|
||||||
|
return extensions
|
||||||
|
|
||||||
|
|
||||||
class DocumentIntelligenceConverter(DocumentConverter):
|
class DocumentIntelligenceConverter(DocumentConverter):
|
||||||
|
|
@ -74,8 +111,29 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
||||||
endpoint: str,
|
endpoint: str,
|
||||||
api_version: str = "2024-07-31-preview",
|
api_version: str = "2024-07-31-preview",
|
||||||
credential: AzureKeyCredential | TokenCredential | None = None,
|
credential: AzureKeyCredential | TokenCredential | None = None,
|
||||||
|
file_types: List[DocumentIntelligenceFileType] = [
|
||||||
|
DocumentIntelligenceFileType.DOCX,
|
||||||
|
DocumentIntelligenceFileType.PPTX,
|
||||||
|
DocumentIntelligenceFileType.XLSX,
|
||||||
|
DocumentIntelligenceFileType.PDF,
|
||||||
|
DocumentIntelligenceFileType.JPEG,
|
||||||
|
DocumentIntelligenceFileType.PNG,
|
||||||
|
DocumentIntelligenceFileType.BMP,
|
||||||
|
DocumentIntelligenceFileType.TIFF,
|
||||||
|
],
|
||||||
):
|
):
|
||||||
|
"""
|
||||||
|
Initialize the DocumentIntelligenceConverter.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
endpoint (str): The endpoint for the Document Intelligence service.
|
||||||
|
api_version (str): The API version to use. Defaults to "2024-07-31-preview".
|
||||||
|
credential (AzureKeyCredential | TokenCredential | None): The credential to use for authentication.
|
||||||
|
file_types (List[DocumentIntelligenceFileType]): The file types to accept. Defaults to all supported file types.
|
||||||
|
"""
|
||||||
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
self._file_types = file_types
|
||||||
|
|
||||||
# Raise an error if the dependencies are not available.
|
# Raise an error if the dependencies are not available.
|
||||||
# This is different than other converters since this one isn't even instantiated
|
# This is different than other converters since this one isn't even instantiated
|
||||||
|
|
@ -112,10 +170,10 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
||||||
mimetype = (stream_info.mimetype or "").lower()
|
mimetype = (stream_info.mimetype or "").lower()
|
||||||
extension = (stream_info.extension or "").lower()
|
extension = (stream_info.extension or "").lower()
|
||||||
|
|
||||||
if extension in OFFICE_FILE_EXTENSIONS + OTHER_FILE_EXTENSIONS:
|
if extension in _get_file_extensions(self._file_types):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
for prefix in OFFICE_MIME_TYPE_PREFIXES + OTHER_MIME_TYPE_PREFIXES:
|
for prefix in _get_mime_type_prefixes(self._file_types):
|
||||||
if mimetype.startswith(prefix):
|
if mimetype.startswith(prefix):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
@ -130,10 +188,18 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
||||||
mimetype = (stream_info.mimetype or "").lower()
|
mimetype = (stream_info.mimetype or "").lower()
|
||||||
extension = (stream_info.extension or "").lower()
|
extension = (stream_info.extension or "").lower()
|
||||||
|
|
||||||
if extension in OFFICE_FILE_EXTENSIONS:
|
# Types that don't support ocr
|
||||||
|
no_ocr_types = [
|
||||||
|
DocumentIntelligenceFileType.DOCX,
|
||||||
|
DocumentIntelligenceFileType.PPTX,
|
||||||
|
DocumentIntelligenceFileType.XLSX,
|
||||||
|
DocumentIntelligenceFileType.HTML,
|
||||||
|
]
|
||||||
|
|
||||||
|
if extension in _get_file_extensions(no_ocr_types):
|
||||||
return []
|
return []
|
||||||
|
|
||||||
for prefix in OFFICE_MIME_TYPE_PREFIXES:
|
for prefix in _get_mime_type_prefixes(no_ocr_types):
|
||||||
if mimetype.startswith(prefix):
|
if mimetype.startswith(prefix):
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue