Added finer-grained control over which file types the Document Intelligence converter accepts.

This commit is contained in:
Adam Fourney 2025-03-24 14:02:55 -07:00
parent f8a974f8a1
commit d4f0e7f662
2 changed files with 104 additions and 34 deletions

View file

@ -17,7 +17,10 @@ from ._image_converter import ImageConverter
from ._audio_converter import AudioConverter from ._audio_converter import AudioConverter
from ._outlook_msg_converter import OutlookMsgConverter from ._outlook_msg_converter import OutlookMsgConverter
from ._zip_converter import ZipConverter from ._zip_converter import ZipConverter
from ._doc_intel_converter import DocumentIntelligenceConverter from ._doc_intel_converter import (
DocumentIntelligenceConverter,
DocumentIntelligenceFileType,
)
from ._epub_converter import EpubConverter from ._epub_converter import EpubConverter
__all__ = [ __all__ = [
@ -38,5 +41,6 @@ __all__ = [
"OutlookMsgConverter", "OutlookMsgConverter",
"ZipConverter", "ZipConverter",
"DocumentIntelligenceConverter", "DocumentIntelligenceConverter",
"DocumentIntelligenceFileType",
"EpubConverter", "EpubConverter",
] ]

View file

@ -3,6 +3,7 @@ import re
import os import os
from typing import BinaryIO, Any, List from typing import BinaryIO, Any, List
from enum import Enum
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
@ -31,38 +32,74 @@ except ImportError:
CONTENT_FORMAT = "markdown" CONTENT_FORMAT = "markdown"
OFFICE_MIME_TYPE_PREFIXES = [ class DocumentIntelligenceFileType(str, Enum):
"application/vnd.openxmlformats-officedocument.wordprocessingml.document", """Enum of file types supported by the Document Intelligence Converter."""
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"application/vnd.openxmlformats-officedocument.presentationml",
"application/xhtml",
"text/html",
]
OTHER_MIME_TYPE_PREFIXES = [ # No OCR
"application/pdf", DOCX = "docx"
"application/x-pdf", PPTX = "pptx"
"text/html", XLSX = "xlsx"
"image/", HTML = "html"
] # OCR
PDF = "pdf"
JPEG = "jpeg"
PNG = "png"
BMP = "bmp"
TIFF = "tiff"
OFFICE_FILE_EXTENSIONS = [
".docx",
".xlsx",
".pptx",
".html",
".htm",
]
def _get_mime_type_prefixes(types: List[DocumentIntelligenceFileType]) -> List[str]:
    """Return the MIME type prefixes associated with the given file types.

    The results are prefixes, not exact MIME types: callers match them with
    ``str.startswith`` against a lower-cased MIME type.

    Args:
        types: The file types to look up.

    Returns:
        The prefixes for each entry of ``types``, in the order given. A type
        that appears more than once contributes its prefixes each time, and
        a value that is not a known file type contributes nothing.
    """
    prefixes: List[str] = []
    for type_ in types:
        if type_ == DocumentIntelligenceFileType.DOCX:
            prefixes.append(
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
            )
        elif type_ == DocumentIntelligenceFileType.PPTX:
            prefixes.append(
                "application/vnd.openxmlformats-officedocument.presentationml"
            )
        elif type_ == DocumentIntelligenceFileType.XLSX:
            prefixes.append(
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )
        elif type_ == DocumentIntelligenceFileType.HTML:
            # HTML is a declared file type, so it needs prefixes of its own;
            # without this branch an HTML entry silently matches nothing.
            prefixes.append("text/html")
            prefixes.append("application/xhtml")
        elif type_ == DocumentIntelligenceFileType.PDF:
            prefixes.append("application/pdf")
            prefixes.append("application/x-pdf")
        elif type_ == DocumentIntelligenceFileType.JPEG:
            prefixes.append("image/jpeg")
        elif type_ == DocumentIntelligenceFileType.PNG:
            prefixes.append("image/png")
        elif type_ == DocumentIntelligenceFileType.BMP:
            prefixes.append("image/bmp")
        elif type_ == DocumentIntelligenceFileType.TIFF:
            prefixes.append("image/tiff")
    return prefixes
def _get_file_extensions(types: List[DocumentIntelligenceFileType]) -> List[str]:
    """Return the file extensions associated with the given file types.

    Args:
        types: The file types to look up.

    Returns:
        Lower-case, dot-prefixed extensions for each entry of ``types``, in
        the order given. A type that appears more than once contributes its
        extensions each time, and a value that is not a known file type
        contributes nothing.
    """
    extensions: List[str] = []
    for type_ in types:
        if type_ == DocumentIntelligenceFileType.DOCX:
            extensions.append(".docx")
        elif type_ == DocumentIntelligenceFileType.PPTX:
            extensions.append(".pptx")
        elif type_ == DocumentIntelligenceFileType.XLSX:
            extensions.append(".xlsx")
        elif type_ == DocumentIntelligenceFileType.HTML:
            # HTML is a declared file type, so it needs extensions of its own;
            # without this branch an HTML entry silently matches nothing.
            extensions.append(".html")
            extensions.append(".htm")
        elif type_ == DocumentIntelligenceFileType.PDF:
            extensions.append(".pdf")
        elif type_ == DocumentIntelligenceFileType.JPEG:
            extensions.append(".jpg")
            extensions.append(".jpeg")
        elif type_ == DocumentIntelligenceFileType.PNG:
            extensions.append(".png")
        elif type_ == DocumentIntelligenceFileType.BMP:
            extensions.append(".bmp")
        elif type_ == DocumentIntelligenceFileType.TIFF:
            extensions.append(".tiff")
    return extensions
class DocumentIntelligenceConverter(DocumentConverter): class DocumentIntelligenceConverter(DocumentConverter):
@ -74,8 +111,29 @@ class DocumentIntelligenceConverter(DocumentConverter):
endpoint: str, endpoint: str,
api_version: str = "2024-07-31-preview", api_version: str = "2024-07-31-preview",
credential: AzureKeyCredential | TokenCredential | None = None, credential: AzureKeyCredential | TokenCredential | None = None,
file_types: List[DocumentIntelligenceFileType] = [
DocumentIntelligenceFileType.DOCX,
DocumentIntelligenceFileType.PPTX,
DocumentIntelligenceFileType.XLSX,
DocumentIntelligenceFileType.PDF,
DocumentIntelligenceFileType.JPEG,
DocumentIntelligenceFileType.PNG,
DocumentIntelligenceFileType.BMP,
DocumentIntelligenceFileType.TIFF,
],
): ):
"""
Initialize the DocumentIntelligenceConverter.
Args:
endpoint (str): The endpoint for the Document Intelligence service.
api_version (str): The API version to use. Defaults to "2024-07-31-preview".
credential (AzureKeyCredential | TokenCredential | None): The credential to use for authentication.
file_types (List[DocumentIntelligenceFileType]): The file types to accept. Defaults to all supported file types.
"""
super().__init__() super().__init__()
self._file_types = file_types
# Raise an error if the dependencies are not available. # Raise an error if the dependencies are not available.
# This is different than other converters since this one isn't even instantiated # This is different than other converters since this one isn't even instantiated
@ -112,10 +170,10 @@ class DocumentIntelligenceConverter(DocumentConverter):
mimetype = (stream_info.mimetype or "").lower() mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower() extension = (stream_info.extension or "").lower()
if extension in OFFICE_FILE_EXTENSIONS + OTHER_FILE_EXTENSIONS: if extension in _get_file_extensions(self._file_types):
return True return True
for prefix in OFFICE_MIME_TYPE_PREFIXES + OTHER_MIME_TYPE_PREFIXES: for prefix in _get_mime_type_prefixes(self._file_types):
if mimetype.startswith(prefix): if mimetype.startswith(prefix):
return True return True
@ -130,10 +188,18 @@ class DocumentIntelligenceConverter(DocumentConverter):
mimetype = (stream_info.mimetype or "").lower() mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower() extension = (stream_info.extension or "").lower()
if extension in OFFICE_FILE_EXTENSIONS: # Types that don't support ocr
no_ocr_types = [
DocumentIntelligenceFileType.DOCX,
DocumentIntelligenceFileType.PPTX,
DocumentIntelligenceFileType.XLSX,
DocumentIntelligenceFileType.HTML,
]
if extension in _get_file_extensions(no_ocr_types):
return [] return []
for prefix in OFFICE_MIME_TYPE_PREFIXES: for prefix in _get_mime_type_prefixes(no_ocr_types):
if mimetype.startswith(prefix): if mimetype.startswith(prefix):
return [] return []