From 06080eb2e85a5b1b4886b5d32a71b26b3fe3e92d Mon Sep 17 00:00:00 2001
From: Kenny Zhang
Date: Thu, 9 Jan 2025 14:41:14 -0500
Subject: [PATCH] added DocumentIntelligenceConverter class implementation

---
 src/markitdown/__main__.py    |  2 +-
 src/markitdown/_markitdown.py | 87 +++++++++++++++++++++++++++++++++++
 2 files changed, 88 insertions(+), 1 deletion(-)

diff --git a/src/markitdown/__main__.py b/src/markitdown/__main__.py
index 45652f1..5bc9ede 100644
--- a/src/markitdown/__main__.py
+++ b/src/markitdown/__main__.py
@@ -76,7 +76,7 @@ def main():
             raise ValueError("Document Intelligence Endpoint is required when using Document Intelligence.")
         elif args.filename is None:
             raise ValueError("Filename is required when using Document Intelligence.")
-        markitdown = MarkItDown(endpoint=args.endpoint)
+        markitdown = MarkItDown(docintel_endpoint=args.endpoint)
     else:
         markitdown = MarkItDown()
 
diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index 33806e1..00052ba 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -33,6 +33,16 @@ import requests
 from bs4 import BeautifulSoup
 from charset_normalizer import from_path
 
+# Azure imports
+from azure.ai.documentintelligence import DocumentIntelligenceClient
+from azure.ai.documentintelligence.models import (
+    AnalyzeDocumentRequest,
+    AnalyzeResult,
+    ContentFormat,
+    DocumentAnalysisFeature,
+)
+from azure.identity import DefaultAzureCredential
+
 # Optional Transcription support
 IS_AUDIO_TRANSCRIPTION_CAPABLE = False
 try:
@@ -157,7 +167,82 @@ class DocumentConverter:
         self, local_path: str, **kwargs: Any
     ) -> Union[None, DocumentConverterResult]:
         raise NotImplementedError()
+
+
+class DocumentIntelligenceConverter(DocumentConverter):
+    """Convert documents to Markdown with Azure Document Intelligence.
+
+    The file is sent to the ``prebuilt-layout`` model and the service is
+    asked to return the extracted content as Markdown.
+    """
+
+    def __init__(
+        self,
+        endpoint: str,
+        api_version: str = "2024-07-31-preview",
+    ):
+        """Create the Document Intelligence client.
+
+        Args:
+            endpoint: Endpoint of the Document Intelligence resource.
+            api_version: Service API version handed to the client.
+        """
+        self.endpoint = endpoint
+        self.api_version = api_version
+        self.doc_intel_client = DocumentIntelligenceClient(
+            endpoint=self.endpoint,
+            api_version=self.api_version,
+            credential=DefaultAzureCredential(),
+        )
+
+    def convert(
+        self, local_path: str, **kwargs: Any
+    ) -> Union[None, DocumentConverterResult]:
+        """Analyze the file at ``local_path`` and return it as Markdown.
+
+        Args:
+            local_path: Path of the file whose bytes are sent for analysis.
+            **kwargs: ``file_extension`` (optional) decides which analysis
+                features are requested.
+
+        Returns:
+            A DocumentConverterResult with no title and the Markdown text.
+        """
+        extension = (kwargs.get("file_extension") or "").lower()
+
+        # Read the bytes through a context manager so the handle is closed
+        # even if the read fails.
+        with open(local_path, "rb") as file_handle:
+            file_bytes = file_handle.read()
+
+        # Certain document analysis features are not available for some
+        # file types (.xlsx, .pptx, .html).
+        if extension in (".xlsx", ".pptx", ".html"):
+            analysis_features = []
+        else:
+            analysis_features = [
+                DocumentAnalysisFeature.FORMULAS,  # formula extraction
+                DocumentAnalysisFeature.OCR_HIGH_RESOLUTION,  # high-resolution OCR
+                DocumentAnalysisFeature.STYLE_FONT,  # font style extraction
+            ]
+
+        # Extract the text using Azure Document Intelligence
+        poller = self.doc_intel_client.begin_analyze_document(
+            model_id="prebuilt-layout",
+            analyze_request=AnalyzeDocumentRequest(bytes_source=file_bytes),
+            features=analysis_features,
+            output_content_format=ContentFormat.MARKDOWN,
+        )
+        result: AnalyzeResult = poller.result()
+
+        # Strip the HTML comments that Document Intelligence adds to its
+        # Markdown output.
+        markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL)
+        return DocumentConverterResult(
+            title=None,
+            text_content=markdown_text,
+        )
 
 
 class PlainTextConverter(DocumentConverter):
     """Anything with content type text/plain"""
@@ -1337,6 +1422,7 @@ class MarkItDown:
         llm_model: Optional[str] = None,
         style_map: Optional[str] = None,
         exiftool_path: Optional[str] = None,
+        docintel_endpoint: Optional[str] = None,
         # Deprecated
         mlm_client: Optional[Any] = None,
         mlm_model: Optional[str] = None,
@@ -1382,6 +1468,7 @@ class MarkItDown:
         self._llm_model = llm_model
         self._style_map = style_map
         self._exiftool_path = exiftool_path
+        self._docintel_endpoint = docintel_endpoint
 
         self._page_converters: List[DocumentConverter] = []
 