diff --git a/README.md b/README.md index 6bc91e6..76a4d3f 100644 --- a/README.md +++ b/README.md @@ -33,12 +33,20 @@ Or use `-o` to specify the output file: markitdown path-to-file.pdf -o document.md ``` +To use Document Intelligence conversion: + +```bash +markitdown path-to-file.pdf -o document.md -d -e "<document_intelligence_endpoint>" +``` + You can also pipe content: ```bash cat path-to-file.pdf | markitdown ``` +More information about how to set up an Azure Document Intelligence Resource can be found [here](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/how-to-guides/create-document-intelligence-resource?view=doc-intel-4.0.0). + ### Python API Basic usage in Python: @@ -51,6 +59,16 @@ result = md.convert("test.xlsx") print(result.text_content) ``` +Document Intelligence conversion in Python: + +```python +from markitdown import MarkItDown + +md = MarkItDown(docintel_endpoint="<document_intelligence_endpoint>") +result = md.convert("test.pdf") +print(result.text_content) +``` + To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`: ```python diff --git a/pyproject.toml b/pyproject.toml index 9c113ad..2a4e203 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,8 @@ dependencies = [ "pathvalidate", "charset-normalizer", "openai", + "azure-ai-documentintelligence", + "azure-identity" ] [project.urls] diff --git a/src/markitdown/__main__.py b/src/markitdown/__main__.py index 8bfa718..5ebf5f8 100644 --- a/src/markitdown/__main__.py +++ b/src/markitdown/__main__.py @@ -4,8 +4,8 @@ import argparse import sys from textwrap import dedent -from .__about__ import __version__ -from ._markitdown import MarkItDown, DocumentConverterResult +from __about__ import __version__ +from _markitdown import MarkItDown, DocumentConverterResult def main(): @@ -57,17 +57,40 @@ def main(): help="Output file name. 
If not provided, output is written to stdout.", ) + parser.add_argument( + "-d", + "--use-docintel", + action="store_true", + help="Use Document Intelligence to extract text instead of offline conversion. Requires a valid Document Intelligence Endpoint.", + ) + + parser.add_argument( + "-e", + "--endpoint", + type=str, + help="Document Intelligence Endpoint. Required if using Document Intelligence.", + ) + parser.add_argument("filename", nargs="?") args = parser.parse_args() - if args.filename is None: - markitdown = MarkItDown() - result = markitdown.convert_stream(sys.stdin.buffer) - _handle_output(args, result) + if args.use_docintel: + if args.endpoint is None: + raise ValueError( + "Document Intelligence Endpoint is required when using Document Intelligence." + ) + elif args.filename is None: + raise ValueError("Filename is required when using Document Intelligence.") + markitdown = MarkItDown(docintel_endpoint=args.endpoint) else: markitdown = MarkItDown() + + if args.filename is None: + result = markitdown.convert_stream(sys.stdin.buffer) + else: result = markitdown.convert(args.filename) - _handle_output(args, result) + + _handle_output(args, result) def _handle_output(args, result: DocumentConverterResult): diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 33806e1..9f610f6 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -33,6 +33,19 @@ import requests from bs4 import BeautifulSoup from charset_normalizer import from_path +# Azure imports +from azure.ai.documentintelligence import DocumentIntelligenceClient +from azure.ai.documentintelligence.models import ( + AnalyzeDocumentRequest, + AnalyzeResult, + DocumentAnalysisFeature, +) +from azure.identity import DefaultAzureCredential + +# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum. +# This constant is a temporary fix until the bug is resolved. 
+CONTENT_FORMAT = "markdown" + # Optional Transcription support IS_AUDIO_TRANSCRIPTION_CAPABLE = False try: @@ -204,7 +217,7 @@ class HtmlConverter(DocumentConverter): return result def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]: - """Helper function that converts and HTML string.""" + """Helper function that converts an HTML string.""" # Parse the string soup = BeautifulSoup(html_content, "html.parser") @@ -223,6 +236,9 @@ class HtmlConverter(DocumentConverter): assert isinstance(webpage_text, str) + # remove leading and trailing \n + webpage_text = webpage_text.strip() + return DocumentConverterResult( title=None if soup.title is None else soup.title.string, text_content=webpage_text, @@ -771,6 +787,35 @@ class PptxConverter(HtmlConverter): Converts PPTX files to Markdown. Supports heading, tables and images with alt text. """ + def _get_llm_description( + self, llm_client, llm_model, image_blob, content_type, prompt=None + ): + if prompt is None or prompt.strip() == "": + prompt = "Write a detailed alt text for this image with less than 50 words." 
+ + image_base64 = base64.b64encode(image_blob).decode("utf-8") + data_uri = f"data:{content_type};base64,{image_base64}" + + messages = [ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": data_uri, + }, + }, + {"type": "text", "text": prompt}, + ], + } + ] + + response = llm_client.chat.completions.create( + model=llm_model, messages=messages + ) + return response.choices[0].message.content + def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: # Bail if not a PPTX extension = kwargs.get("file_extension", "") @@ -791,17 +836,38 @@ class PptxConverter(HtmlConverter): # Pictures if self._is_picture(shape): # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069 - alt_text = "" - try: - alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "") - except Exception: - pass + + llm_description = None + alt_text = None + + llm_client = kwargs.get("llm_client") + llm_model = kwargs.get("llm_model") + if llm_client is not None and llm_model is not None: + try: + llm_description = self._get_llm_description( + llm_client, + llm_model, + shape.image.blob, + shape.image.content_type, + ) + except Exception: + # Unable to describe with LLM + pass + + if not llm_description: + try: + alt_text = shape._element._nvXxPr.cNvPr.attrib.get( + "descr", "" + ) + except Exception: + # Unable to get alt text + pass # A placeholder name filename = re.sub(r"\W", "", shape.name) + ".jpg" md_content += ( "\n![" - + (alt_text if alt_text else shape.name) + + (llm_description or alt_text or shape.name) + "](" + filename + ")\n" @@ -1318,6 +1384,74 @@ class ZipConverter(DocumentConverter): ) +class DocumentIntelligenceConverter(DocumentConverter): + """Specialized DocumentConverter that uses Document Intelligence to extract text from documents.""" + + def __init__( + self, + endpoint: str, + api_version: str = "2024-07-31-preview", + ): + self.endpoint = endpoint + self.api_version = api_version + 
self.doc_intel_client = DocumentIntelligenceClient( + endpoint=self.endpoint, + api_version=self.api_version, + credential=DefaultAzureCredential(), + ) + + def convert( + self, local_path: str, **kwargs: Any + ) -> Union[None, DocumentConverterResult]: + # Bail if extension is not supported by Document Intelligence + extension = kwargs.get("file_extension", "") + docintel_extensions = [ + ".pdf", + ".docx", + ".xlsx", + ".pptx", + ".html", + ".jpeg", + ".jpg", + ".png", + ".bmp", + ".tiff", + ".heif", + ] + if extension.lower() not in docintel_extensions: + return None + + # Get the bytestring for the local path + with open(local_path, "rb") as f: + file_bytes = f.read() + + # Certain document analysis features are not available for filetypes (.xlsx, .pptx, .html) + if extension.lower() in [".xlsx", ".pptx", ".html"]: + analysis_features = [] + else: + analysis_features = [ + DocumentAnalysisFeature.FORMULAS, # enable formula extraction + DocumentAnalysisFeature.OCR_HIGH_RESOLUTION, # enable high resolution OCR + DocumentAnalysisFeature.STYLE_FONT, # enable font style extraction + ] + + # Extract the text using Azure Document Intelligence + poller = self.doc_intel_client.begin_analyze_document( + model_id="prebuilt-layout", + body=AnalyzeDocumentRequest(bytes_source=file_bytes), + features=analysis_features, + output_content_format=CONTENT_FORMAT, # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed + ) + result: AnalyzeResult = poller.result() + + # remove comments from the markdown content generated by Doc Intelligence and append to markdown string + markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL) + return DocumentConverterResult( + title=None, + text_content=markdown_text, + ) + + class FileConversionException(BaseException): pass @@ -1337,6 +1471,7 @@ class MarkItDown: llm_model: Optional[str] = None, style_map: Optional[str] = None, exiftool_path: Optional[str] = None, + docintel_endpoint: Optional[str] = None, # Deprecated 
mlm_client: Optional[Any] = None, mlm_model: Optional[str] = None, @@ -1406,6 +1541,12 @@ class MarkItDown: self.register_page_converter(ZipConverter()) self.register_page_converter(OutlookMsgConverter()) + # Register Document Intelligence converter at the top of the stack if endpoint is provided + if docintel_endpoint is not None: + self.register_page_converter( + DocumentIntelligenceConverter(endpoint=docintel_endpoint) + ) + def convert( self, source: Union[str, requests.Response, Path], **kwargs: Any ) -> DocumentConverterResult: # TODO: deal with kwargs