From bf6a15e9b5eb89820bf82c04cbe934bf62fb8617 Mon Sep 17 00:00:00 2001 From: KennyZhang1 <90438893+KennyZhang1@users.noreply.github.com> Date: Sat, 1 Feb 2025 01:23:26 -0500 Subject: [PATCH 1/3] Kennyzhang/docintel docs (#312) * updated docs to include doc intelligence * include reference to doc intel setup docs --- README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/README.md b/README.md index 6bc91e6..76a4d3f 100644 --- a/README.md +++ b/README.md @@ -33,12 +33,20 @@ Or use `-o` to specify the output file: markitdown path-to-file.pdf -o document.md ``` +To use Document Intelligence conversion: + +```bash +markitdown path-to-file.pdf -o document.md -d -e "<document_intelligence_endpoint>" +``` + You can also pipe content: ```bash cat path-to-file.pdf | markitdown ``` +More information about how to set up an Azure Document Intelligence Resource can be found [here](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/how-to-guides/create-document-intelligence-resource?view=doc-intel-4.0.0) + ### Python API Basic usage in Python: @@ -51,6 +59,16 @@ result = md.convert("test.xlsx") print(result.text_content) ``` +Document Intelligence conversion in Python: + +```python +from markitdown import MarkItDown + +md = MarkItDown(docintel_endpoint="<document_intelligence_endpoint>") +result = md.convert("test.pdf") +print(result.text_content) +``` + To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`: ```python From 7bea2672a05f5877acb8690b20222593dab13788 Mon Sep 17 00:00:00 2001 From: ZeyuTeng96 <96521059+ZeyuTeng96@users.noreply.github.com> Date: Sun, 9 Feb 2025 12:28:35 +0800 Subject: [PATCH 2/3] remove leading and trailing \n for HtmlConverter (#262) --- src/markitdown/_markitdown.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index ae6a7b4..6f40547 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -236,6 +236,9 @@ class
HtmlConverter(DocumentConverter): assert isinstance(webpage_text, str) + # remove leading and trailing \n + webpage_text = webpage_text.strip() + return DocumentConverterResult( title=None if soup.title is None else soup.title.string, text_content=webpage_text, From 3090917a49dc8ec94682c47747f3e2692e3953ae Mon Sep 17 00:00:00 2001 From: James Hickey Date: Sun, 9 Feb 2025 00:30:13 -0400 Subject: [PATCH 3/3] Typo fixed (#270) --- src/markitdown/_markitdown.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 6f40547..e4884ec 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -217,7 +217,7 @@ class HtmlConverter(DocumentConverter): return result def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]: - """Helper function that converts and HTML string.""" + """Helper function that converts an HTML string.""" # Parse the string soup = BeautifulSoup(html_content, "html.parser")