Merge branch 'main' into main

This commit is contained in:
afourney 2025-02-08 20:33:15 -08:00 committed by GitHub
commit 959ea53f96
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 22 additions and 1 deletions

View file

@ -33,12 +33,20 @@ Or use `-o` to specify the output file:
markitdown path-to-file.pdf -o document.md markitdown path-to-file.pdf -o document.md
``` ```
To use Azure Document Intelligence for conversion:
```bash
markitdown path-to-file.pdf -o document.md -d -e "<document_intelligence_endpoint>"
```
You can also pipe content: You can also pipe content:
```bash ```bash
cat path-to-file.pdf | markitdown cat path-to-file.pdf | markitdown
``` ```
For more information on setting up an Azure Document Intelligence resource, see [Create a Document Intelligence resource](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/how-to-guides/create-document-intelligence-resource?view=doc-intel-4.0.0).
### Python API ### Python API
Basic usage in Python: Basic usage in Python:
@ -51,6 +59,16 @@ result = md.convert("test.xlsx")
print(result.text_content) print(result.text_content)
``` ```
Document Intelligence conversion in Python:
```python
from markitdown import MarkItDown
md = MarkItDown(docintel_endpoint="<document_intelligence_endpoint>")
result = md.convert("test.pdf")
print(result.text_content)
```
To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`: To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:
```python ```python

View file

@ -217,7 +217,7 @@ class HtmlConverter(DocumentConverter):
return result return result
def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]: def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
"""Helper function that converts and HTML string.""" """Helper function that converts an HTML string."""
# Parse the string # Parse the string
soup = BeautifulSoup(html_content, "html.parser") soup = BeautifulSoup(html_content, "html.parser")
@ -236,6 +236,9 @@ class HtmlConverter(DocumentConverter):
assert isinstance(webpage_text, str) assert isinstance(webpage_text, str)
# Strip leading and trailing whitespace (including newlines)
webpage_text = webpage_text.strip()
return DocumentConverterResult( return DocumentConverterResult(
title=None if soup.title is None else soup.title.string, title=None if soup.title is None else soup.title.string,
text_content=webpage_text, text_content=webpage_text,