Merge branch 'main' into main
This commit is contained in:
commit
959ea53f96
2 changed files with 22 additions and 1 deletion
18
README.md
18
README.md
|
|
@@ -33,12 +33,20 @@ Or use `-o` to specify the output file:
|
||||||
markitdown path-to-file.pdf -o document.md
|
markitdown path-to-file.pdf -o document.md
|
||||||
```
|
```
|
||||||
|
|
||||||
|
To use Document Intelligence conversion:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
markitdown path-to-file.pdf -o document.md -d -e "<document_intelligence_endpoint>"
|
||||||
|
```
|
||||||
|
|
||||||
You can also pipe content:
|
You can also pipe content:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
cat path-to-file.pdf | markitdown
|
cat path-to-file.pdf | markitdown
|
||||||
```
|
```
|
||||||
|
|
||||||
|
More information about how to set up an Azure Document Intelligence Resource can be found [here](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/how-to-guides/create-document-intelligence-resource?view=doc-intel-4.0.0)
|
||||||
|
|
||||||
### Python API
|
### Python API
|
||||||
|
|
||||||
Basic usage in Python:
|
Basic usage in Python:
|
||||||
|
|
@@ -51,6 +59,16 @@ result = md.convert("test.xlsx")
|
||||||
print(result.text_content)
|
print(result.text_content)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Document Intelligence conversion in Python:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from markitdown import MarkItDown
|
||||||
|
|
||||||
|
md = MarkItDown(docintel_endpoint="<document_intelligence_endpoint>")
|
||||||
|
result = md.convert("test.pdf")
|
||||||
|
print(result.text_content)
|
||||||
|
```
|
||||||
|
|
||||||
To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:
|
To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
|
|
||||||
|
|
@@ -217,7 +217,7 @@ class HtmlConverter(DocumentConverter):
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
|
def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
|
||||||
"""Helper function that converts and HTML string."""
|
"""Helper function that converts an HTML string."""
|
||||||
|
|
||||||
# Parse the string
|
# Parse the string
|
||||||
soup = BeautifulSoup(html_content, "html.parser")
|
soup = BeautifulSoup(html_content, "html.parser")
|
||||||
|
|
@@ -236,6 +236,9 @@ class HtmlConverter(DocumentConverter):
|
||||||
|
|
||||||
assert isinstance(webpage_text, str)
|
assert isinstance(webpage_text, str)
|
||||||
|
|
||||||
|
# remove leading and trailing \n
|
||||||
|
webpage_text = webpage_text.strip()
|
||||||
|
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
title=None if soup.title is None else soup.title.string,
|
title=None if soup.title is None else soup.title.string,
|
||||||
text_content=webpage_text,
|
text_content=webpage_text,
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue