Merge a77c4f0415 into 2a2ccc86aa
This commit is contained in:
commit
dba32ef21d
3 changed files with 100 additions and 97 deletions
|
|
@ -146,6 +146,13 @@ result = md.convert("example.jpg")
|
|||
print(result.text_content)
|
||||
```
|
||||
|
||||
Or from the CLI:
|
||||
|
||||
```bash
|
||||
pip install markitdown
|
||||
markitdown --llm-model gpt-4o example.jpg
|
||||
```
|
||||
|
||||
### Docker
|
||||
|
||||
```sh
|
||||
|
|
|
|||
|
|
@ -8,101 +8,89 @@ from importlib.metadata import entry_points
|
|||
from .__about__ import __version__
|
||||
from ._markitdown import MarkItDown, DocumentConverterResult
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Convert various file formats to markdown.",
|
||||
prog="markitdown",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
usage=dedent(
|
||||
"""
|
||||
SYNTAX:
|
||||
|
||||
markitdown <OPTIONAL: FILENAME>
|
||||
If FILENAME is empty, markitdown reads from stdin.
|
||||
|
||||
EXAMPLE:
|
||||
|
||||
epilog=dedent(
|
||||
"""\
|
||||
examples:
|
||||
markitdown example.pdf
|
||||
|
||||
OR
|
||||
|
||||
cat example.pdf | markitdown
|
||||
|
||||
OR
|
||||
|
||||
markitdown < example.pdf
|
||||
|
||||
OR to save to a file use
|
||||
|
||||
markitdown example.pdf -o example.md
|
||||
|
||||
OR
|
||||
|
||||
markitdown example.pdf > example.md
|
||||
"""
|
||||
).strip(),
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
markitdown -o example.md example.pdf
|
||||
cat example.pdf | markitdown > example.md"""
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"-v",
|
||||
"--version",
|
||||
action="version",
|
||||
version=f"%(prog)s {__version__}",
|
||||
help="show the version number and exit",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
)
|
||||
parser.add_argument(
|
||||
"-o",
|
||||
"--output",
|
||||
help="Output file name. If not provided, output is written to stdout.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
metavar="OUTFILENAME",
|
||||
help="if unspecified, defaults to stdout",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-d",
|
||||
"--use-docintel",
|
||||
action="store_true",
|
||||
help="Use Document Intelligence to extract text instead of offline conversion. Requires a valid Document Intelligence Endpoint.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
help="use online Document Intelligence to extract text (requires a valid `--endpoint`)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-e",
|
||||
"--endpoint",
|
||||
type=str,
|
||||
help="Document Intelligence Endpoint. Required if using Document Intelligence.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
help="required for `--use-docintel`",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-p",
|
||||
"--use-plugins",
|
||||
action="store_true",
|
||||
help="Use 3rd-party plugins to convert files. Use --list-plugins to see installed plugins.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
help="use 3rd-party plugins to convert files (see `--list-plugins`)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--list-plugins",
|
||||
action="store_true",
|
||||
help="List installed 3rd-party plugins. Plugins are loaded when using the -p or --use-plugin option.",
|
||||
)
|
||||
help="list installed 3rd-party plugins (loaded with `--use-plugin`)",
|
||||
)
|
||||
parser.add_argument("--llm-model", metavar="MODEL", help="e.g. gpt-4o")
|
||||
parser.add_argument(
|
||||
"--llm-client-url", metavar="URL", help="base URL for OpenAI LLM client"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-H",
|
||||
"--llm-client-header",
|
||||
metavar="HEADER",
|
||||
nargs="*",
|
||||
default=[],
|
||||
help="may be specified multiple times",
|
||||
)
|
||||
parser.add_argument(
|
||||
"filename", metavar="FILENAME", nargs="?", help="if unspecified, defaults to stdin"
|
||||
)
|
||||
|
||||
parser.add_argument("filename", nargs="?")
|
||||
args = parser.parse_args()
|
||||
|
||||
def main(args=None):
|
||||
args = parser.parse_args(args)
|
||||
|
||||
if args.list_plugins:
|
||||
# List installed plugins, then exit
|
||||
print("Installed MarkItDown 3rd-party Plugins:\n")
|
||||
plugin_entry_points = list(entry_points(group="markitdown.plugin"))
|
||||
if len(plugin_entry_points) == 0:
|
||||
print(" * No 3rd-party plugins installed.")
|
||||
print(
|
||||
"\nFind plugins by searching for the hashtag #markitdown-plugin on GitHub.\n"
|
||||
)
|
||||
else:
|
||||
if plugin_entry_points:
|
||||
for entry_point in plugin_entry_points:
|
||||
print(f" * {entry_point.name:<16}\t(package: {entry_point.value})")
|
||||
print(
|
||||
"\nUse the -p (or --use-plugins) option to enable 3rd-party plugins.\n"
|
||||
)
|
||||
else:
|
||||
print("No 3rd-party plugins installed.")
|
||||
print(
|
||||
"\nFind plugins by searching for the hashtag #markitdown-plugin on GitHub.\n"
|
||||
)
|
||||
sys.exit(0)
|
||||
|
||||
if args.use_docintel:
|
||||
|
|
@ -112,25 +100,33 @@ def main():
|
|||
)
|
||||
elif args.filename is None:
|
||||
raise ValueError("Filename is required when using Document Intelligence.")
|
||||
|
||||
if args.llm_model:
|
||||
from openai import OpenAI
|
||||
|
||||
headers = {}
|
||||
for header in args.llm_client_header:
|
||||
key, value = header.split(":", 1)
|
||||
headers[key] = value.lstrip()
|
||||
llm_client = OpenAI(base_url=args.llm_client_url, default_headers=headers)
|
||||
else:
|
||||
llm_client = None
|
||||
|
||||
markitdown = MarkItDown(
|
||||
enable_plugins=args.use_plugins, docintel_endpoint=args.endpoint
|
||||
enable_plugins=args.use_plugins,
|
||||
docintel_endpoint=args.endpoint if args.use_docintel else None,
|
||||
llm_client=llm_client,
|
||||
llm_model=args.llm_model,
|
||||
)
|
||||
else:
|
||||
markitdown = MarkItDown(enable_plugins=args.use_plugins)
|
||||
|
||||
if args.filename is None:
|
||||
result = markitdown.convert_stream(sys.stdin.buffer)
|
||||
else:
|
||||
if args.filename:
|
||||
result = markitdown.convert(args.filename)
|
||||
else:
|
||||
result = markitdown.convert_stream(sys.stdin.buffer)
|
||||
|
||||
_handle_output(args, result)
|
||||
|
||||
|
||||
def _handle_output(args, result: DocumentConverterResult):
|
||||
"""Handle output to stdout or file"""
|
||||
if args.output:
|
||||
with open(args.output, "w", encoding="utf-8") as f:
|
||||
f.write(result.text_content)
|
||||
print(result.text_content, file=f)
|
||||
else:
|
||||
print(result.text_content)
|
||||
|
||||
|
|
|
|||
|
|
@ -33,7 +33,7 @@ def test_invalid_flag(shared_tmp_dir) -> None:
|
|||
assert (
|
||||
"unrecognized arguments" in result.stderr
|
||||
), f"Expected 'unrecognized arguments' to appear in STDERR"
|
||||
assert "SYNTAX" in result.stderr, f"Expected 'SYNTAX' to appear in STDERR"
|
||||
assert "usage" in result.stderr, f"Expected 'usage' to appear in STDERR"
|
||||
|
||||
|
||||
def test_output_to_stdout(shared_tmp_dir) -> None:
|
||||
|
|
|
|||
Loading…
Reference in a new issue