From c0c0533a5edd29678ecbeedeb82385e9e2b4beef Mon Sep 17 00:00:00 2001 From: Casper da Costa-Luis Date: Tue, 17 Dec 2024 05:51:30 +0000 Subject: [PATCH 1/9] slight tidy - follow-up to #46 --- .../markitdown/src/markitdown/__main__.py | 55 +++++-------------- 1 file changed, 13 insertions(+), 42 deletions(-) diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index 6a24391..a74320a 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -14,36 +14,14 @@ def main(): description="Convert various file formats to markdown.", prog="markitdown", formatter_class=argparse.RawDescriptionHelpFormatter, - usage=dedent( - """ - SYNTAX: - - markitdown - If FILENAME is empty, markitdown reads from stdin. - - EXAMPLE: - - markitdown example.pdf - - OR - - cat example.pdf | markitdown - - OR - - markitdown < example.pdf - - OR to save to a file use - - markitdown example.pdf -o example.md - - OR - - markitdown example.pdf > example.md - """ - ).strip(), + epilog=dedent( + """\ + examples: + markitdown example.pdf + markitdown -o example.md example.pdf + cat example.pdf | markitdown > example.md""" + ), ) - parser.add_argument( "-v", "--version", @@ -51,41 +29,39 @@ def main(): version=f"%(prog)s {__version__}", help="show the version number and exit", ) - parser.add_argument( "-o", "--output", - help="Output file name. If not provided, output is written to stdout.", + dest="filename", + help="if unspecified, defaults to stdout", ) - parser.add_argument( "-d", "--use-docintel", action="store_true", help="Use Document Intelligence to extract text instead of offline conversion. Requires a valid Document Intelligence Endpoint.", ) - parser.add_argument( "-e", "--endpoint", type=str, help="Document Intelligence Endpoint. 
Required if using Document Intelligence.", ) - parser.add_argument( "-p", "--use-plugins", action="store_true", help="Use 3rd-party plugins to convert files. Use --list-plugins to see installed plugins.", ) - parser.add_argument( "--list-plugins", action="store_true", help="List installed 3rd-party plugins. Plugins are loaded when using the -p or --use-plugin option.", ) + parser.add_argument( + "filename", nargs="?", help="if unspecified, defaults to stdin" + ) - parser.add_argument("filename", nargs="?") args = parser.parse_args() if args.list_plugins: @@ -123,14 +99,9 @@ def main(): else: result = markitdown.convert(args.filename) - _handle_output(args, result) - - -def _handle_output(args, result: DocumentConverterResult): - """Handle output to stdout or file""" if args.output: with open(args.output, "w", encoding="utf-8") as f: - f.write(result.text_content) + print(result.text_content, file=f) else: print(result.text_content) From b0406ca2c798f485103dfb225fddc453a91322b1 Mon Sep 17 00:00:00 2001 From: Casper da Costa-Luis Date: Sun, 22 Dec 2024 08:36:55 +0000 Subject: [PATCH 2/9] global parser makes external integration with e.g. 
`shtab` easy --- .../markitdown/src/markitdown/__main__.py | 124 +++++++++--------- 1 file changed, 62 insertions(+), 62 deletions(-) diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index a74320a..60afc3e 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -8,77 +8,77 @@ from importlib.metadata import entry_points from .__about__ import __version__ from ._markitdown import MarkItDown, DocumentConverterResult +parser = argparse.ArgumentParser( + description="Convert various file formats to markdown.", + prog="markitdown", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=dedent( + """\ + examples: + markitdown example.pdf + markitdown -o example.md example.pdf + cat example.pdf | markitdown > example.md""" + ), +) +parser.add_argument( + "-v", + "--version", + action="version", + version=f"%(prog)s {__version__}", + help="show the version number and exit", +) +parser.add_argument( + "-o", + "--output", + metavar="outfilename", + help="if unspecified, defaults to stdout", +) +parser.add_argument( + "-d", + "--use-docintel", + action="store_true", + help="Use Document Intelligence to extract text instead of offline conversion. Requires a valid Document Intelligence Endpoint.", +) +parser.add_argument( + "-e", + "--endpoint", + type=str, + help="Document Intelligence Endpoint. Required if using Document Intelligence.", +) +parser.add_argument( + "-p", + "--use-plugins", + action="store_true", + help="Use 3rd-party plugins to convert files. Use --list-plugins to see installed plugins.", +) +parser.add_argument( + "--list-plugins", + action="store_true", + help="List installed 3rd-party plugins. 
Plugins are loaded when using the -p or --use-plugin option.", +) +parser.add_argument( + "filename", nargs="?", help="if unspecified, defaults to stdin" +) -def main(): - parser = argparse.ArgumentParser( - description="Convert various file formats to markdown.", - prog="markitdown", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=dedent( - """\ - examples: - markitdown example.pdf - markitdown -o example.md example.pdf - cat example.pdf | markitdown > example.md""" - ), - ) - parser.add_argument( - "-v", - "--version", - action="version", - version=f"%(prog)s {__version__}", - help="show the version number and exit", - ) - parser.add_argument( - "-o", - "--output", - dest="filename", - help="if unspecified, defaults to stdout", - ) - parser.add_argument( - "-d", - "--use-docintel", - action="store_true", - help="Use Document Intelligence to extract text instead of offline conversion. Requires a valid Document Intelligence Endpoint.", - ) - parser.add_argument( - "-e", - "--endpoint", - type=str, - help="Document Intelligence Endpoint. Required if using Document Intelligence.", - ) - parser.add_argument( - "-p", - "--use-plugins", - action="store_true", - help="Use 3rd-party plugins to convert files. Use --list-plugins to see installed plugins.", - ) - parser.add_argument( - "--list-plugins", - action="store_true", - help="List installed 3rd-party plugins. 
Plugins are loaded when using the -p or --use-plugin option.", - ) - parser.add_argument( - "filename", nargs="?", help="if unspecified, defaults to stdin" - ) - args = parser.parse_args() +def main(args=None): + args = parser.parse_args(args) if args.list_plugins: # List installed plugins, then exit print("Installed MarkItDown 3rd-party Plugins:\n") plugin_entry_points = list(entry_points(group="markitdown.plugin")) - if len(plugin_entry_points) == 0: - print(" * No 3rd-party plugins installed.") - print( - "\nFind plugins by searching for the hashtag #markitdown-plugin on GitHub.\n" - ) - else: + if plugin_entry_points: for entry_point in plugin_entry_points: print(f" * {entry_point.name:<16}\t(package: {entry_point.value})") print( "\nUse the -p (or --use-plugins) option to enable 3rd-party plugins.\n" ) + else: + print("No 3rd-party plugins installed.") + print( + "\nFind plugins by searching for the hashtag #markitdown-plugin on GitHub.\n" + ) sys.exit(0) if args.use_docintel: @@ -94,10 +94,10 @@ def main(): else: markitdown = MarkItDown(enable_plugins=args.use_plugins) - if args.filename is None: - result = markitdown.convert_stream(sys.stdin.buffer) - else: + if args.filename: result = markitdown.convert(args.filename) + else: + result = markitdown.convert_stream(sys.stdin.buffer) if args.output: with open(args.output, "w", encoding="utf-8") as f: From 38feb5e7a1a37d3ca25fdc5fd34fcfc5d43fa4db Mon Sep 17 00:00:00 2001 From: Casper da Costa-Luis Date: Thu, 6 Mar 2025 14:07:01 +0000 Subject: [PATCH 3/9] tidy docstrings --- packages/markitdown/src/markitdown/__main__.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index 60afc3e..a91e1fb 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -37,28 +37,26 @@ parser.add_argument( "-d", "--use-docintel", action="store_true", - 
help="Use Document Intelligence to extract text instead of offline conversion. Requires a valid Document Intelligence Endpoint.", + help="use online Document Intelligence to extract text (requires a valid `--endpoint`)", ) parser.add_argument( "-e", "--endpoint", type=str, - help="Document Intelligence Endpoint. Required if using Document Intelligence.", + help="required for `--use-docintel`", ) parser.add_argument( "-p", "--use-plugins", action="store_true", - help="Use 3rd-party plugins to convert files. Use --list-plugins to see installed plugins.", + help="use 3rd-party plugins to convert files (see `--list-plugins`)", ) parser.add_argument( "--list-plugins", action="store_true", - help="List installed 3rd-party plugins. Plugins are loaded when using the -p or --use-plugin option.", -) -parser.add_argument( - "filename", nargs="?", help="if unspecified, defaults to stdin" + help="list installed 3rd-party plugins (loaded with `--use-plugin`)", ) +parser.add_argument("filename", nargs="?", help="if unspecified, defaults to stdin") def main(args=None): From e270e63bbcccd4ebd82e7dcd2f87f56df90931ad Mon Sep 17 00:00:00 2001 From: Casper da Costa-Luis Date: Thu, 6 Mar 2025 14:22:16 +0000 Subject: [PATCH 4/9] standardise metavars --- packages/markitdown/src/markitdown/__main__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index a91e1fb..b048c88 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -30,7 +30,7 @@ parser.add_argument( parser.add_argument( "-o", "--output", - metavar="outfilename", + metavar="OUTFILENAME", help="if unspecified, defaults to stdout", ) parser.add_argument( @@ -56,7 +56,9 @@ parser.add_argument( action="store_true", help="list installed 3rd-party plugins (loaded with `--use-plugin`)", ) -parser.add_argument("filename", nargs="?", help="if unspecified, 
defaults to stdin") +parser.add_argument( + "filename", metavar="FILENAME", nargs="?", help="if unspecified, defaults to stdin" +) def main(args=None): From 3ebe8dfacb705866f27e2c45cb7826a24628a2e0 Mon Sep 17 00:00:00 2001 From: Casper da Costa-Luis Date: Thu, 6 Mar 2025 14:22:29 +0000 Subject: [PATCH 5/9] slight if-else tidy --- packages/markitdown/src/markitdown/__main__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index b048c88..ed43289 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -88,11 +88,11 @@ def main(args=None): ) elif args.filename is None: raise ValueError("Filename is required when using Document Intelligence.") - markitdown = MarkItDown( - enable_plugins=args.use_plugins, docintel_endpoint=args.endpoint - ) - else: - markitdown = MarkItDown(enable_plugins=args.use_plugins) + + markitdown = MarkItDown( + enable_plugins=args.use_plugins, + docintel_endpoint=args.endpoint if args.use_docintel else None, + ) if args.filename: result = markitdown.convert(args.filename) From 011328920b63ed169d986f1c2083a2d28c1f5bd6 Mon Sep 17 00:00:00 2001 From: Casper da Costa-Luis Date: Sat, 8 Mar 2025 16:12:11 +0000 Subject: [PATCH 6/9] update tests --- packages/markitdown/tests/test_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/markitdown/tests/test_cli.py b/packages/markitdown/tests/test_cli.py index 7c8afc2..02004af 100644 --- a/packages/markitdown/tests/test_cli.py +++ b/packages/markitdown/tests/test_cli.py @@ -33,7 +33,7 @@ def test_invalid_flag(shared_tmp_dir) -> None: assert ( "unrecognized arguments" in result.stderr ), f"Expected 'unrecognized arguments' to appear in STDERR" - assert "SYNTAX" in result.stderr, f"Expected 'SYNTAX' to appear in STDERR" + assert "usage" in result.stderr, f"Expected 'usage' to appear in STDERR" def 
test_output_to_stdout(shared_tmp_dir) -> None: From 88961c3280f98098b3e6fb5b26b0c51b88a31fca Mon Sep 17 00:00:00 2001 From: Casper da Costa-Luis Date: Tue, 17 Dec 2024 06:29:51 +0000 Subject: [PATCH 7/9] CLI: support LLM --- README.md | 7 +++++++ packages/markitdown/src/markitdown/__main__.py | 11 +++++++++++ 2 files changed, 18 insertions(+) diff --git a/README.md b/README.md index 40f4b82..0597e77 100644 --- a/README.md +++ b/README.md @@ -146,6 +146,13 @@ result = md.convert("example.jpg") print(result.text_content) ``` +Or from the CLI: + +```bash +pip install markitdown +markitdown --llm-client OpenAI --llm-model gpt-4o example.jpg +``` + ### Docker ```sh diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index ed43289..1436fb8 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -56,6 +56,9 @@ parser.add_argument( action="store_true", help="list installed 3rd-party plugins (loaded with `--use-plugin`)", ) +parser.add_argument("--llm-client", choices={"OpenAI"}, help="default None") +parser.add_argument("--llm-client-url", help="base URL for --llm-client") +parser.add_argument("--llm-model", help="required for --llm-client") parser.add_argument( "filename", metavar="FILENAME", nargs="?", help="if unspecified, defaults to stdin" ) @@ -89,9 +92,17 @@ def main(args=None): elif args.filename is None: raise ValueError("Filename is required when using Document Intelligence.") + if args.llm_client == "OpenAI": + from openai import OpenAI + llm_client = OpenAI(base_url=args.llm_client_url) + else: + llm_client = None + markitdown = MarkItDown( enable_plugins=args.use_plugins, docintel_endpoint=args.endpoint if args.use_docintel else None, + llm_client=llm_client, + llm_model=args.llm_model, ) if args.filename: From 68724917c76e534fed0d71e3e404493f96fdecc6 Mon Sep 17 00:00:00 2001 From: Casper da Costa-Luis Date: Tue, 17 Dec 2024 08:43:26 +0000 
Subject: [PATCH 8/9] drop --llm-client for now --- README.md | 2 +- packages/markitdown/src/markitdown/__main__.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 0597e77..02153fc 100644 --- a/README.md +++ b/README.md @@ -150,7 +150,7 @@ Or from the CLI: ```bash pip install markitdown -markitdown --llm-client OpenAI --llm-model gpt-4o example.jpg +markitdown --llm-model gpt-4o example.jpg ``` ### Docker diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index 1436fb8..d267766 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -56,9 +56,10 @@ parser.add_argument( action="store_true", help="list installed 3rd-party plugins (loaded with `--use-plugin`)", ) -parser.add_argument("--llm-client", choices={"OpenAI"}, help="default None") -parser.add_argument("--llm-client-url", help="base URL for --llm-client") -parser.add_argument("--llm-model", help="required for --llm-client") +parser.add_argument("--llm-model", metavar="MODEL", help="e.g. 
gpt-4o") +parser.add_argument( + "--llm-client-url", metavar="URL", help="base URL for OpenAI LLM client" +) parser.add_argument( "filename", metavar="FILENAME", nargs="?", help="if unspecified, defaults to stdin" ) @@ -92,8 +93,9 @@ def main(args=None): elif args.filename is None: raise ValueError("Filename is required when using Document Intelligence.") - if args.llm_client == "OpenAI": + if args.llm_model: from openai import OpenAI + llm_client = OpenAI(base_url=args.llm_client_url) else: llm_client = None From a77c4f04154200f885fccef577a3341fbb2a2338 Mon Sep 17 00:00:00 2001 From: Casper da Costa-Luis Date: Wed, 18 Dec 2024 13:29:53 +0000 Subject: [PATCH 9/9] CLI: add --llm-client-header --- packages/markitdown/src/markitdown/__main__.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index d267766..7861857 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -60,6 +60,14 @@ parser.add_argument("--llm-model", metavar="MODEL", help="e.g. gpt-4o") parser.add_argument( "--llm-client-url", metavar="URL", help="base URL for OpenAI LLM client" ) +parser.add_argument( + "-H", + "--llm-client-header", + metavar="HEADER", + action="append", + default=[], + help="may be specified multiple times", +) parser.add_argument( "filename", metavar="FILENAME", nargs="?", help="if unspecified, defaults to stdin" ) @@ -96,7 +104,11 @@ def main(args=None): if args.llm_model: from openai import OpenAI - llm_client = OpenAI(base_url=args.llm_client_url) + headers = {} + for header in args.llm_client_header: + key, value = header.split(":", 1) + headers[key] = value.lstrip() + llm_client = OpenAI(base_url=args.llm_client_url, default_headers=headers) else: llm_client = None