From c0c0533a5edd29678ecbeedeb82385e9e2b4beef Mon Sep 17 00:00:00 2001 From: Casper da Costa-Luis Date: Tue, 17 Dec 2024 05:51:30 +0000 Subject: [PATCH 1/6] slight tidy - follow-up to #46 --- .../markitdown/src/markitdown/__main__.py | 55 +++++-------------- 1 file changed, 13 insertions(+), 42 deletions(-) diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index 6a24391..a74320a 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -14,36 +14,14 @@ def main(): description="Convert various file formats to markdown.", prog="markitdown", formatter_class=argparse.RawDescriptionHelpFormatter, - usage=dedent( - """ - SYNTAX: - - markitdown - If FILENAME is empty, markitdown reads from stdin. - - EXAMPLE: - - markitdown example.pdf - - OR - - cat example.pdf | markitdown - - OR - - markitdown < example.pdf - - OR to save to a file use - - markitdown example.pdf -o example.md - - OR - - markitdown example.pdf > example.md - """ - ).strip(), + epilog=dedent( + """\ + examples: + markitdown example.pdf + markitdown -o example.md example.pdf + cat example.pdf | markitdown > example.md""" + ), ) - parser.add_argument( "-v", "--version", @@ -51,41 +29,39 @@ def main(): version=f"%(prog)s {__version__}", help="show the version number and exit", ) - parser.add_argument( "-o", "--output", - help="Output file name. If not provided, output is written to stdout.", + dest="filename", + help="if unspecified, defaults to stdout", ) - parser.add_argument( "-d", "--use-docintel", action="store_true", help="Use Document Intelligence to extract text instead of offline conversion. Requires a valid Document Intelligence Endpoint.", ) - parser.add_argument( "-e", "--endpoint", type=str, help="Document Intelligence Endpoint. Required if using Document Intelligence.", ) - parser.add_argument( "-p", "--use-plugins", action="store_true", help="Use 3rd-party plugins to convert files. Use --list-plugins to see installed plugins.", ) - parser.add_argument( "--list-plugins", action="store_true", help="List installed 3rd-party plugins. Plugins are loaded when using the -p or --use-plugin option.", ) + parser.add_argument( + "filename", nargs="?", help="if unspecified, defaults to stdin" + ) - parser.add_argument("filename", nargs="?") args = parser.parse_args() if args.list_plugins: @@ -123,14 +99,9 @@ def main(): else: result = markitdown.convert(args.filename) - _handle_output(args, result) - - -def _handle_output(args, result: DocumentConverterResult): - """Handle output to stdout or file""" if args.output: with open(args.output, "w", encoding="utf-8") as f: - f.write(result.text_content) + print(result.text_content, file=f) else: print(result.text_content) From b0406ca2c798f485103dfb225fddc453a91322b1 Mon Sep 17 00:00:00 2001 From: Casper da Costa-Luis Date: Sun, 22 Dec 2024 08:36:55 +0000 Subject: [PATCH 2/6] global parser makes external integration with e.g. `shtab` easy --- .../markitdown/src/markitdown/__main__.py | 124 +++++++++--------- 1 file changed, 62 insertions(+), 62 deletions(-) diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index a74320a..60afc3e 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -8,77 +8,77 @@ from importlib.metadata import entry_points from .__about__ import __version__ from ._markitdown import MarkItDown, DocumentConverterResult +parser = argparse.ArgumentParser( + description="Convert various file formats to markdown.", + prog="markitdown", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=dedent( + """\ + examples: + markitdown example.pdf + markitdown -o example.md example.pdf + cat example.pdf | markitdown > example.md""" + ), +) +parser.add_argument( + "-v", + "--version", + action="version", + version=f"%(prog)s {__version__}", + help="show the version number and exit", +) +parser.add_argument( + "-o", + "--output", + metavar="outfilename", + help="if unspecified, defaults to stdout", +) +parser.add_argument( + "-d", + "--use-docintel", + action="store_true", + help="Use Document Intelligence to extract text instead of offline conversion. Requires a valid Document Intelligence Endpoint.", +) +parser.add_argument( + "-e", + "--endpoint", + type=str, + help="Document Intelligence Endpoint. Required if using Document Intelligence.", +) +parser.add_argument( + "-p", + "--use-plugins", + action="store_true", + help="Use 3rd-party plugins to convert files. Use --list-plugins to see installed plugins.", +) +parser.add_argument( + "--list-plugins", + action="store_true", + help="List installed 3rd-party plugins. Plugins are loaded when using the -p or --use-plugin option.", +) +parser.add_argument( + "filename", nargs="?", help="if unspecified, defaults to stdin" +) -def main(): - parser = argparse.ArgumentParser( - description="Convert various file formats to markdown.", - prog="markitdown", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=dedent( - """\ - examples: - markitdown example.pdf - markitdown -o example.md example.pdf - cat example.pdf | markitdown > example.md""" - ), - ) - parser.add_argument( - "-v", - "--version", - action="version", - version=f"%(prog)s {__version__}", - help="show the version number and exit", - ) - parser.add_argument( - "-o", - "--output", - dest="filename", - help="if unspecified, defaults to stdout", - ) - parser.add_argument( - "-d", - "--use-docintel", - action="store_true", - help="Use Document Intelligence to extract text instead of offline conversion. Requires a valid Document Intelligence Endpoint.", - ) - parser.add_argument( - "-e", - "--endpoint", - type=str, - help="Document Intelligence Endpoint. Required if using Document Intelligence.", - ) - parser.add_argument( - "-p", - "--use-plugins", - action="store_true", - help="Use 3rd-party plugins to convert files. Use --list-plugins to see installed plugins.", - ) - parser.add_argument( - "--list-plugins", - action="store_true", - help="List installed 3rd-party plugins. Plugins are loaded when using the -p or --use-plugin option.", - ) - parser.add_argument( - "filename", nargs="?", help="if unspecified, defaults to stdin" - ) - args = parser.parse_args() +def main(args=None): + args = parser.parse_args(args) if args.list_plugins: # List installed plugins, then exit print("Installed MarkItDown 3rd-party Plugins:\n") plugin_entry_points = list(entry_points(group="markitdown.plugin")) - if len(plugin_entry_points) == 0: - print(" * No 3rd-party plugins installed.") - print( - "\nFind plugins by searching for the hashtag #markitdown-plugin on GitHub.\n" - ) - else: + if plugin_entry_points: for entry_point in plugin_entry_points: print(f" * {entry_point.name:<16}\t(package: {entry_point.value})") print( "\nUse the -p (or --use-plugins) option to enable 3rd-party plugins.\n" ) + else: + print("No 3rd-party plugins installed.") + print( + "\nFind plugins by searching for the hashtag #markitdown-plugin on GitHub.\n" + ) sys.exit(0) if args.use_docintel: @@ -94,10 +94,10 @@ def main(): else: markitdown = MarkItDown(enable_plugins=args.use_plugins) - if args.filename is None: - result = markitdown.convert_stream(sys.stdin.buffer) - else: + if args.filename: result = markitdown.convert(args.filename) + else: + result = markitdown.convert_stream(sys.stdin.buffer) if args.output: with open(args.output, "w", encoding="utf-8") as f: From 38feb5e7a1a37d3ca25fdc5fd34fcfc5d43fa4db Mon Sep 17 00:00:00 2001 From: Casper da Costa-Luis Date: Thu, 6 Mar 2025 14:07:01 +0000 Subject: [PATCH 3/6] tidy docstrings --- packages/markitdown/src/markitdown/__main__.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index 60afc3e..a91e1fb 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -37,28 +37,26 @@ parser.add_argument( "-d", "--use-docintel", action="store_true", - help="Use Document Intelligence to extract text instead of offline conversion. Requires a valid Document Intelligence Endpoint.", + help="use online Document Intelligence to extract text (requires a valid `--endpoint`)", ) parser.add_argument( "-e", "--endpoint", type=str, - help="Document Intelligence Endpoint. Required if using Document Intelligence.", + help="required for `--use-docintel`", ) parser.add_argument( "-p", "--use-plugins", action="store_true", - help="Use 3rd-party plugins to convert files. Use --list-plugins to see installed plugins.", + help="use 3rd-party plugins to convert files (see `--list-plugins`)", ) parser.add_argument( "--list-plugins", action="store_true", - help="List installed 3rd-party plugins. Plugins are loaded when using the -p or --use-plugin option.", -) -parser.add_argument( - "filename", nargs="?", help="if unspecified, defaults to stdin" + help="list installed 3rd-party plugins (loaded with `--use-plugin`)", ) +parser.add_argument("filename", nargs="?", help="if unspecified, defaults to stdin") def main(args=None): From e270e63bbcccd4ebd82e7dcd2f87f56df90931ad Mon Sep 17 00:00:00 2001 From: Casper da Costa-Luis Date: Thu, 6 Mar 2025 14:22:16 +0000 Subject: [PATCH 4/6] standardise metavars --- packages/markitdown/src/markitdown/__main__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index a91e1fb..b048c88 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -30,7 +30,7 @@ parser.add_argument( parser.add_argument( "-o", "--output", - metavar="outfilename", + metavar="OUTFILENAME", help="if unspecified, defaults to stdout", ) parser.add_argument( @@ -56,7 +56,9 @@ parser.add_argument( action="store_true", help="list installed 3rd-party plugins (loaded with `--use-plugin`)", ) -parser.add_argument("filename", nargs="?", help="if unspecified, defaults to stdin") +parser.add_argument( + "filename", metavar="FILENAME", nargs="?", help="if unspecified, defaults to stdin" +) def main(args=None): From 3ebe8dfacb705866f27e2c45cb7826a24628a2e0 Mon Sep 17 00:00:00 2001 From: Casper da Costa-Luis Date: Thu, 6 Mar 2025 14:22:29 +0000 Subject: [PATCH 5/6] slight if-else tidy --- packages/markitdown/src/markitdown/__main__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index b048c88..ed43289 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -88,11 +88,11 @@ def main(args=None): ) elif args.filename is None: raise ValueError("Filename is required when using Document Intelligence.") - markitdown = MarkItDown( - enable_plugins=args.use_plugins, docintel_endpoint=args.endpoint - ) - else: - markitdown = MarkItDown(enable_plugins=args.use_plugins) + + markitdown = MarkItDown( + enable_plugins=args.use_plugins, + docintel_endpoint=args.endpoint if args.use_docintel else None, + ) if args.filename: result = markitdown.convert(args.filename) From 011328920b63ed169d986f1c2083a2d28c1f5bd6 Mon Sep 17 00:00:00 2001 From: Casper da Costa-Luis Date: Sat, 8 Mar 2025 16:12:11 +0000 Subject: [PATCH 6/6] update tests --- packages/markitdown/tests/test_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/markitdown/tests/test_cli.py b/packages/markitdown/tests/test_cli.py index 7c8afc2..02004af 100644 --- a/packages/markitdown/tests/test_cli.py +++ b/packages/markitdown/tests/test_cli.py @@ -33,7 +33,7 @@ def test_invalid_flag(shared_tmp_dir) -> None: assert ( "unrecognized arguments" in result.stderr ), f"Expected 'unrecognized arguments' to appear in STDERR" - assert "SYNTAX" in result.stderr, f"Expected 'SYNTAX' to appear in STDERR" + assert "usage" in result.stderr, f"Expected 'usage' to appear in STDERR" def test_output_to_stdout(shared_tmp_dir) -> None: