Make this a TypeScript SDK

a
2025-01-13 20:10:14 +07:00 · 2025-01-13 20:10:14 +07:00 · 8176a4e2cb
commit 8176a4e2cb
parent f58a864951
12 changed files with 1195 additions and 1897 deletions
--- a/23
+++ b/23
@ -1,23 +0,0 @@
 # Image that installs the markitdown package with pip and runs its CLI as the entrypoint.
 FROM python:3.13-slim-bullseye
 # The package installation steps below run as root.
 USER root
 # Optional: build with --build-arg INSTALL_GIT=true to also install git.
 ARG INSTALL_GIT=false
 RUN if [ "$INSTALL_GIT" = "true" ]; then \
    apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*; \
    fi
 # Runtime dependency: ffmpeg (the apt package lists are removed afterwards)
 RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    && rm -rf /var/lib/apt/lists/*
 RUN pip install markitdown
 # Default USERID and GROUPID; both can be overridden with --build-arg.
 ARG USERID=10000
 ARG GROUPID=10000
 # From here on (including the entrypoint) the container runs as this user and group.
 USER $USERID:$GROUPID
 ENTRYPOINT [ "markitdown" ]
--- a/README.md
+++ b/README.md
@ -39,28 +39,28 @@ You can also pipe content:
 cat path-to-file.pdf | markitdown
 ```
-### Python API
+### TypeScript SDK
-Basic usage in Python:
+Basic usage in TypeScript:
-```python
+```typescript
-from markitdown import MarkItDown
+import { MarkItDown } from 'markitdown';
-md = MarkItDown()
+const md = new MarkItDown();
-result = md.convert("test.xlsx")
+const result = md.convert('test.xlsx');
-print(result.text_content)
+console.log(result.text_content);
 ```
 To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:
-```python
+```typescript
-from markitdown import MarkItDown
+import { MarkItDown } from 'markitdown';
-from openai import OpenAI
+import { OpenAI } from 'openai';
-client = OpenAI()
+const client = new OpenAI();
-md = MarkItDown(llm_client=client, llm_model="gpt-4o")
+const md = new MarkItDown({ llm_client: client, llm_model: 'gpt-4o' });
-result = md.convert("example.jpg")
+const result = md.convert('example.jpg');
-print(result.text_content)
+console.log(result.text_content);
 ```
 ### Docker
@ -76,31 +76,34 @@ docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
 This example shows how to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files.
-```python convert.py
+```typescript
-from markitdown import MarkItDown
+import { MarkItDown } from 'markitdown';
-from openai import OpenAI
+import { OpenAI } from 'openai';
-import os
+import * as fs from 'fs';
-client = OpenAI(api_key="your-api-key-here")
+import * as path from 'path';
 md = MarkItDown(llm_client=client, llm_model="gpt-4o-2024-11-20")
 supported_extensions = ('.pptx', '.docx', '.pdf', '.jpg', '.jpeg', '.png')
 files_to_convert = [f for f in os.listdir('.') if f.lower().endswith(supported_extensions)]
 for file in files_to_convert:
    print(f"\nConverting {file}...")
    try:
        md_file = os.path.splitext(file)[0] + '.md'
        result = md.convert(file)
        with open(md_file, 'w') as f:
            f.write(result.text_content)
        print(f"Successfully converted {file} to {md_file}")
    except Exception as e:
        print(f"Error converting {file}: {str(e)}")
-print("\nAll conversions completed!")
+const client = new OpenAI({ apiKey: 'your-api-key-here' });
 const md = new MarkItDown({ llm_client: client, llm_model: 'gpt-4o-2024-11-20' });
 // Only files whose extension (compared case-insensitively) is listed here are converted.
 const supportedExtensions = ['.pptx', '.docx', '.pdf', '.jpg', '.jpeg', '.png'];
 const filesToConvert = fs.readdirSync('.').filter(file => supportedExtensions.includes(path.extname(file).toLowerCase()));
 filesToConvert.forEach(file => {
     console.log(`\nConverting ${file}...`);
     try {
         // Output keeps the source file's base name and swaps the extension for .md.
         const mdFile = path.basename(file, path.extname(file)) + '.md';
         const result = md.convert(file);
         fs.writeFileSync(mdFile, result.text_content);
         console.log(`Successfully converted ${file} to ${mdFile}`);
     } catch (e) {
         // Under `strict`, a catch variable is `unknown`; narrow it before reading `.message`.
         const reason = e instanceof Error ? e.message : String(e);
         console.error(`Error converting ${file}: ${reason}`);
     }
 });
 console.log('\nAll conversions completed!');
 ```
 2. Place the script in the same directory as your files
 3. Install the required packages (for example, `openai`)
-4. Run script ```bash python convert.py ```
+4. Run script ```bash ts-node convert.ts ```
 Note that original files will remain unchanged and new markdown files are created with the same base name.
--- a/package.json
+++ b/package.json
@ -0,0 +1,19 @@
 {
  "name": "markitdown",
  "version": "0.0.1",
  "description": "Utility tool for converting various files to Markdown",
  "main": "dist/markitdown/index.js",
  "scripts": {
    "build": "tsc",
    "start": "node dist/markitdown/index.js",
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "author": "Adam Fourney",
  "license": "MIT",
  "dependencies": {
    "axios": "^0.21.1"
  },
  "devDependencies": {
    "@types/node": "^16.11.0",
    "typescript": "^4.4.3"
  }
 }
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,85 +0,0 @@
 [build-system]
 requires = ["hatchling"]
 build-backend = "hatchling.build"
 [project]
 name = "markitdown"
 dynamic = ["version"]
 description = 'Utility tool for converting various files to Markdown'
 readme = "README.md"
 requires-python = ">=3.10"
 license = "MIT"
 keywords = []
 authors = [
  { name = "Adam Fourney", email = "adamfo@microsoft.com" },
 ]
 classifiers = [
  "Development Status :: 4 - Beta",
  "Programming Language :: Python",
  "Programming Language :: Python :: 3.10",
  "Programming Language :: Python :: 3.11",
  "Programming Language :: Python :: 3.12",
  "Programming Language :: Python :: 3.13",
  "Programming Language :: Python :: Implementation :: CPython",
  "Programming Language :: Python :: Implementation :: PyPy",
 ]
 dependencies = [
  "beautifulsoup4",
  "requests",
  "mammoth",
  "markdownify",
  "numpy",
  "python-pptx",
  "pandas",
  "openpyxl",
  "xlrd",
  "pdfminer.six",
  "puremagic",
  "pydub",
  "olefile",
  "youtube-transcript-api",
  "SpeechRecognition",
  "pathvalidate",
  "charset-normalizer",
  "openai",
 ]
 [project.urls]
 Documentation = "https://github.com/microsoft/markitdown#readme"
 Issues = "https://github.com/microsoft/markitdown/issues"
 Source = "https://github.com/microsoft/markitdown"
 [tool.hatch.version]
 path = "src/markitdown/__about__.py"
 [project.scripts]
 markitdown = "markitdown.__main__:main"
 [tool.hatch.envs.types]
 extra-dependencies = [
  "mypy>=1.0.0",
 ]
 [tool.hatch.envs.types.scripts]
 check = "mypy --install-types --non-interactive {args:src/markitdown tests}"
 [tool.coverage.run]
 source_pkgs = ["markitdown", "tests"]
 branch = true
 parallel = true
 omit = [
  "src/markitdown/__about__.py",
 ]
 [tool.coverage.paths]
 markitdown = ["src/markitdown", "*/markitdown/src/markitdown"]
 tests = ["tests", "*/markitdown/tests"]
 [tool.coverage.report]
 exclude_lines = [
  "no cov",
  "if __name__ == .__main__.:",
  "if TYPE_CHECKING:",
 ]
 [tool.hatch.build.targets.sdist]
 only-include = ["src/markitdown"]
--- a/src/markitdown/about.py
+++ b/src/markitdown/about.py
@ -1,4 +0,0 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
 __version__ = "0.0.1a3"
--- a/src/markitdown/init.py
+++ b/src/markitdown/init.py
@ -1,11 +0,0 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
 from ._markitdown import MarkItDown, FileConversionException, UnsupportedFormatException
 __all__ = [
    "MarkItDown",
    "FileConversionException",
    "UnsupportedFormatException",
 ]
--- a/src/markitdown/main.py
+++ b/src/markitdown/main.py
@ -1,82 +0,0 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
 import argparse
 import sys
 from textwrap import dedent
 from .__about__ import __version__
 from ._markitdown import MarkItDown, DocumentConverterResult
 def main():
    """Command-line entry point: convert FILENAME, or stdin when it is omitted, to markdown.

    The converted text is passed to ``_handle_output`` together with the parsed
    arguments, which decides between the ``-o/--output`` file and stdout.
    """
    cli = argparse.ArgumentParser(
        description="Convert various file formats to markdown.",
        prog="markitdown",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        usage=dedent(
            """
            SYNTAX:
                markitdown <OPTIONAL: FILENAME>
                If FILENAME is empty, markitdown reads from stdin.
            EXAMPLE:
                markitdown example.pdf
                OR
                cat example.pdf | markitdown
                OR
                markitdown < example.pdf
                OR to save to a file use
                markitdown example.pdf -o example.md
                OR
                markitdown example.pdf > example.md
            """
        ).strip(),
    )
    cli.add_argument(
        "-v",
        "--version",
        action="version",
        version=f"%(prog)s {__version__}",
        help="show the version number and exit",
    )
    cli.add_argument("filename", nargs="?")
    cli.add_argument(
        "-o",
        "--output",
        help="Output file name. If not provided, output is written to stdout.",
    )
    options = cli.parse_args()
    converter = MarkItDown()
    if options.filename is not None:
        converted = converter.convert(options.filename)
    else:
        # No FILENAME on the command line: convert the bytes arriving on stdin.
        converted = converter.convert_stream(sys.stdin.buffer)
    _handle_output(options, converted)
 def _handle_output(args, result: DocumentConverterResult):
    """Write ``result.text_content`` to ``args.output`` when set, otherwise print it."""
    if not args.output:
        print(result.text_content)
        return
    with open(args.output, "w", encoding="utf-8") as out_file:
        out_file.write(result.text_content)
 if __name__ == "__main__":
    main()
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
--- a/src/markitdown/index.ts
+++ b/src/markitdown/index.ts
@ -0,0 +1,52 @@
 import * as fs from 'fs';
 import * as path from 'path';
 import * as readline from 'readline';
 import { MarkItDown, DocumentConverterResult } from './markitdown';
 const markitdown = new MarkItDown();
 /**
  * Converts the file at `filePath` with the shared `markitdown` instance and
  * forwards the result to `handleOutput`.
  *
  * @param filePath - Path of the document to convert.
  * @param outputFilePath - Optional destination file, passed through to `handleOutput`.
  */
 function convertFile(filePath: string, outputFilePath?: string): void {
     handleOutput(markitdown.convert(filePath), outputFilePath);
 }
 /**
  * Converts the document read from `inputStream` with the shared `markitdown`
  * instance and forwards the result to `handleOutput`.
  *
  * @param inputStream - Readable stream supplying the document.
  * @param outputFilePath - Optional destination file, passed through to `handleOutput`.
  */
 function convertStream(inputStream: NodeJS.ReadableStream, outputFilePath?: string): void {
     handleOutput(markitdown.convertStream(inputStream), outputFilePath);
 }
 /**
  * Emits the converted text: written to `outputFilePath` as UTF-8 when a path
  * is given, otherwise logged to the console.
  *
  * @param result - Conversion result whose `text_content` is emitted.
  * @param outputFilePath - Optional destination file.
  */
 function handleOutput(result: DocumentConverterResult, outputFilePath?: string): void {
     if (!outputFilePath) {
         console.log(result.text_content);
         return;
     }
     fs.writeFileSync(outputFilePath, result.text_content, 'utf-8');
 }
 /**
  * CLI entry point.
  *
  * Reads `<input-file> [output-file]` from the command line. With an input
  * file, that file is converted; without one, the document is read from
  * standard input.
  */
 function main(): void {
     const [inputFilePath, outputFilePath] = process.argv.slice(2);
     if (inputFilePath) {
         convertFile(inputFilePath, outputFilePath);
         return;
     }
     // Pass stdin through untouched. Buffering it as text lines would corrupt
     // binary documents, and fs.createReadStream() takes a file path, not
     // content, so it cannot be used to wrap data that was already read.
     convertStream(process.stdin, outputFilePath);
 }
 main();
--- a/src/markitdown/markitdown.ts
+++ b/src/markitdown/markitdown.ts
--- a/src/markitdown/py.typed
+++ b/src/markitdown/py.typed
--- a/tsconfig.json
+++ b/tsconfig.json
@ -0,0 +1,14 @@
 {
  "compilerOptions": {
    "target": "ES6",
    "module": "commonjs",
    "strict": true,
    "esModuleInterop": true,
    "skipLibCheck": true,
    "forceConsistentCasingInFileNames": true,
    "outDir": "./dist",
    "rootDir": "./src"
  },
  "include": ["src/**/*.ts"],
  "exclude": ["node_modules", "**/*.spec.ts"]
 }