Make this a TypeScript SDK

Convert the project to a TypeScript SDK.

* Add `tsconfig.json` with TypeScript compiler options.
* Add `package.json` with TypeScript dependencies and build scripts.
* Add `src/markitdown/index.ts` to convert `src/markitdown/__main__.py` to TypeScript.
* Add `src/markitdown/markitdown.ts` to convert `src/markitdown/_markitdown.py` to TypeScript.
* Remove Python-specific files: `pyproject.toml`, `Dockerfile`, `src/markitdown/__main__.py`, `src/markitdown/_markitdown.py`, `src/markitdown/__init__.py`, `src/markitdown/__about__.py`, `src/markitdown/py.typed`.
* Update `README.md` to include TypeScript SDK usage instructions.
This commit is contained in:
uratmangun.ovh 2025-01-13 20:10:34 +07:00
parent f58a864951
commit 1e856c3eb6
12 changed files with 1195 additions and 1897 deletions

View file

@ -1,23 +0,0 @@
# Container image that installs markitdown from pip and exposes its CLI.
# Base image: python:3.13-slim-bullseye.
FROM python:3.13-slim-bullseye
# Package installation below is performed as root.
USER root
# Build argument: pass INSTALL_GIT=true to additionally install git.
ARG INSTALL_GIT=false
RUN if [ "$INSTALL_GIT" = "true" ]; then \
apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*; \
fi
# Runtime dependency: ffmpeg (the apt package lists are removed afterwards).
RUN apt-get update && apt-get install -y --no-install-recommends \
ffmpeg \
&& rm -rf /var/lib/apt/lists/*
# Install the markitdown package with pip.
RUN pip install markitdown
# Default USERID and GROUPID; both can be overridden as build arguments.
ARG USERID=10000
ARG GROUPID=10000
# Switch away from root to the configured UID:GID for everything that follows.
USER $USERID:$GROUPID
# The container runs the markitdown command; arguments to `docker run` are passed to it.
ENTRYPOINT [ "markitdown" ]

View file

@ -39,28 +39,28 @@ You can also pipe content:
cat path-to-file.pdf | markitdown
```
### Python API
### TypeScript SDK
Basic usage in Python:
Basic usage in TypeScript:
```python
from markitdown import MarkItDown
```typescript
import { MarkItDown } from 'markitdown';
md = MarkItDown()
result = md.convert("test.xlsx")
print(result.text_content)
const md = new MarkItDown();
const result = md.convert('test.xlsx');
console.log(result.text_content);
```
To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:
```python
from markitdown import MarkItDown
from openai import OpenAI
```typescript
import { MarkItDown } from 'markitdown';
import { OpenAI } from 'openai';
client = OpenAI()
md = MarkItDown(llm_client=client, llm_model="gpt-4o")
result = md.convert("example.jpg")
print(result.text_content)
const client = new OpenAI();
const md = new MarkItDown({ llm_client: client, llm_model: 'gpt-4o' });
const result = md.convert('example.jpg');
console.log(result.text_content);
```
### Docker
@ -76,31 +76,34 @@ docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
This example shows how to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files.
```python convert.py
from markitdown import MarkItDown
from openai import OpenAI
import os
client = OpenAI(api_key="your-api-key-here")
md = MarkItDown(llm_client=client, llm_model="gpt-4o-2024-11-20")
supported_extensions = ('.pptx', '.docx', '.pdf', '.jpg', '.jpeg', '.png')
files_to_convert = [f for f in os.listdir('.') if f.lower().endswith(supported_extensions)]
for file in files_to_convert:
print(f"\nConverting {file}...")
try:
md_file = os.path.splitext(file)[0] + '.md'
result = md.convert(file)
with open(md_file, 'w') as f:
f.write(result.text_content)
print(f"Successfully converted {file} to {md_file}")
except Exception as e:
print(f"Error converting {file}: {str(e)}")
```typescript
import { MarkItDown } from 'markitdown';
import { OpenAI } from 'openai';
import * as fs from 'fs';
import * as path from 'path';
print("\nAll conversions completed!")
// Configure the LLM client used for image descriptions.
const client = new OpenAI({ apiKey: 'your-api-key-here' });
const md = new MarkItDown({ llm_client: client, llm_model: 'gpt-4o-2024-11-20' });

// Only files in the current directory with one of these extensions are converted.
const supportedExtensions = ['.pptx', '.docx', '.pdf', '.jpg', '.jpeg', '.png'];
const filesToConvert = fs
  .readdirSync('.')
  .filter(file => supportedExtensions.includes(path.extname(file).toLowerCase()));

for (const file of filesToConvert) {
  console.log(`\nConverting ${file}...`);
  try {
    // Same base name, `.md` extension, written next to the original file.
    const mdFile = path.basename(file, path.extname(file)) + '.md';
    const result = md.convert(file);
    fs.writeFileSync(mdFile, result.text_content);
    console.log(`Successfully converted ${file} to ${mdFile}`);
  } catch (e: unknown) {
    // Under `strict`, catch variables are `unknown`; narrow before reading `.message`.
    const message = e instanceof Error ? e.message : String(e);
    console.error(`Error converting ${file}: ${message}`);
  }
}

console.log('\nAll conversions completed!');
```
2. Place the script in the same directory as your files
3. Install the required packages, such as `openai`.
4. Run script ```bash python convert.py ```
4. Run the script: `ts-node convert.ts`
Note that original files will remain unchanged and new markdown files are created with the same base name.

19
package.json Normal file
View file

@ -0,0 +1,19 @@
{
  "name": "markitdown",
  "version": "0.0.1",
  "description": "Utility tool for converting various files to Markdown",
  "main": "dist/markitdown/index.js",
  "scripts": {
    "build": "tsc",
    "start": "node dist/markitdown/index.js",
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "author": "Adam Fourney",
  "license": "MIT",
  "dependencies": {
    "axios": "^0.21.1"
  },
  "devDependencies": {
    "@types/node": "^16.0.0",
    "typescript": "^4.4.3"
  }
}

View file

@ -1,85 +0,0 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[project]
name = "markitdown"
dynamic = ["version"]
description = 'Utility tool for converting various files to Markdown'
readme = "README.md"
requires-python = ">=3.10"
license = "MIT"
keywords = []
authors = [
{ name = "Adam Fourney", email = "adamfo@microsoft.com" },
]
classifiers = [
"Development Status :: 4 - Beta",
"Programming Language :: Python",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
]
dependencies = [
"beautifulsoup4",
"requests",
"mammoth",
"markdownify",
"numpy",
"python-pptx",
"pandas",
"openpyxl",
"xlrd",
"pdfminer.six",
"puremagic",
"pydub",
"olefile",
"youtube-transcript-api",
"SpeechRecognition",
"pathvalidate",
"charset-normalizer",
"openai",
]
[project.urls]
Documentation = "https://github.com/microsoft/markitdown#readme"
Issues = "https://github.com/microsoft/markitdown/issues"
Source = "https://github.com/microsoft/markitdown"
[tool.hatch.version]
path = "src/markitdown/__about__.py"
[project.scripts]
markitdown = "markitdown.__main__:main"
[tool.hatch.envs.types]
extra-dependencies = [
"mypy>=1.0.0",
]
[tool.hatch.envs.types.scripts]
check = "mypy --install-types --non-interactive {args:src/markitdown tests}"
[tool.coverage.run]
source_pkgs = ["markitdown", "tests"]
branch = true
parallel = true
omit = [
"src/markitdown/__about__.py",
]
[tool.coverage.paths]
markitdown = ["src/markitdown", "*/markitdown/src/markitdown"]
tests = ["tests", "*/markitdown/tests"]
[tool.coverage.report]
exclude_lines = [
"no cov",
"if __name__ == .__main__.:",
"if TYPE_CHECKING:",
]
[tool.hatch.build.targets.sdist]
only-include = ["src/markitdown"]

View file

@ -1,4 +0,0 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
__version__ = "0.0.1a3"

View file

@ -1,11 +0,0 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
from ._markitdown import MarkItDown, FileConversionException, UnsupportedFormatException
__all__ = [
"MarkItDown",
"FileConversionException",
"UnsupportedFormatException",
]

View file

@ -1,82 +0,0 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
import argparse
import sys
from textwrap import dedent
from .__about__ import __version__
from ._markitdown import MarkItDown, DocumentConverterResult
def main():
    """Command-line entry point for markitdown.

    Parses the command line, converts either the named file or (when no
    filename is given) the bytes read from standard input, and passes the
    result to ``_handle_output``.
    """
    # The usage text is dedented and stripped before being given to argparse.
    usage_text = dedent(
        """
        SYNTAX:
        markitdown <OPTIONAL: FILENAME>
        If FILENAME is empty, markitdown reads from stdin.
        EXAMPLE:
        markitdown example.pdf
        OR
        cat example.pdf | markitdown
        OR
        markitdown < example.pdf
        OR to save to a file use
        markitdown example.pdf -o example.md
        OR
        markitdown example.pdf > example.md
        """
    ).strip()

    cli = argparse.ArgumentParser(
        description="Convert various file formats to markdown.",
        prog="markitdown",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        usage=usage_text,
    )
    cli.add_argument(
        "-v",
        "--version",
        action="version",
        version=f"%(prog)s {__version__}",
        help="show the version number and exit",
    )
    # The filename is optional: nargs="?" leaves it as None when omitted.
    cli.add_argument("filename", nargs="?")
    cli.add_argument(
        "-o",
        "--output",
        help="Output file name. If not provided, output is written to stdout.",
    )
    args = cli.parse_args()

    converter = MarkItDown()
    if args.filename is not None:
        result = converter.convert(args.filename)
    else:
        # No filename given: convert the raw bytes arriving on stdin.
        result = converter.convert_stream(sys.stdin.buffer)
    _handle_output(args, result)
def _handle_output(args, result: DocumentConverterResult):
    """Emit the converted text to ``args.output`` if set, otherwise to stdout.

    Args:
        args: Parsed command-line namespace; only ``args.output`` is read.
        result: Conversion result whose ``text_content`` is written out.
    """
    if not args.output:
        print(result.text_content)
        return

    # The output file is written as UTF-8.
    with open(args.output, "w", encoding="utf-8") as out_file:
        out_file.write(result.text_content)


if __name__ == "__main__":
    main()

File diff suppressed because it is too large Load diff

52
src/markitdown/index.ts Normal file
View file

@ -0,0 +1,52 @@
import * as fs from 'fs';
import * as path from 'path';
import * as readline from 'readline';
import { MarkItDown, DocumentConverterResult } from './markitdown';
const markitdown = new MarkItDown();
/**
 * Converts the document at `filePath` and emits the resulting Markdown.
 *
 * @param filePath - Path of the file handed to `markitdown.convert`.
 * @param outputFilePath - Optional destination, forwarded to `handleOutput`.
 */
function convertFile(filePath: string, outputFilePath?: string): void {
  handleOutput(markitdown.convert(filePath), outputFilePath);
}
/**
 * Converts the content of `inputStream` and emits the resulting Markdown.
 *
 * @param inputStream - Readable stream handed to `markitdown.convertStream`.
 * @param outputFilePath - Optional destination, forwarded to `handleOutput`.
 */
function convertStream(inputStream: NodeJS.ReadableStream, outputFilePath?: string): void {
  handleOutput(markitdown.convertStream(inputStream), outputFilePath);
}
/**
 * Writes the converted text to a file, or prints it when no file is given.
 *
 * @param result - Conversion result; only `text_content` is read.
 * @param outputFilePath - When truthy, the text is written there as UTF-8;
 *   otherwise it is printed with `console.log`.
 */
function handleOutput(result: DocumentConverterResult, outputFilePath?: string): void {
  const markdown = result.text_content;
  if (!outputFilePath) {
    console.log(markdown);
    return;
  }
  fs.writeFileSync(outputFilePath, markdown, 'utf-8');
}
/** Input and output paths extracted from the command line. */
interface CliArgs {
  inputFilePath?: string;
  outputFilePath?: string;
}

/**
 * Splits raw CLI arguments into an input path and an output path.
 *
 * Supports `-o <file>` / `--output <file>` anywhere in the list
 * (e.g. `markitdown example.pdf -o example.md`) and still accepts the purely
 * positional form `markitdown <input> <output>`.
 *
 * @param argv - Arguments after the node executable and script path.
 * @returns The first positional as the input path, and the `-o`/`--output`
 *   value (or, failing that, the second positional) as the output path.
 * @throws Error when `-o`/`--output` is the last argument and has no value.
 */
function parseCliArgs(argv: readonly string[]): CliArgs {
  const positionals: string[] = [];
  let outputFilePath: string | undefined;

  for (let i = 0; i < argv.length; i++) {
    const arg = argv[i];
    if (arg === '-o' || arg === '--output') {
      if (i + 1 >= argv.length) {
        throw new Error(`Option ${arg} requires a file name`);
      }
      outputFilePath = argv[i + 1];
      i++; // Skip the value that was just consumed.
    } else {
      positionals.push(arg);
    }
  }

  return {
    inputFilePath: positionals[0],
    outputFilePath: outputFilePath ?? positionals[1],
  };
}

/**
 * CLI entry point: converts the named file, or standard input when no input
 * file is given, and writes the Markdown to the output file or to stdout.
 */
function main(): void {
  const { inputFilePath, outputFilePath } = parseCliArgs(process.argv.slice(2));

  if (inputFilePath) {
    convertFile(inputFilePath, outputFilePath);
    return;
  }

  // Hand stdin to the converter untouched. Reading it line by line as text
  // would corrupt binary input (PDF, DOCX, ...), and the collected content
  // must never be passed to fs.createReadStream, which expects a file path.
  convertStream(process.stdin, outputFilePath);
}

main();

1072
src/markitdown/markitdown.ts Normal file

File diff suppressed because it is too large Load diff

14
tsconfig.json Normal file
View file

@ -0,0 +1,14 @@
{
  "compilerOptions": {
    "target": "ES6",
    "module": "commonjs",
    "strict": true,
    "esModuleInterop": true,
    "skipLibCheck": true,
    "forceConsistentCasingInFileNames": true,
    // Emit .d.ts files next to the compiled JavaScript so that TypeScript
    // consumers of the package get type information.
    "declaration": true,
    // Output mirrors the layout under rootDir, e.g.
    // src/markitdown/index.ts -> dist/markitdown/index.js.
    "outDir": "./dist",
    "rootDir": "./src"
  },
  "include": ["src/**/*.ts"],
  "exclude": ["node_modules", "**/*.spec.ts"]
}