Make this a TypeScript SDK

a
This commit is contained in:
uratmangun.ovh 2025-01-13 20:10:14 +07:00
parent f58a864951
commit 8176a4e2cb
12 changed files with 1195 additions and 1897 deletions

View file

@ -1,23 +0,0 @@
# Build on the python:3.13-slim-bullseye base image.
FROM python:3.13-slim-bullseye
# The package installation steps below run as root.
USER root
# Optional build argument: git is installed only when INSTALL_GIT is exactly "true".
ARG INSTALL_GIT=false
RUN if [ "$INSTALL_GIT" = "true" ]; then \
apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*; \
fi
# Runtime dependency: ffmpeg, installed without recommended packages; the apt
# package lists are removed in the same layer.
RUN apt-get update && apt-get install -y --no-install-recommends \
ffmpeg \
&& rm -rf /var/lib/apt/lists/*
# Install the markitdown package with pip.
RUN pip install markitdown
# Default USERID and GROUPID (both overridable as build arguments).
ARG USERID=10000
ARG GROUPID=10000
# Switch from root to the configured user and group.
USER $USERID:$GROUPID
# The container's entrypoint is the `markitdown` command.
ENTRYPOINT [ "markitdown" ]

View file

@ -39,28 +39,28 @@ You can also pipe content:
cat path-to-file.pdf | markitdown cat path-to-file.pdf | markitdown
``` ```
### Python API ### TypeScript SDK
Basic usage in Python: Basic usage in TypeScript:
```python ```typescript
from markitdown import MarkItDown import { MarkItDown } from 'markitdown';
md = MarkItDown() const md = new MarkItDown();
result = md.convert("test.xlsx") const result = md.convert('test.xlsx');
print(result.text_content) console.log(result.text_content);
``` ```
To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`: To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:
```python ```typescript
from markitdown import MarkItDown import { MarkItDown } from 'markitdown';
from openai import OpenAI import { OpenAI } from 'openai';
client = OpenAI() const client = new OpenAI();
md = MarkItDown(llm_client=client, llm_model="gpt-4o") const md = new MarkItDown({ llm_client: client, llm_model: 'gpt-4o' });
result = md.convert("example.jpg") const result = md.convert('example.jpg');
print(result.text_content) console.log(result.text_content);
``` ```
### Docker ### Docker
@ -76,31 +76,34 @@ docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
This example shows how to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files. This example shows how to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files.
```python convert.py ```typescript
from markitdown import MarkItDown import { MarkItDown } from 'markitdown';
from openai import OpenAI import { OpenAI } from 'openai';
import os import * as fs from 'fs';
client = OpenAI(api_key="your-api-key-here") import * as path from 'path';
md = MarkItDown(llm_client=client, llm_model="gpt-4o-2024-11-20")
supported_extensions = ('.pptx', '.docx', '.pdf', '.jpg', '.jpeg', '.png')
files_to_convert = [f for f in os.listdir('.') if f.lower().endswith(supported_extensions)]
for file in files_to_convert:
print(f"\nConverting {file}...")
try:
md_file = os.path.splitext(file)[0] + '.md'
result = md.convert(file)
with open(md_file, 'w') as f:
f.write(result.text_content)
print(f"Successfully converted {file} to {md_file}")
except Exception as e:
print(f"Error converting {file}: {str(e)}")
print("\nAll conversions completed!") const client = new OpenAI({ apiKey: 'your-api-key-here' });
const md = new MarkItDown({ llm_client: client, llm_model: 'gpt-4o-2024-11-20' });
const supportedExtensions = ['.pptx', '.docx', '.pdf', '.jpg', '.jpeg', '.png'];
const filesToConvert = fs.readdirSync('.').filter(file => supportedExtensions.includes(path.extname(file).toLowerCase()));
// Convert each matching file in turn; a failure on one file is reported and
// does not stop the remaining conversions.
filesToConvert.forEach(file => {
  console.log(`\nConverting ${file}...`);
  try {
    // Same base name as the source file, with the extension replaced by `.md`.
    const mdFile = path.basename(file, path.extname(file)) + '.md';
    const result = md.convert(file);
    fs.writeFileSync(mdFile, result.text_content);
    console.log(`Successfully converted ${file} to ${mdFile}`);
  } catch (e) {
    // Under `strict` a caught value is typed `unknown`, so narrow it before
    // reading `.message`; non-Error throwables are stringified instead.
    const reason = e instanceof Error ? e.message : String(e);
    console.error(`Error converting ${file}: ${reason}`);
  }
});
console.log('\nAll conversions completed!');
``` ```
2. Place the script in the same directory as your files 2. Place the script in the same directory as your files
3. Install the required packages, such as openai 3. Install the required packages, such as openai
4. Run script ```bash python convert.py ``` 4. Run script ```bash ts-node convert.ts ```
Note that original files will remain unchanged and new markdown files are created with the same base name. Note that original files will remain unchanged and new markdown files are created with the same base name.

19
package.json Normal file
View file

@ -0,0 +1,19 @@
{
"name": "markitdown",
"version": "0.0.1",
"description": "Utility tool for converting various files to Markdown",
"main": "dist/index.js",
"scripts": {
"build": "tsc",
"start": "node dist/index.js",
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "Adam Fourney",
"license": "MIT",
"dependencies": {
"axios": "^0.21.1"
},
"devDependencies": {
"typescript": "^4.4.3"
}
}

View file

@ -1,85 +0,0 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[project]
name = "markitdown"
dynamic = ["version"]
description = 'Utility tool for converting various files to Markdown'
readme = "README.md"
requires-python = ">=3.10"
license = "MIT"
keywords = []
authors = [
{ name = "Adam Fourney", email = "adamfo@microsoft.com" },
]
classifiers = [
"Development Status :: 4 - Beta",
"Programming Language :: Python",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
]
dependencies = [
"beautifulsoup4",
"requests",
"mammoth",
"markdownify",
"numpy",
"python-pptx",
"pandas",
"openpyxl",
"xlrd",
"pdfminer.six",
"puremagic",
"pydub",
"olefile",
"youtube-transcript-api",
"SpeechRecognition",
"pathvalidate",
"charset-normalizer",
"openai",
]
[project.urls]
Documentation = "https://github.com/microsoft/markitdown#readme"
Issues = "https://github.com/microsoft/markitdown/issues"
Source = "https://github.com/microsoft/markitdown"
[tool.hatch.version]
path = "src/markitdown/__about__.py"
[project.scripts]
markitdown = "markitdown.__main__:main"
[tool.hatch.envs.types]
extra-dependencies = [
"mypy>=1.0.0",
]
[tool.hatch.envs.types.scripts]
check = "mypy --install-types --non-interactive {args:src/markitdown tests}"
[tool.coverage.run]
source_pkgs = ["markitdown", "tests"]
branch = true
parallel = true
omit = [
"src/markitdown/__about__.py",
]
[tool.coverage.paths]
markitdown = ["src/markitdown", "*/markitdown/src/markitdown"]
tests = ["tests", "*/markitdown/tests"]
[tool.coverage.report]
exclude_lines = [
"no cov",
"if __name__ == .__main__.:",
"if TYPE_CHECKING:",
]
[tool.hatch.build.targets.sdist]
only-include = ["src/markitdown"]

View file

@ -1,4 +0,0 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
__version__ = "0.0.1a3"

View file

@ -1,11 +0,0 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
from ._markitdown import MarkItDown, FileConversionException, UnsupportedFormatException
__all__ = [
"MarkItDown",
"FileConversionException",
"UnsupportedFormatException",
]

View file

@ -1,82 +0,0 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
import argparse
import sys
from textwrap import dedent
from .__about__ import __version__
from ._markitdown import MarkItDown, DocumentConverterResult
def main():
    """Command-line entry point: convert a file (or stdin) to Markdown.

    Parses the command line, converts either the named file or the bytes
    read from standard input, and passes the result to ``_handle_output``.
    """
    # The usage text is left flush-left so the literal stays unchanged;
    # dedent() and strip() are applied to it exactly as before.
    usage_text = dedent(
        """
SYNTAX:
markitdown <OPTIONAL: FILENAME>
If FILENAME is empty, markitdown reads from stdin.
EXAMPLE:
markitdown example.pdf
OR
cat example.pdf | markitdown
OR
markitdown < example.pdf
OR to save to a file use
markitdown example.pdf -o example.md
OR
markitdown example.pdf > example.md
"""
    ).strip()

    cli = argparse.ArgumentParser(
        description="Convert various file formats to markdown.",
        prog="markitdown",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        usage=usage_text,
    )
    cli.add_argument(
        "-v",
        "--version",
        action="version",
        version=f"%(prog)s {__version__}",
        help="show the version number and exit",
    )
    # The input file is optional; when it is omitted the input comes from stdin.
    cli.add_argument("filename", nargs="?")
    cli.add_argument(
        "-o",
        "--output",
        help="Output file name. If not provided, output is written to stdout.",
    )
    args = cli.parse_args()

    converter = MarkItDown()
    if args.filename is None:
        # No file name given: convert the raw bytes of standard input.
        result = converter.convert_stream(sys.stdin.buffer)
    else:
        result = converter.convert(args.filename)
    _handle_output(args, result)
def _handle_output(args, result: DocumentConverterResult):
    """Write the converted text to ``args.output`` if set, otherwise print it."""
    if not args.output:
        print(result.text_content)
        return
    # An output file was requested: write the Markdown as UTF-8 text.
    with open(args.output, "w", encoding="utf-8") as out_file:
        out_file.write(result.text_content)
# Run the CLI only when this module is executed as a script, not when imported.
if __name__ == "__main__":
    main()

File diff suppressed because it is too large Load diff

52
src/markitdown/index.ts Normal file
View file

@ -0,0 +1,52 @@
import * as fs from 'fs';
import * as path from 'path';
import * as readline from 'readline';
import { MarkItDown, DocumentConverterResult } from './markitdown';
// Single converter instance shared by convertFile and convertStream.
const markitdown = new MarkItDown();
/**
 * Converts the file at `filePath` and forwards the result to `handleOutput`.
 *
 * @param filePath - Path passed to `markitdown.convert`.
 * @param outputFilePath - Optional destination, passed through to `handleOutput`.
 */
function convertFile(filePath: string, outputFilePath?: string): void {
  handleOutput(markitdown.convert(filePath), outputFilePath);
}
/**
 * Converts the contents of `inputStream` and forwards the result to `handleOutput`.
 *
 * @param inputStream - Readable stream passed to `markitdown.convertStream`.
 * @param outputFilePath - Optional destination, passed through to `handleOutput`.
 */
function convertStream(inputStream: NodeJS.ReadableStream, outputFilePath?: string): void {
  handleOutput(markitdown.convertStream(inputStream), outputFilePath);
}
/**
 * Emits the converted Markdown.
 *
 * @param result - Conversion result whose `text_content` is emitted.
 * @param outputFilePath - When provided (and non-empty), the text is written
 *   to this file as UTF-8; otherwise it is logged to the console.
 */
function handleOutput(result: DocumentConverterResult, outputFilePath?: string): void {
  if (!outputFilePath) {
    console.log(result.text_content);
    return;
  }
  fs.writeFileSync(outputFilePath, result.text_content, 'utf-8');
}
/**
 * Splits raw CLI arguments into an input path and an output path.
 *
 * `-o <file>` / `--output <file>` selects the output file. For backward
 * compatibility a second positional argument is still accepted as the output
 * path when no flag is given.
 *
 * @param args - Command-line arguments without the node and script entries.
 * @returns The first positional argument as `inputFilePath` (undefined when
 *   absent) and the resolved `outputFilePath` (undefined when absent).
 * @throws Error when `-o`/`--output` is the last argument and has no value.
 */
function parseCliArgs(
  args: readonly string[]
): { inputFilePath: string | undefined; outputFilePath: string | undefined } {
  const positional: string[] = [];
  let flaggedOutput: string | undefined;
  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    if (arg === '-o' || arg === '--output') {
      const value = args[i + 1];
      if (value === undefined) {
        throw new Error(`Option ${arg} requires a file name`);
      }
      flaggedOutput = value;
      i++; // Skip the value that was just consumed.
    } else {
      positional.push(arg);
    }
  }
  return {
    inputFilePath: positional[0],
    outputFilePath: flaggedOutput ?? positional[1],
  };
}

/**
 * CLI entry point: converts the file named on the command line, or standard
 * input when no file name is given.
 */
function main(): void {
  const { inputFilePath, outputFilePath } = parseCliArgs(process.argv.slice(2));
  if (inputFilePath) {
    convertFile(inputFilePath, outputFilePath);
  } else {
    // Hand stdin to the converter as a stream. The previous code gathered the
    // piped text via readline and passed that text to fs.createReadStream,
    // which interprets its argument as a file path, so piped content was
    // never converted (and line-based decoding would mangle binary input).
    convertStream(process.stdin, outputFilePath);
  }
}
main();

1072
src/markitdown/markitdown.ts Normal file

File diff suppressed because it is too large Load diff

14
tsconfig.json Normal file
View file

@ -0,0 +1,14 @@
{
"compilerOptions": {
"target": "ES6",
"module": "commonjs",
"strict": true,
"esModuleInterop": true,
"skipLibCheck": true,
"forceConsistentCasingInFileNames": true,
"outDir": "./dist",
"rootDir": "./src"
},
"include": ["src/**/*.ts"],
"exclude": ["node_modules", "**/*.spec.ts"]
}