Make this a TypeScript SDK
a
This commit is contained in:
parent
f58a864951
commit
8176a4e2cb
12 changed files with 1195 additions and 1897 deletions
23
Dockerfile
23
Dockerfile
|
|
@ -1,23 +0,0 @@
|
||||||
FROM python:3.13-slim-bullseye
|
|
||||||
|
|
||||||
USER root
|
|
||||||
|
|
||||||
ARG INSTALL_GIT=false
|
|
||||||
RUN if [ "$INSTALL_GIT" = "true" ]; then \
|
|
||||||
apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*; \
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Runtime dependency
|
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
||||||
ffmpeg \
|
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
|
||||||
|
|
||||||
RUN pip install markitdown
|
|
||||||
|
|
||||||
# Default USERID and GROUPID
|
|
||||||
ARG USERID=10000
|
|
||||||
ARG GROUPID=10000
|
|
||||||
|
|
||||||
USER $USERID:$GROUPID
|
|
||||||
|
|
||||||
ENTRYPOINT [ "markitdown" ]
|
|
||||||
73
README.md
73
README.md
|
|
@ -39,28 +39,28 @@ You can also pipe content:
|
||||||
cat path-to-file.pdf | markitdown
|
cat path-to-file.pdf | markitdown
|
||||||
```
|
```
|
||||||
|
|
||||||
### Python API
|
### TypeScript SDK
|
||||||
|
|
||||||
Basic usage in Python:
|
Basic usage in TypeScript:
|
||||||
|
|
||||||
```python
|
```typescript
|
||||||
from markitdown import MarkItDown
|
import { MarkItDown } from 'markitdown';
|
||||||
|
|
||||||
md = MarkItDown()
|
const md = new MarkItDown();
|
||||||
result = md.convert("test.xlsx")
|
const result = md.convert('test.xlsx');
|
||||||
print(result.text_content)
|
console.log(result.text_content);
|
||||||
```
|
```
|
||||||
|
|
||||||
To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:
|
To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:
|
||||||
|
|
||||||
```python
|
```typescript
|
||||||
from markitdown import MarkItDown
|
import { MarkItDown } from 'markitdown';
|
||||||
from openai import OpenAI
|
import { OpenAI } from 'openai';
|
||||||
|
|
||||||
client = OpenAI()
|
const client = new OpenAI();
|
||||||
md = MarkItDown(llm_client=client, llm_model="gpt-4o")
|
const md = new MarkItDown({ llm_client: client, llm_model: 'gpt-4o' });
|
||||||
result = md.convert("example.jpg")
|
const result = md.convert('example.jpg');
|
||||||
print(result.text_content)
|
console.log(result.text_content);
|
||||||
```
|
```
|
||||||
|
|
||||||
### Docker
|
### Docker
|
||||||
|
|
@ -76,31 +76,34 @@ docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
|
||||||
This example shows how to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files.
|
This example shows how to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files.
|
||||||
|
|
||||||
|
|
||||||
```python convert.py
|
```typescript
|
||||||
from markitdown import MarkItDown
|
import { MarkItDown } from 'markitdown';
|
||||||
from openai import OpenAI
|
import { OpenAI } from 'openai';
|
||||||
import os
|
import * as fs from 'fs';
|
||||||
client = OpenAI(api_key="your-api-key-here")
|
import * as path from 'path';
|
||||||
md = MarkItDown(llm_client=client, llm_model="gpt-4o-2024-11-20")
|
|
||||||
supported_extensions = ('.pptx', '.docx', '.pdf', '.jpg', '.jpeg', '.png')
|
|
||||||
files_to_convert = [f for f in os.listdir('.') if f.lower().endswith(supported_extensions)]
|
|
||||||
for file in files_to_convert:
|
|
||||||
print(f"\nConverting {file}...")
|
|
||||||
try:
|
|
||||||
md_file = os.path.splitext(file)[0] + '.md'
|
|
||||||
result = md.convert(file)
|
|
||||||
with open(md_file, 'w') as f:
|
|
||||||
f.write(result.text_content)
|
|
||||||
|
|
||||||
print(f"Successfully converted {file} to {md_file}")
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Error converting {file}: {str(e)}")
|
|
||||||
|
|
||||||
print("\nAll conversions completed!")
|
// Client and converter shared by every file processed below.
const client = new OpenAI({ apiKey: 'your-api-key-here' });
const md = new MarkItDown({ llm_client: client, llm_model: 'gpt-4o-2024-11-20' });

// Only files in the current directory whose extension (case-insensitive) is listed here are converted.
const supportedExtensions = ['.pptx', '.docx', '.pdf', '.jpg', '.jpeg', '.png'];
const filesToConvert = fs
  .readdirSync('.')
  .filter((file) => supportedExtensions.includes(path.extname(file).toLowerCase()));

for (const file of filesToConvert) {
  console.log(`\nConverting ${file}...`);
  try {
    // Same base name as the source file, with the extension swapped for `.md`.
    const mdFile = path.basename(file, path.extname(file)) + '.md';
    const result = md.convert(file);
    fs.writeFileSync(mdFile, result.text_content);
    console.log(`Successfully converted ${file} to ${mdFile}`);
  } catch (e: unknown) {
    // Under `strict`, a caught value is `unknown`; narrow it before reading `.message`.
    const reason = e instanceof Error ? e.message : String(e);
    // A failure on one file is reported and the loop moves on to the next file.
    console.error(`Error converting ${file}: ${reason}`);
  }
}

console.log('\nAll conversions completed!');
|
||||||
```
|
```
|
||||||
2. Place the script in the same directory as your files
|
2. Place the script in the same directory as your files
|
||||||
3. Install the required packages, such as `openai`
|
3. Install the required packages, such as `openai`
|
||||||
4. Run script ```bash python convert.py ```
|
4. Run the script: `ts-node convert.ts`
|
||||||
|
|
||||||
Note that original files will remain unchanged and new markdown files are created with the same base name.
|
Note that original files will remain unchanged and new markdown files are created with the same base name.
|
||||||
|
|
||||||
|
|
|
||||||
19
package.json
Normal file
19
package.json
Normal file
|
|
@ -0,0 +1,19 @@
|
||||||
|
{
|
||||||
|
"name": "markitdown",
|
||||||
|
"version": "0.0.1",
|
||||||
|
"description": "Utility tool for converting various files to Markdown",
|
||||||
|
"main": "dist/index.js",
|
||||||
|
"scripts": {
|
||||||
|
"build": "tsc",
|
||||||
|
"start": "node dist/index.js",
|
||||||
|
"test": "echo \"Error: no test specified\" && exit 1"
|
||||||
|
},
|
||||||
|
"author": "Adam Fourney",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"axios": "^0.21.1"
|
||||||
|
},
|
||||||
|
"devDependencies": {
|
||||||
|
"typescript": "^4.4.3"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -1,85 +0,0 @@
|
||||||
[build-system]
|
|
||||||
requires = ["hatchling"]
|
|
||||||
build-backend = "hatchling.build"
|
|
||||||
|
|
||||||
[project]
|
|
||||||
name = "markitdown"
|
|
||||||
dynamic = ["version"]
|
|
||||||
description = 'Utility tool for converting various files to Markdown'
|
|
||||||
readme = "README.md"
|
|
||||||
requires-python = ">=3.10"
|
|
||||||
license = "MIT"
|
|
||||||
keywords = []
|
|
||||||
authors = [
|
|
||||||
{ name = "Adam Fourney", email = "adamfo@microsoft.com" },
|
|
||||||
]
|
|
||||||
classifiers = [
|
|
||||||
"Development Status :: 4 - Beta",
|
|
||||||
"Programming Language :: Python",
|
|
||||||
"Programming Language :: Python :: 3.10",
|
|
||||||
"Programming Language :: Python :: 3.11",
|
|
||||||
"Programming Language :: Python :: 3.12",
|
|
||||||
"Programming Language :: Python :: 3.13",
|
|
||||||
"Programming Language :: Python :: Implementation :: CPython",
|
|
||||||
"Programming Language :: Python :: Implementation :: PyPy",
|
|
||||||
]
|
|
||||||
dependencies = [
|
|
||||||
"beautifulsoup4",
|
|
||||||
"requests",
|
|
||||||
"mammoth",
|
|
||||||
"markdownify",
|
|
||||||
"numpy",
|
|
||||||
"python-pptx",
|
|
||||||
"pandas",
|
|
||||||
"openpyxl",
|
|
||||||
"xlrd",
|
|
||||||
"pdfminer.six",
|
|
||||||
"puremagic",
|
|
||||||
"pydub",
|
|
||||||
"olefile",
|
|
||||||
"youtube-transcript-api",
|
|
||||||
"SpeechRecognition",
|
|
||||||
"pathvalidate",
|
|
||||||
"charset-normalizer",
|
|
||||||
"openai",
|
|
||||||
]
|
|
||||||
|
|
||||||
[project.urls]
|
|
||||||
Documentation = "https://github.com/microsoft/markitdown#readme"
|
|
||||||
Issues = "https://github.com/microsoft/markitdown/issues"
|
|
||||||
Source = "https://github.com/microsoft/markitdown"
|
|
||||||
|
|
||||||
[tool.hatch.version]
|
|
||||||
path = "src/markitdown/__about__.py"
|
|
||||||
|
|
||||||
[project.scripts]
|
|
||||||
markitdown = "markitdown.__main__:main"
|
|
||||||
|
|
||||||
[tool.hatch.envs.types]
|
|
||||||
extra-dependencies = [
|
|
||||||
"mypy>=1.0.0",
|
|
||||||
]
|
|
||||||
[tool.hatch.envs.types.scripts]
|
|
||||||
check = "mypy --install-types --non-interactive {args:src/markitdown tests}"
|
|
||||||
|
|
||||||
[tool.coverage.run]
|
|
||||||
source_pkgs = ["markitdown", "tests"]
|
|
||||||
branch = true
|
|
||||||
parallel = true
|
|
||||||
omit = [
|
|
||||||
"src/markitdown/__about__.py",
|
|
||||||
]
|
|
||||||
|
|
||||||
[tool.coverage.paths]
|
|
||||||
markitdown = ["src/markitdown", "*/markitdown/src/markitdown"]
|
|
||||||
tests = ["tests", "*/markitdown/tests"]
|
|
||||||
|
|
||||||
[tool.coverage.report]
|
|
||||||
exclude_lines = [
|
|
||||||
"no cov",
|
|
||||||
"if __name__ == .__main__.:",
|
|
||||||
"if TYPE_CHECKING:",
|
|
||||||
]
|
|
||||||
|
|
||||||
[tool.hatch.build.targets.sdist]
|
|
||||||
only-include = ["src/markitdown"]
|
|
||||||
|
|
@ -1,4 +0,0 @@
|
||||||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
|
||||||
#
|
|
||||||
# SPDX-License-Identifier: MIT
|
|
||||||
__version__ = "0.0.1a3"
|
|
||||||
|
|
@ -1,11 +0,0 @@
|
||||||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
|
||||||
#
|
|
||||||
# SPDX-License-Identifier: MIT
|
|
||||||
|
|
||||||
from ._markitdown import MarkItDown, FileConversionException, UnsupportedFormatException
|
|
||||||
|
|
||||||
__all__ = [
|
|
||||||
"MarkItDown",
|
|
||||||
"FileConversionException",
|
|
||||||
"UnsupportedFormatException",
|
|
||||||
]
|
|
||||||
|
|
@ -1,82 +0,0 @@
|
||||||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
|
||||||
#
|
|
||||||
# SPDX-License-Identifier: MIT
|
|
||||||
import argparse
|
|
||||||
import sys
|
|
||||||
from textwrap import dedent
|
|
||||||
from .__about__ import __version__
|
|
||||||
from ._markitdown import MarkItDown, DocumentConverterResult
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
parser = argparse.ArgumentParser(
|
|
||||||
description="Convert various file formats to markdown.",
|
|
||||||
prog="markitdown",
|
|
||||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
||||||
usage=dedent(
|
|
||||||
"""
|
|
||||||
SYNTAX:
|
|
||||||
|
|
||||||
markitdown <OPTIONAL: FILENAME>
|
|
||||||
If FILENAME is empty, markitdown reads from stdin.
|
|
||||||
|
|
||||||
EXAMPLE:
|
|
||||||
|
|
||||||
markitdown example.pdf
|
|
||||||
|
|
||||||
OR
|
|
||||||
|
|
||||||
cat example.pdf | markitdown
|
|
||||||
|
|
||||||
OR
|
|
||||||
|
|
||||||
markitdown < example.pdf
|
|
||||||
|
|
||||||
OR to save to a file use
|
|
||||||
|
|
||||||
markitdown example.pdf -o example.md
|
|
||||||
|
|
||||||
OR
|
|
||||||
|
|
||||||
markitdown example.pdf > example.md
|
|
||||||
"""
|
|
||||||
).strip(),
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"-v",
|
|
||||||
"--version",
|
|
||||||
action="version",
|
|
||||||
version=f"%(prog)s {__version__}",
|
|
||||||
help="show the version number and exit",
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument("filename", nargs="?")
|
|
||||||
parser.add_argument(
|
|
||||||
"-o",
|
|
||||||
"--output",
|
|
||||||
help="Output file name. If not provided, output is written to stdout.",
|
|
||||||
)
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
if args.filename is None:
|
|
||||||
markitdown = MarkItDown()
|
|
||||||
result = markitdown.convert_stream(sys.stdin.buffer)
|
|
||||||
_handle_output(args, result)
|
|
||||||
else:
|
|
||||||
markitdown = MarkItDown()
|
|
||||||
result = markitdown.convert(args.filename)
|
|
||||||
_handle_output(args, result)
|
|
||||||
|
|
||||||
|
|
||||||
def _handle_output(args, result: DocumentConverterResult):
|
|
||||||
"""Handle output to stdout or file"""
|
|
||||||
if args.output:
|
|
||||||
with open(args.output, "w", encoding="utf-8") as f:
|
|
||||||
f.write(result.text_content)
|
|
||||||
else:
|
|
||||||
print(result.text_content)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
File diff suppressed because it is too large
Load diff
52
src/markitdown/index.ts
Normal file
52
src/markitdown/index.ts
Normal file
|
|
@ -0,0 +1,52 @@
|
||||||
|
import * as fs from 'fs';
|
||||||
|
import * as path from 'path';
|
||||||
|
import * as readline from 'readline';
|
||||||
|
import { MarkItDown, DocumentConverterResult } from './markitdown';
|
||||||
|
|
||||||
|
// Single converter instance created at module load and reused by convertFile and convertStream.
const markitdown = new MarkItDown();
|
||||||
|
|
||||||
|
/**
 * Converts the file at `filePath` with the module-level converter and
 * passes the result on to `handleOutput`.
 *
 * @param filePath - Path of the file to convert.
 * @param outputFilePath - Optional destination file; forwarded unchanged to `handleOutput`.
 */
function convertFile(filePath: string, outputFilePath?: string): void {
  handleOutput(markitdown.convert(filePath), outputFilePath);
}
|
||||||
|
|
||||||
|
/**
 * Converts the contents of `inputStream` with the module-level converter and
 * passes the result on to `handleOutput`.
 *
 * @param inputStream - Readable stream supplying the document to convert.
 * @param outputFilePath - Optional destination file; forwarded unchanged to `handleOutput`.
 */
function convertStream(inputStream: NodeJS.ReadableStream, outputFilePath?: string): void {
  handleOutput(markitdown.convertStream(inputStream), outputFilePath);
}
|
||||||
|
|
||||||
|
/**
 * Emits a conversion result.
 *
 * @param result - Conversion result whose `text_content` is emitted.
 * @param outputFilePath - When provided (and non-empty), the text is written to
 *   this file as UTF-8; otherwise it is printed to stdout via `console.log`.
 */
function handleOutput(result: DocumentConverterResult, outputFilePath?: string): void {
  const markdown = result.text_content;

  // No destination file: print to stdout and stop.
  if (!outputFilePath) {
    console.log(markdown);
    return;
  }

  fs.writeFileSync(outputFilePath, markdown, 'utf-8');
}
|
||||||
|
|
||||||
|
/**
 * CLI entry point.
 *
 * Reads two optional positional arguments from `process.argv`:
 *   1. the input file path, and
 *   2. the output file path.
 *
 * With an input path, that file is converted. Without one, the document is
 * read from stdin. In both cases the output path (if any) is forwarded so the
 * result is written there instead of being printed to stdout.
 */
function main(): void {
  const [inputFilePath, outputFilePath] = process.argv.slice(2);

  if (inputFilePath) {
    convertFile(inputFilePath, outputFilePath);
    return;
  }

  // Hand stdin to the converter as a raw stream. Buffering it line-by-line as
  // text would corrupt binary inputs (e.g. a piped PDF), and the buffered
  // contents are document data, not a path that `fs.createReadStream` could open.
  convertStream(process.stdin, outputFilePath);
}
|
||||||
|
|
||||||
|
// Run the CLI as soon as this module is loaded.
// NOTE(review): this call is unconditional, so importing this module also executes main() — confirm that is intended.
main();
|
||||||
1072
src/markitdown/markitdown.ts
Normal file
1072
src/markitdown/markitdown.ts
Normal file
File diff suppressed because it is too large
Load diff
14
tsconfig.json
Normal file
14
tsconfig.json
Normal file
|
|
@ -0,0 +1,14 @@
|
||||||
|
{
|
||||||
|
"compilerOptions": {
|
||||||
|
"target": "ES6",
|
||||||
|
"module": "commonjs",
|
||||||
|
"strict": true,
|
||||||
|
"esModuleInterop": true,
|
||||||
|
"skipLibCheck": true,
|
||||||
|
"forceConsistentCasingInFileNames": true,
|
||||||
|
"outDir": "./dist",
|
||||||
|
"rootDir": "./src"
|
||||||
|
},
|
||||||
|
"include": ["src/**/*.ts"],
|
||||||
|
"exclude": ["node_modules", "**/*.spec.ts"]
|
||||||
|
}
|
||||||
Loading…
Reference in a new issue