Make this a TypeScript SDK
a
This commit is contained in:
parent
f58a864951
commit
8176a4e2cb
12 changed files with 1195 additions and 1897 deletions
23
Dockerfile
23
Dockerfile
|
|
@ -1,23 +0,0 @@
|
|||
FROM python:3.13-slim-bullseye
|
||||
|
||||
USER root
|
||||
|
||||
ARG INSTALL_GIT=false
|
||||
RUN if [ "$INSTALL_GIT" = "true" ]; then \
|
||||
apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*; \
|
||||
fi
|
||||
|
||||
# Runtime dependency
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
ffmpeg \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN pip install markitdown
|
||||
|
||||
# Default USERID and GROUPID
|
||||
ARG USERID=10000
|
||||
ARG GROUPID=10000
|
||||
|
||||
USER $USERID:$GROUPID
|
||||
|
||||
ENTRYPOINT [ "markitdown" ]
|
||||
71
README.md
71
README.md
|
|
@ -39,28 +39,28 @@ You can also pipe content:
|
|||
cat path-to-file.pdf | markitdown
|
||||
```
|
||||
|
||||
### Python API
|
||||
### TypeScript SDK
|
||||
|
||||
Basic usage in Python:
|
||||
Basic usage in TypeScript:
|
||||
|
||||
```python
|
||||
from markitdown import MarkItDown
|
||||
```typescript
|
||||
import { MarkItDown } from 'markitdown';
|
||||
|
||||
md = MarkItDown()
|
||||
result = md.convert("test.xlsx")
|
||||
print(result.text_content)
|
||||
const md = new MarkItDown();
|
||||
const result = md.convert('test.xlsx');
|
||||
console.log(result.text_content);
|
||||
```
|
||||
|
||||
To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:
|
||||
|
||||
```python
|
||||
from markitdown import MarkItDown
|
||||
from openai import OpenAI
|
||||
```typescript
|
||||
import { MarkItDown } from 'markitdown';
|
||||
import { OpenAI } from 'openai';
|
||||
|
||||
client = OpenAI()
|
||||
md = MarkItDown(llm_client=client, llm_model="gpt-4o")
|
||||
result = md.convert("example.jpg")
|
||||
print(result.text_content)
|
||||
const client = new OpenAI();
|
||||
const md = new MarkItDown({ llm_client: client, llm_model: 'gpt-4o' });
|
||||
const result = md.convert('example.jpg');
|
||||
console.log(result.text_content);
|
||||
```
|
||||
|
||||
### Docker
|
||||
|
|
@ -76,31 +76,34 @@ docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
|
|||
This example shows how to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files.
|
||||
|
||||
|
||||
```python convert.py
|
||||
from markitdown import MarkItDown
|
||||
from openai import OpenAI
|
||||
import os
|
||||
client = OpenAI(api_key="your-api-key-here")
|
||||
md = MarkItDown(llm_client=client, llm_model="gpt-4o-2024-11-20")
|
||||
supported_extensions = ('.pptx', '.docx', '.pdf', '.jpg', '.jpeg', '.png')
|
||||
files_to_convert = [f for f in os.listdir('.') if f.lower().endswith(supported_extensions)]
|
||||
for file in files_to_convert:
|
||||
print(f"\nConverting {file}...")
|
||||
try:
|
||||
md_file = os.path.splitext(file)[0] + '.md'
|
||||
result = md.convert(file)
|
||||
with open(md_file, 'w') as f:
|
||||
f.write(result.text_content)
|
||||
```typescript
|
||||
import { MarkItDown } from 'markitdown';
|
||||
import { OpenAI } from 'openai';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
|
||||
print(f"Successfully converted {file} to {md_file}")
|
||||
except Exception as e:
|
||||
print(f"Error converting {file}: {str(e)}")
|
||||
const client = new OpenAI({ apiKey: 'your-api-key-here' });
|
||||
const md = new MarkItDown({ llm_client: client, llm_model: 'gpt-4o-2024-11-20' });
|
||||
const supportedExtensions = ['.pptx', '.docx', '.pdf', '.jpg', '.jpeg', '.png'];
|
||||
const filesToConvert = fs.readdirSync('.').filter(file => supportedExtensions.includes(path.extname(file).toLowerCase()));
|
||||
|
||||
print("\nAll conversions completed!")
|
||||
// Convert every matching file and write the markdown next to it as `<base name>.md`.
filesToConvert.forEach(file => {
  console.log(`\nConverting ${file}...`);
  try {
    const mdFile = path.basename(file, path.extname(file)) + '.md';
    const result = md.convert(file);
    fs.writeFileSync(mdFile, result.text_content);
    console.log(`Successfully converted ${file} to ${mdFile}`);
  } catch (e: unknown) {
    // With `strict` enabled a caught value is `unknown`, so it has to be
    // narrowed before `.message` can be read.
    const reason = e instanceof Error ? e.message : String(e);
    console.error(`Error converting ${file}: ${reason}`);
  }
});
|
||||
|
||||
console.log('\nAll conversions completed!');
|
||||
```
|
||||
2. Place the script in the same directory as your files
|
||||
3. Install the required packages, such as `openai`
|
||||
4. Run script ```bash python convert.py ```
|
||||
4. Run script ```bash ts-node convert.ts ```
|
||||
|
||||
Note that original files will remain unchanged and new markdown files are created with the same base name.
|
||||
|
||||
|
|
|
|||
19
package.json
Normal file
19
package.json
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
{
  "name": "markitdown",
  "version": "0.0.1",
  "description": "Utility tool for converting various files to Markdown",
  "main": "dist/markitdown/markitdown.js",
  "scripts": {
    "build": "tsc",
    "start": "node dist/markitdown/index.js",
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "author": "Adam Fourney",
  "license": "MIT",
  "dependencies": {
    "axios": "^0.21.1"
  },
  "devDependencies": {
    "typescript": "^4.4.3"
  }
}
|
||||
|
|
@ -1,85 +0,0 @@
|
|||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[project]
|
||||
name = "markitdown"
|
||||
dynamic = ["version"]
|
||||
description = 'Utility tool for converting various files to Markdown'
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
license = "MIT"
|
||||
keywords = []
|
||||
authors = [
|
||||
{ name = "Adam Fourney", email = "adamfo@microsoft.com" },
|
||||
]
|
||||
classifiers = [
|
||||
"Development Status :: 4 - Beta",
|
||||
"Programming Language :: Python",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"Programming Language :: Python :: 3.11",
|
||||
"Programming Language :: Python :: 3.12",
|
||||
"Programming Language :: Python :: 3.13",
|
||||
"Programming Language :: Python :: Implementation :: CPython",
|
||||
"Programming Language :: Python :: Implementation :: PyPy",
|
||||
]
|
||||
dependencies = [
|
||||
"beautifulsoup4",
|
||||
"requests",
|
||||
"mammoth",
|
||||
"markdownify",
|
||||
"numpy",
|
||||
"python-pptx",
|
||||
"pandas",
|
||||
"openpyxl",
|
||||
"xlrd",
|
||||
"pdfminer.six",
|
||||
"puremagic",
|
||||
"pydub",
|
||||
"olefile",
|
||||
"youtube-transcript-api",
|
||||
"SpeechRecognition",
|
||||
"pathvalidate",
|
||||
"charset-normalizer",
|
||||
"openai",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
Documentation = "https://github.com/microsoft/markitdown#readme"
|
||||
Issues = "https://github.com/microsoft/markitdown/issues"
|
||||
Source = "https://github.com/microsoft/markitdown"
|
||||
|
||||
[tool.hatch.version]
|
||||
path = "src/markitdown/__about__.py"
|
||||
|
||||
[project.scripts]
|
||||
markitdown = "markitdown.__main__:main"
|
||||
|
||||
[tool.hatch.envs.types]
|
||||
extra-dependencies = [
|
||||
"mypy>=1.0.0",
|
||||
]
|
||||
[tool.hatch.envs.types.scripts]
|
||||
check = "mypy --install-types --non-interactive {args:src/markitdown tests}"
|
||||
|
||||
[tool.coverage.run]
|
||||
source_pkgs = ["markitdown", "tests"]
|
||||
branch = true
|
||||
parallel = true
|
||||
omit = [
|
||||
"src/markitdown/__about__.py",
|
||||
]
|
||||
|
||||
[tool.coverage.paths]
|
||||
markitdown = ["src/markitdown", "*/markitdown/src/markitdown"]
|
||||
tests = ["tests", "*/markitdown/tests"]
|
||||
|
||||
[tool.coverage.report]
|
||||
exclude_lines = [
|
||||
"no cov",
|
||||
"if __name__ == .__main__.:",
|
||||
"if TYPE_CHECKING:",
|
||||
]
|
||||
|
||||
[tool.hatch.build.targets.sdist]
|
||||
only-include = ["src/markitdown"]
|
||||
|
|
@ -1,4 +0,0 @@
|
|||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
__version__ = "0.0.1a3"
|
||||
|
|
@ -1,11 +0,0 @@
|
|||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
from ._markitdown import MarkItDown, FileConversionException, UnsupportedFormatException
|
||||
|
||||
__all__ = [
|
||||
"MarkItDown",
|
||||
"FileConversionException",
|
||||
"UnsupportedFormatException",
|
||||
]
|
||||
|
|
@ -1,82 +0,0 @@
|
|||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
import argparse
|
||||
import sys
|
||||
from textwrap import dedent
|
||||
from .__about__ import __version__
|
||||
from ._markitdown import MarkItDown, DocumentConverterResult
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Convert various file formats to markdown.",
|
||||
prog="markitdown",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
usage=dedent(
|
||||
"""
|
||||
SYNTAX:
|
||||
|
||||
markitdown <OPTIONAL: FILENAME>
|
||||
If FILENAME is empty, markitdown reads from stdin.
|
||||
|
||||
EXAMPLE:
|
||||
|
||||
markitdown example.pdf
|
||||
|
||||
OR
|
||||
|
||||
cat example.pdf | markitdown
|
||||
|
||||
OR
|
||||
|
||||
markitdown < example.pdf
|
||||
|
||||
OR to save to a file use
|
||||
|
||||
markitdown example.pdf -o example.md
|
||||
|
||||
OR
|
||||
|
||||
markitdown example.pdf > example.md
|
||||
"""
|
||||
).strip(),
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"-v",
|
||||
"--version",
|
||||
action="version",
|
||||
version=f"%(prog)s {__version__}",
|
||||
help="show the version number and exit",
|
||||
)
|
||||
|
||||
parser.add_argument("filename", nargs="?")
|
||||
parser.add_argument(
|
||||
"-o",
|
||||
"--output",
|
||||
help="Output file name. If not provided, output is written to stdout.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.filename is None:
|
||||
markitdown = MarkItDown()
|
||||
result = markitdown.convert_stream(sys.stdin.buffer)
|
||||
_handle_output(args, result)
|
||||
else:
|
||||
markitdown = MarkItDown()
|
||||
result = markitdown.convert(args.filename)
|
||||
_handle_output(args, result)
|
||||
|
||||
|
||||
def _handle_output(args, result: DocumentConverterResult):
|
||||
"""Handle output to stdout or file"""
|
||||
if args.output:
|
||||
with open(args.output, "w", encoding="utf-8") as f:
|
||||
f.write(result.text_content)
|
||||
else:
|
||||
print(result.text_content)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
File diff suppressed because it is too large
Load diff
52
src/markitdown/index.ts
Normal file
52
src/markitdown/index.ts
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as readline from 'readline';
|
||||
import { MarkItDown, DocumentConverterResult } from './markitdown';
|
||||
|
||||
const markitdown = new MarkItDown();
|
||||
|
||||
/**
 * Converts the file at `filePath` and passes the result to `handleOutput`.
 *
 * @param filePath - Path of the file to convert.
 * @param outputFilePath - Optional destination forwarded to `handleOutput`.
 */
function convertFile(filePath: string, outputFilePath?: string): void {
  handleOutput(markitdown.convert(filePath), outputFilePath);
}
|
||||
|
||||
/**
 * Converts the content of `inputStream` and passes the result to `handleOutput`.
 *
 * @param inputStream - Readable stream handed to `markitdown.convertStream`.
 * @param outputFilePath - Optional destination forwarded to `handleOutput`.
 */
function convertStream(inputStream: NodeJS.ReadableStream, outputFilePath?: string): void {
  handleOutput(markitdown.convertStream(inputStream), outputFilePath);
}
|
||||
|
||||
/**
 * Emits the converted text.
 *
 * @param result - Conversion result; only its `text_content` is used.
 * @param outputFilePath - When truthy, the text is written to this file as
 *   UTF-8; otherwise it is printed with `console.log`.
 */
function handleOutput(result: DocumentConverterResult, outputFilePath?: string): void {
  if (!outputFilePath) {
    console.log(result.text_content);
    return;
  }

  fs.writeFileSync(outputFilePath, result.text_content, 'utf-8');
}
|
||||
|
||||
/**
 * CLI entry point.
 *
 * Reads two positional arguments from `process.argv`: the input file path and
 * an optional output file path. When no input path is given, the input is
 * taken from stdin instead.
 */
function main(): void {
  const [inputFilePath, outputFilePath] = process.argv.slice(2);

  if (inputFilePath) {
    convertFile(inputFilePath, outputFilePath);
    return;
  }

  // Pass stdin to the converter as a stream. Collecting stdin into a string and
  // giving that string to fs.createReadStream() does not work: createReadStream
  // interprets its argument as a file *path*, so the piped content would be
  // looked up as a file name. Line-based reading also alters binary input.
  convertStream(process.stdin, outputFilePath);
}
|
||||
|
||||
main();
|
||||
1072
src/markitdown/markitdown.ts
Normal file
1072
src/markitdown/markitdown.ts
Normal file
File diff suppressed because it is too large
Load diff
14
tsconfig.json
Normal file
14
tsconfig.json
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
{
|
||||
"compilerOptions": {
|
||||
"target": "ES6",
|
||||
"module": "commonjs",
|
||||
"strict": true,
|
||||
"esModuleInterop": true,
|
||||
"skipLibCheck": true,
|
||||
"forceConsistentCasingInFileNames": true,
|
||||
"outDir": "./dist",
|
||||
"rootDir": "./src"
|
||||
},
|
||||
"include": ["src/**/*.ts"],
|
||||
"exclude": ["node_modules", "**/*.spec.ts"]
|
||||
}
|
||||
Loading…
Reference in a new issue