From 1e856c3eb68135599ca8a5466bc79253248417f0 Mon Sep 17 00:00:00 2001 From: "uratmangun.ovh" Date: Mon, 13 Jan 2025 20:10:34 +0700 Subject: [PATCH] Make this a TypeScript SDK Convert the project to a TypeScript SDK. * Add `tsconfig.json` with TypeScript compiler options. * Add `package.json` with TypeScript dependencies and build scripts. * Add `src/markitdown/index.ts` to convert `src/markitdown/__main__.py` to TypeScript. * Add `src/markitdown/markitdown.ts` to convert `src/markitdown/_markitdown.py` to TypeScript. * Remove Python-specific files: `pyproject.toml`, `Dockerfile`, `src/markitdown/__main__.py`, `src/markitdown/_markitdown.py`, `src/markitdown/__init__.py`, `src/markitdown/__about__.py`, `src/markitdown/py.typed`. * Update `README.md` to include TypeScript SDK usage instructions. --- Dockerfile | 23 - README.md | 73 +- package.json | 19 + pyproject.toml | 85 -- src/markitdown/__about__.py | 4 - src/markitdown/__init__.py | 11 - src/markitdown/__main__.py | 82 -- src/markitdown/_markitdown.py | 1657 --------------------------------- src/markitdown/index.ts | 52 ++ src/markitdown/markitdown.ts | 1072 +++++++++++++++++++++ src/markitdown/py.typed | 0 tsconfig.json | 14 + 12 files changed, 1195 insertions(+), 1897 deletions(-) delete mode 100644 Dockerfile create mode 100644 package.json delete mode 100644 pyproject.toml delete mode 100644 src/markitdown/__about__.py delete mode 100644 src/markitdown/__init__.py delete mode 100644 src/markitdown/__main__.py delete mode 100644 src/markitdown/_markitdown.py create mode 100644 src/markitdown/index.ts create mode 100644 src/markitdown/markitdown.ts delete mode 100644 src/markitdown/py.typed create mode 100644 tsconfig.json diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index 0072d9e..0000000 --- a/Dockerfile +++ /dev/null @@ -1,23 +0,0 @@ -FROM python:3.13-slim-bullseye - -USER root - -ARG INSTALL_GIT=false -RUN if [ "$INSTALL_GIT" = "true" ]; then \ - apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*; \ - fi - -# Runtime dependency -RUN apt-get update && apt-get install -y --no-install-recommends \ - ffmpeg \ - && rm -rf /var/lib/apt/lists/* - -RUN pip install markitdown - -# Default USERID and GROUPID -ARG USERID=10000 -ARG GROUPID=10000 - -USER $USERID:$GROUPID - -ENTRYPOINT [ "markitdown" ] diff --git a/README.md b/README.md index 6bc91e6..35bd7ea 100644 --- a/README.md +++ b/README.md @@ -39,28 +39,28 @@ You can also pipe content: cat path-to-file.pdf | markitdown ``` -### Python API +### TypeScript SDK -Basic usage in Python: +Basic usage in TypeScript: -```python -from markitdown import MarkItDown +```typescript +import { MarkItDown } from 'markitdown'; -md = MarkItDown() -result = md.convert("test.xlsx") -print(result.text_content) +const md = new MarkItDown(); +const result = md.convert('test.xlsx'); +console.log(result.text_content); ``` To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`: -```python -from markitdown import MarkItDown -from openai import OpenAI +```typescript +import { MarkItDown } from 'markitdown'; +import { OpenAI } from 'openai'; -client = OpenAI() -md = MarkItDown(llm_client=client, llm_model="gpt-4o") -result = md.convert("example.jpg") -print(result.text_content) +const client = new OpenAI(); +const md = new MarkItDown({ llm_client: client, llm_model: 'gpt-4o' }); +const result = md.convert('example.jpg'); +console.log(result.text_content); ``` ### Docker @@ -76,31 +76,34 @@ docker run --rm -i markitdown:latest < 
~/your-file.pdf > output.md

This example shows how to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files.

-```python convert.py
-from markitdown import MarkItDown
-from openai import OpenAI
-import os
-client = OpenAI(api_key="your-api-key-here")
-md = MarkItDown(llm_client=client, llm_model="gpt-4o-2024-11-20")
-supported_extensions = ('.pptx', '.docx', '.pdf', '.jpg', '.jpeg', '.png')
-files_to_convert = [f for f in os.listdir('.') if f.lower().endswith(supported_extensions)]
-for file in files_to_convert:
-    print(f"\nConverting {file}...")
-    try:
-        md_file = os.path.splitext(file)[0] + '.md'
-        result = md.convert(file)
-        with open(md_file, 'w') as f:
-            f.write(result.text_content)
-
-        print(f"Successfully converted {file} to {md_file}")
-    except Exception as e:
-        print(f"Error converting {file}: {str(e)}")
+```typescript convert.ts
+import { MarkItDown } from 'markitdown';
+import { OpenAI } from 'openai';
+import * as fs from 'fs';
+import * as path from 'path';
-print("\nAll conversions completed!")
+const client = new OpenAI({ apiKey: 'your-api-key-here' });
+const md = new MarkItDown({ llm_client: client, llm_model: 'gpt-4o-2024-11-20' });
+const supportedExtensions = ['.pptx', '.docx', '.pdf', '.jpg', '.jpeg', '.png'];
+const filesToConvert = fs.readdirSync('.').filter(file => supportedExtensions.includes(path.extname(file).toLowerCase()));
+
+filesToConvert.forEach(file => {
+  console.log(`\nConverting ${file}...`);
+  try {
+    const mdFile = path.basename(file, path.extname(file)) + '.md';
+    const result = md.convert(file);
+    fs.writeFileSync(mdFile, result.text_content);
+    console.log(`Successfully converted ${file} to ${mdFile}`);
+  } catch (e) {
+    console.error(`Error converting ${file}: ${(e as Error).message}`);
+  }
+});
+
+console.log('\nAll conversions completed!');
```
2. Place the script in the same directory as your files
3. Install required packages: like openai
-4. Run script ```bash python convert.py ```
+4. Run script ```bash ts-node convert.ts ```

Note that original files will remain unchanged and new markdown files are created with the same base name.
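A note on error handling with the new SDK: in this port, `convert()` is synchronous and is assumed to throw on unsupported input, mirroring the `FileConversionException` / `UnsupportedFormatException` behavior of the removed Python module. The sketch below is illustrative only: it assumes the package is published to npm as `markitdown` and re-exports `MarkItDown` and `DocumentConverterResult` the way `src/markitdown/index.ts` imports them, and the `safeConvert` helper is hypothetical.

```typescript
import { MarkItDown, DocumentConverterResult } from 'markitdown';

const md = new MarkItDown();

// Hypothetical helper: return null instead of throwing on unsupported files.
function safeConvert(file: string): DocumentConverterResult | null {
  try {
    return md.convert(file);
  } catch (e) {
    // Under TypeScript 4.4+ strict settings a catch variable is `unknown`,
    // so narrow it before reading .message.
    console.error(`Skipping ${file}: ${(e as Error).message}`);
    return null;
  }
}

const result = safeConvert('test.xlsx');
if (result !== null) {
  console.log(result.text_content);
}
```

Note also that a fully synchronous `convertStream` cannot actually drain a Node stream; a real implementation would need to return a `Promise<DocumentConverterResult>`, a design question this patch leaves open.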
diff --git a/package.json b/package.json new file mode 100644 index 0000000..f1a7bb2 --- /dev/null +++ b/package.json @@ -0,0 +1,19 @@ +{ + "name": "markitdown", + "version": "0.0.1", + "description": "Utility tool for converting various files to Markdown", + "main": "dist/index.js", + "scripts": { + "build": "tsc", + "start": "node dist/index.js", + "test": "echo \"Error: no test specified\" && exit 1" + }, + "author": "Adam Fourney", + "license": "MIT", + "dependencies": { + "axios": "^0.21.1" + }, + "devDependencies": { + "typescript": "^4.4.3" + } +} diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index 9c113ad..0000000 --- a/pyproject.toml +++ /dev/null @@ -1,85 +0,0 @@ -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" - -[project] -name = "markitdown" -dynamic = ["version"] -description = 'Utility tool for converting various files to Markdown' -readme = "README.md" -requires-python = ">=3.10" -license = "MIT" -keywords = [] -authors = [ - { name = "Adam Fourney", email = "adamfo@microsoft.com" }, -] -classifiers = [ - "Development Status :: 4 - Beta", - "Programming Language :: Python", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Programming Language :: Python :: 3.13", - "Programming Language :: Python :: Implementation :: CPython", - "Programming Language :: Python :: Implementation :: PyPy", -] -dependencies = [ - "beautifulsoup4", - "requests", - "mammoth", - "markdownify", - "numpy", - "python-pptx", - "pandas", - "openpyxl", - "xlrd", - "pdfminer.six", - "puremagic", - "pydub", - "olefile", - "youtube-transcript-api", - "SpeechRecognition", - "pathvalidate", - "charset-normalizer", - "openai", -] - -[project.urls] -Documentation = "https://github.com/microsoft/markitdown#readme" -Issues = "https://github.com/microsoft/markitdown/issues" -Source = "https://github.com/microsoft/markitdown" - -[tool.hatch.version] -path = "src/markitdown/__about__.py" - -[project.scripts] -markitdown = "markitdown.__main__:main" - -[tool.hatch.envs.types] -extra-dependencies = [ - "mypy>=1.0.0", -] -[tool.hatch.envs.types.scripts] -check = "mypy --install-types --non-interactive {args:src/markitdown tests}" - -[tool.coverage.run] -source_pkgs = ["markitdown", "tests"] -branch = true -parallel = true -omit = [ - "src/markitdown/__about__.py", -] - -[tool.coverage.paths] -markitdown = ["src/markitdown", "*/markitdown/src/markitdown"] -tests = ["tests", "*/markitdown/tests"] - -[tool.coverage.report] -exclude_lines = [ - "no cov", - "if __name__ == .__main__.:", - "if TYPE_CHECKING:", -] - -[tool.hatch.build.targets.sdist] -only-include = ["src/markitdown"] diff --git a/src/markitdown/__about__.py b/src/markitdown/__about__.py deleted file mode 100644 index a365900..0000000 --- a/src/markitdown/__about__.py +++ /dev/null @@ -1,4 +0,0 @@ -# SPDX-FileCopyrightText: 2024-present Adam Fourney -# -# SPDX-License-Identifier: MIT -__version__ = "0.0.1a3" diff --git a/src/markitdown/__init__.py b/src/markitdown/__init__.py deleted file mode 100644 index 482f428..0000000 --- a/src/markitdown/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -# SPDX-FileCopyrightText: 2024-present Adam Fourney -# -# SPDX-License-Identifier: MIT - -from ._markitdown import MarkItDown, FileConversionException, UnsupportedFormatException - -__all__ = [ - "MarkItDown", - "FileConversionException", - "UnsupportedFormatException", -] diff --git a/src/markitdown/__main__.py b/src/markitdown/__main__.py 
deleted file mode 100644 index b6cf963..0000000 --- a/src/markitdown/__main__.py +++ /dev/null @@ -1,82 +0,0 @@ -# SPDX-FileCopyrightText: 2024-present Adam Fourney -# -# SPDX-License-Identifier: MIT -import argparse -import sys -from textwrap import dedent -from .__about__ import __version__ -from ._markitdown import MarkItDown, DocumentConverterResult - - -def main(): - parser = argparse.ArgumentParser( - description="Convert various file formats to markdown.", - prog="markitdown", - formatter_class=argparse.RawDescriptionHelpFormatter, - usage=dedent( - """ - SYNTAX: - - markitdown - If FILENAME is empty, markitdown reads from stdin. - - EXAMPLE: - - markitdown example.pdf - - OR - - cat example.pdf | markitdown - - OR - - markitdown < example.pdf - - OR to save to a file use - - markitdown example.pdf -o example.md - - OR - - markitdown example.pdf > example.md - """ - ).strip(), - ) - - parser.add_argument( - "-v", - "--version", - action="version", - version=f"%(prog)s {__version__}", - help="show the version number and exit", - ) - - parser.add_argument("filename", nargs="?") - parser.add_argument( - "-o", - "--output", - help="Output file name. If not provided, output is written to stdout.", - ) - args = parser.parse_args() - - if args.filename is None: - markitdown = MarkItDown() - result = markitdown.convert_stream(sys.stdin.buffer) - _handle_output(args, result) - else: - markitdown = MarkItDown() - result = markitdown.convert(args.filename) - _handle_output(args, result) - - -def _handle_output(args, result: DocumentConverterResult): - """Handle output to stdout or file""" - if args.output: - with open(args.output, "w", encoding="utf-8") as f: - f.write(result.text_content) - else: - print(result.text_content) - - -if __name__ == "__main__": - main() diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py deleted file mode 100644 index 33806e1..0000000 --- a/src/markitdown/_markitdown.py +++ /dev/null @@ -1,1657 +0,0 @@ -# type: ignore -import base64 -import binascii -import copy -import html -import json -import mimetypes -import os -import re -import shutil -import subprocess -import sys -import tempfile -import traceback -import zipfile -from xml.dom import minidom -from typing import Any, Dict, List, Optional, Union -from pathlib import Path -from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse -from warnings import warn, resetwarnings, catch_warnings - -import mammoth -import markdownify -import olefile -import pandas as pd -import pdfminer -import pdfminer.high_level -import pptx - -# File-format detection -import puremagic -import requests -from bs4 import BeautifulSoup -from charset_normalizer import from_path - -# Optional Transcription support -IS_AUDIO_TRANSCRIPTION_CAPABLE = False -try: - # Using warnings' catch_warnings to catch - # pydub's warning of ffmpeg or avconv missing - with catch_warnings(record=True) as w: - import pydub - - if w: - raise ModuleNotFoundError - import speech_recognition as sr - - IS_AUDIO_TRANSCRIPTION_CAPABLE = True -except ModuleNotFoundError: - pass -finally: - resetwarnings() - -# Optional YouTube transcription support -try: - from youtube_transcript_api import YouTubeTranscriptApi - - IS_YOUTUBE_TRANSCRIPT_CAPABLE = True -except ModuleNotFoundError: - pass - - -class _CustomMarkdownify(markdownify.MarkdownConverter): - """ - A custom version of markdownify's MarkdownConverter. Changes include: - - - Altering the default heading style to use '#', '##', etc. - - Removing javascript hyperlinks. 
- - Truncating images with large data:uri sources. - - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax - """ - - def __init__(self, **options: Any): - options["heading_style"] = options.get("heading_style", markdownify.ATX) - # Explicitly cast options to the expected type if necessary - super().__init__(**options) - - def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str: - """Same as usual, but be sure to start with a new line""" - if not convert_as_inline: - if not re.search(r"^\n", text): - return "\n" + super().convert_hn(n, el, text, convert_as_inline) # type: ignore - - return super().convert_hn(n, el, text, convert_as_inline) # type: ignore - - def convert_a(self, el: Any, text: str, convert_as_inline: bool): - """Same as usual converter, but removes Javascript links and escapes URIs.""" - prefix, suffix, text = markdownify.chomp(text) # type: ignore - if not text: - return "" - href = el.get("href") - title = el.get("title") - - # Escape URIs and skip non-http or file schemes - if href: - try: - parsed_url = urlparse(href) # type: ignore - if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]: # type: ignore - return "%s%s%s" % (prefix, text, suffix) - href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path)))) # type: ignore - except ValueError: # It's not clear if this ever gets thrown - return "%s%s%s" % (prefix, text, suffix) - - # For the replacement see #29: text nodes underscores are escaped - if ( - self.options["autolinks"] - and text.replace(r"\_", "_") == href - and not title - and not self.options["default_title"] - ): - # Shortcut syntax - return "<%s>" % href - if self.options["default_title"] and not title: - title = href - title_part = ' "%s"' % title.replace('"', r"\"") if title else "" - return ( - "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix) - if href - else text - ) - - def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str: - """Same as usual converter, but removes data URIs""" - - alt = el.attrs.get("alt", None) or "" - src = el.attrs.get("src", None) or "" - title = el.attrs.get("title", None) or "" - title_part = ' "%s"' % title.replace('"', r"\"") if title else "" - if ( - convert_as_inline - and el.parent.name not in self.options["keep_inline_images_in"] - ): - return alt - - # Remove dataURIs - if src.startswith("data:"): - src = src.split(",")[0] + "..." 
- - return "![%s](%s%s)" % (alt, src, title_part) - - def convert_soup(self, soup: Any) -> str: - return super().convert_soup(soup) # type: ignore - - -class DocumentConverterResult: - """The result of converting a document to text.""" - - def __init__(self, title: Union[str, None] = None, text_content: str = ""): - self.title: Union[str, None] = title - self.text_content: str = text_content - - -class DocumentConverter: - """Abstract superclass of all DocumentConverters.""" - - def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - raise NotImplementedError() - - -class PlainTextConverter(DocumentConverter): - """Anything with content type text/plain""" - - def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - # Guess the content type from any file extension that might be around - content_type, _ = mimetypes.guess_type( - "__placeholder" + kwargs.get("file_extension", "") - ) - - # Only accept text files - if content_type is None: - return None - elif all( - not content_type.lower().startswith(type_prefix) - for type_prefix in ["text/", "application/json"] - ): - return None - - text_content = str(from_path(local_path).best()) - return DocumentConverterResult( - title=None, - text_content=text_content, - ) - - -class HtmlConverter(DocumentConverter): - """Anything with content type text/html""" - - def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - # Bail if not html - extension = kwargs.get("file_extension", "") - if extension.lower() not in [".html", ".htm"]: - return None - - result = None - with open(local_path, "rt", encoding="utf-8") as fh: - result = self._convert(fh.read()) - - return result - - def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]: - """Helper function that converts and HTML string.""" - - # Parse the string - soup = BeautifulSoup(html_content, "html.parser") - - # Remove javascript and style blocks - for script in soup(["script", "style"]): - script.extract() - - # Print only the main content - body_elm = soup.find("body") - webpage_text = "" - if body_elm: - webpage_text = _CustomMarkdownify().convert_soup(body_elm) - else: - webpage_text = _CustomMarkdownify().convert_soup(soup) - - assert isinstance(webpage_text, str) - - return DocumentConverterResult( - title=None if soup.title is None else soup.title.string, - text_content=webpage_text, - ) - - -class RSSConverter(DocumentConverter): - """Convert RSS / Atom type to markdown""" - - def convert( - self, local_path: str, **kwargs - ) -> Union[None, DocumentConverterResult]: - # Bail if not RSS type - extension = kwargs.get("file_extension", "") - if extension.lower() not in [".xml", ".rss", ".atom"]: - return None - try: - doc = minidom.parse(local_path) - except BaseException as _: - return None - result = None - if doc.getElementsByTagName("rss"): - # A RSS feed must have a root element of - result = self._parse_rss_type(doc) - elif doc.getElementsByTagName("feed"): - root = doc.getElementsByTagName("feed")[0] - if root.getElementsByTagName("entry"): - # An Atom feed must have a root element of and at least one - result = self._parse_atom_type(doc) - else: - return None - else: - # not rss or atom - return None - - return result - - def _parse_atom_type( - self, doc: minidom.Document - ) -> Union[None, DocumentConverterResult]: - """Parse the type of an Atom feed. - - Returns None if the feed type is not recognized or something goes wrong. 
- """ - try: - root = doc.getElementsByTagName("feed")[0] - title = self._get_data_by_tag_name(root, "title") - subtitle = self._get_data_by_tag_name(root, "subtitle") - entries = root.getElementsByTagName("entry") - md_text = f"# {title}\n" - if subtitle: - md_text += f"{subtitle}\n" - for entry in entries: - entry_title = self._get_data_by_tag_name(entry, "title") - entry_summary = self._get_data_by_tag_name(entry, "summary") - entry_updated = self._get_data_by_tag_name(entry, "updated") - entry_content = self._get_data_by_tag_name(entry, "content") - - if entry_title: - md_text += f"\n## {entry_title}\n" - if entry_updated: - md_text += f"Updated on: {entry_updated}\n" - if entry_summary: - md_text += self._parse_content(entry_summary) - if entry_content: - md_text += self._parse_content(entry_content) - - return DocumentConverterResult( - title=title, - text_content=md_text, - ) - except BaseException as _: - return None - - def _parse_rss_type( - self, doc: minidom.Document - ) -> Union[None, DocumentConverterResult]: - """Parse the type of an RSS feed. - - Returns None if the feed type is not recognized or something goes wrong. - """ - try: - root = doc.getElementsByTagName("rss")[0] - channel = root.getElementsByTagName("channel") - if not channel: - return None - channel = channel[0] - channel_title = self._get_data_by_tag_name(channel, "title") - channel_description = self._get_data_by_tag_name(channel, "description") - items = channel.getElementsByTagName("item") - if channel_title: - md_text = f"# {channel_title}\n" - if channel_description: - md_text += f"{channel_description}\n" - if not items: - items = [] - for item in items: - title = self._get_data_by_tag_name(item, "title") - description = self._get_data_by_tag_name(item, "description") - pubDate = self._get_data_by_tag_name(item, "pubDate") - content = self._get_data_by_tag_name(item, "content:encoded") - - if title: - md_text += f"\n## {title}\n" - if pubDate: - md_text += f"Published on: {pubDate}\n" - if description: - md_text += self._parse_content(description) - if content: - md_text += self._parse_content(content) - - return DocumentConverterResult( - title=channel_title, - text_content=md_text, - ) - except BaseException as _: - print(traceback.format_exc()) - return None - - def _parse_content(self, content: str) -> str: - """Parse the content of an RSS feed item""" - try: - # using bs4 because many RSS feeds have HTML-styled content - soup = BeautifulSoup(content, "html.parser") - return _CustomMarkdownify().convert_soup(soup) - except BaseException as _: - return content - - def _get_data_by_tag_name( - self, element: minidom.Element, tag_name: str - ) -> Union[str, None]: - """Get data from first child element with the given tag name. - Returns None when no such element is found. 
- """ - nodes = element.getElementsByTagName(tag_name) - if not nodes: - return None - fc = nodes[0].firstChild - if fc: - return fc.data - return None - - -class WikipediaConverter(DocumentConverter): - """Handle Wikipedia pages separately, focusing only on the main document content.""" - - def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - # Bail if not Wikipedia - extension = kwargs.get("file_extension", "") - if extension.lower() not in [".html", ".htm"]: - return None - url = kwargs.get("url", "") - if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url): - return None - - # Parse the file - soup = None - with open(local_path, "rt", encoding="utf-8") as fh: - soup = BeautifulSoup(fh.read(), "html.parser") - - # Remove javascript and style blocks - for script in soup(["script", "style"]): - script.extract() - - # Print only the main content - body_elm = soup.find("div", {"id": "mw-content-text"}) - title_elm = soup.find("span", {"class": "mw-page-title-main"}) - - webpage_text = "" - main_title = None if soup.title is None else soup.title.string - - if body_elm: - # What's the title - if title_elm and len(title_elm) > 0: - main_title = title_elm.string # type: ignore - assert isinstance(main_title, str) - - # Convert the page - webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup( - body_elm - ) - else: - webpage_text = _CustomMarkdownify().convert_soup(soup) - - return DocumentConverterResult( - title=main_title, - text_content=webpage_text, - ) - - -class YouTubeConverter(DocumentConverter): - """Handle YouTube specially, focusing on the video title, description, and transcript.""" - - def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - # Bail if not YouTube - extension = kwargs.get("file_extension", "") - if extension.lower() not in [".html", ".htm"]: - return None - url = kwargs.get("url", "") - if not url.startswith("https://www.youtube.com/watch?"): - return None - - # Parse the file - soup = None - with open(local_path, "rt", encoding="utf-8") as fh: - soup = BeautifulSoup(fh.read(), "html.parser") - - # Read the meta tags - assert soup.title is not None and soup.title.string is not None - metadata: Dict[str, str] = {"title": soup.title.string} - for meta in soup(["meta"]): - for a in meta.attrs: - if a in ["itemprop", "property", "name"]: - metadata[meta[a]] = meta.get("content", "") - break - - # We can also try to read the full description. 
This is more prone to breaking, since it reaches into the page implementation - try: - for script in soup(["script"]): - content = script.text - if "ytInitialData" in content: - lines = re.split(r"\r?\n", content) - obj_start = lines[0].find("{") - obj_end = lines[0].rfind("}") - if obj_start >= 0 and obj_end >= 0: - data = json.loads(lines[0][obj_start : obj_end + 1]) - attrdesc = self._findKey(data, "attributedDescriptionBodyText") # type: ignore - if attrdesc: - metadata["description"] = str(attrdesc["content"]) - break - except Exception: - pass - - # Start preparing the page - webpage_text = "# YouTube\n" - - title = self._get(metadata, ["title", "og:title", "name"]) # type: ignore - assert isinstance(title, str) - - if title: - webpage_text += f"\n## {title}\n" - - stats = "" - views = self._get(metadata, ["interactionCount"]) # type: ignore - if views: - stats += f"- **Views:** {views}\n" - - keywords = self._get(metadata, ["keywords"]) # type: ignore - if keywords: - stats += f"- **Keywords:** {keywords}\n" - - runtime = self._get(metadata, ["duration"]) # type: ignore - if runtime: - stats += f"- **Runtime:** {runtime}\n" - - if len(stats) > 0: - webpage_text += f"\n### Video Metadata\n{stats}\n" - - description = self._get(metadata, ["description", "og:description"]) # type: ignore - if description: - webpage_text += f"\n### Description\n{description}\n" - - if IS_YOUTUBE_TRANSCRIPT_CAPABLE: - transcript_text = "" - parsed_url = urlparse(url) # type: ignore - params = parse_qs(parsed_url.query) # type: ignore - if "v" in params: - assert isinstance(params["v"][0], str) - video_id = str(params["v"][0]) - try: - youtube_transcript_languages = kwargs.get( - "youtube_transcript_languages", ("en",) - ) - # Must be a single transcript. - transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=youtube_transcript_languages) # type: ignore - transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore - # Alternative formatting: - # formatter = TextFormatter() - # formatter.format_transcript(transcript) - except Exception: - pass - if transcript_text: - webpage_text += f"\n### Transcript\n{transcript_text}\n" - - title = title if title else soup.title.string - assert isinstance(title, str) - - return DocumentConverterResult( - title=title, - text_content=webpage_text, - ) - - def _get( - self, - metadata: Dict[str, str], - keys: List[str], - default: Union[str, None] = None, - ) -> Union[str, None]: - for k in keys: - if k in metadata: - return metadata[k] - return default - - def _findKey(self, json: Any, key: str) -> Union[str, None]: # TODO: Fix json type - if isinstance(json, list): - for elm in json: - ret = self._findKey(elm, key) - if ret is not None: - return ret - elif isinstance(json, dict): - for k in json: - if k == key: - return json[k] - else: - ret = self._findKey(json[k], key) - if ret is not None: - return ret - return None - - -class IpynbConverter(DocumentConverter): - """Converts Jupyter Notebook (.ipynb) files to Markdown.""" - - def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - # Bail if not ipynb - extension = kwargs.get("file_extension", "") - if extension.lower() != ".ipynb": - return None - - # Parse and convert the notebook - result = None - with open(local_path, "rt", encoding="utf-8") as fh: - notebook_content = json.load(fh) - result = self._convert(notebook_content) - - return result - - def _convert(self, notebook_content: dict) -> Union[None, DocumentConverterResult]: - 
"""Helper function that converts notebook JSON content to Markdown.""" - try: - md_output = [] - title = None - - for cell in notebook_content.get("cells", []): - cell_type = cell.get("cell_type", "") - source_lines = cell.get("source", []) - - if cell_type == "markdown": - md_output.append("".join(source_lines)) - - # Extract the first # heading as title if not already found - if title is None: - for line in source_lines: - if line.startswith("# "): - title = line.lstrip("# ").strip() - break - - elif cell_type == "code": - # Code cells are wrapped in Markdown code blocks - md_output.append(f"```python\n{''.join(source_lines)}\n```") - elif cell_type == "raw": - md_output.append(f"```\n{''.join(source_lines)}\n```") - - md_text = "\n\n".join(md_output) - - # Check for title in notebook metadata - title = notebook_content.get("metadata", {}).get("title", title) - - return DocumentConverterResult( - title=title, - text_content=md_text, - ) - - except Exception as e: - raise FileConversionException( - f"Error converting .ipynb file: {str(e)}" - ) from e - - -class BingSerpConverter(DocumentConverter): - """ - Handle Bing results pages (only the organic search results). - NOTE: It is better to use the Bing API - """ - - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a Bing SERP - extension = kwargs.get("file_extension", "") - if extension.lower() not in [".html", ".htm"]: - return None - url = kwargs.get("url", "") - if not re.search(r"^https://www\.bing\.com/search\?q=", url): - return None - - # Parse the query parameters - parsed_params = parse_qs(urlparse(url).query) - query = parsed_params.get("q", [""])[0] - - # Parse the file - soup = None - with open(local_path, "rt", encoding="utf-8") as fh: - soup = BeautifulSoup(fh.read(), "html.parser") - - # Clean up some formatting - for tptt in soup.find_all(class_="tptt"): - if hasattr(tptt, "string") and tptt.string: - tptt.string += " " - for slug in soup.find_all(class_="algoSlug_icon"): - slug.extract() - - # Parse the algorithmic results - _markdownify = _CustomMarkdownify() - results = list() - for result in soup.find_all(class_="b_algo"): - # Rewrite redirect urls - for a in result.find_all("a", href=True): - parsed_href = urlparse(a["href"]) - qs = parse_qs(parsed_href.query) - - # The destination is contained in the u parameter, - # but appears to be base64 encoded, with some prefix - if "u" in qs: - u = ( - qs["u"][0][2:].strip() + "==" - ) # Python 3 doesn't care about extra padding - - try: - # RFC 4648 / Base64URL" variant, which uses "-" and "_" - a["href"] = base64.b64decode(u, altchars="-_").decode("utf-8") - except UnicodeDecodeError: - pass - except binascii.Error: - pass - - # Convert to markdown - md_result = _markdownify.convert_soup(result).strip() - lines = [line.strip() for line in re.split(r"\n+", md_result)] - results.append("\n".join([line for line in lines if len(line) > 0])) - - webpage_text = ( - f"## A Bing search for '{query}' found the following results:\n\n" - + "\n\n".join(results) - ) - - return DocumentConverterResult( - title=None if soup.title is None else soup.title.string, - text_content=webpage_text, - ) - - -class PdfConverter(DocumentConverter): - """ - Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text. 
- """ - - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a PDF - extension = kwargs.get("file_extension", "") - if extension.lower() != ".pdf": - return None - - return DocumentConverterResult( - title=None, - text_content=pdfminer.high_level.extract_text(local_path), - ) - - -class DocxConverter(HtmlConverter): - """ - Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible. - """ - - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a DOCX - extension = kwargs.get("file_extension", "") - if extension.lower() != ".docx": - return None - - result = None - with open(local_path, "rb") as docx_file: - style_map = kwargs.get("style_map", None) - - result = mammoth.convert_to_html(docx_file, style_map=style_map) - html_content = result.value - result = self._convert(html_content) - - return result - - -class XlsxConverter(HtmlConverter): - """ - Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table. - """ - - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a XLSX - extension = kwargs.get("file_extension", "") - if extension.lower() != ".xlsx": - return None - - sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl") - md_content = "" - for s in sheets: - md_content += f"## {s}\n" - html_content = sheets[s].to_html(index=False) - md_content += self._convert(html_content).text_content.strip() + "\n\n" - - return DocumentConverterResult( - title=None, - text_content=md_content.strip(), - ) - - -class XlsConverter(HtmlConverter): - """ - Converts XLS files to Markdown, with each sheet presented as a separate Markdown table. - """ - - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a XLS - extension = kwargs.get("file_extension", "") - if extension.lower() != ".xls": - return None - - sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd") - md_content = "" - for s in sheets: - md_content += f"## {s}\n" - html_content = sheets[s].to_html(index=False) - md_content += self._convert(html_content).text_content.strip() + "\n\n" - - return DocumentConverterResult( - title=None, - text_content=md_content.strip(), - ) - - -class PptxConverter(HtmlConverter): - """ - Converts PPTX files to Markdown. Supports heading, tables and images with alt text. 
- """ - - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a PPTX - extension = kwargs.get("file_extension", "") - if extension.lower() != ".pptx": - return None - - md_content = "" - - presentation = pptx.Presentation(local_path) - slide_num = 0 - for slide in presentation.slides: - slide_num += 1 - - md_content += f"\n\n\n" - - title = slide.shapes.title - for shape in slide.shapes: - # Pictures - if self._is_picture(shape): - # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069 - alt_text = "" - try: - alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "") - except Exception: - pass - - # A placeholder name - filename = re.sub(r"\W", "", shape.name) + ".jpg" - md_content += ( - "\n![" - + (alt_text if alt_text else shape.name) - + "](" - + filename - + ")\n" - ) - - # Tables - if self._is_table(shape): - html_table = "" - first_row = True - for row in shape.table.rows: - html_table += "" - for cell in row.cells: - if first_row: - html_table += "" - else: - html_table += "" - html_table += "" - first_row = False - html_table += "
" + html.escape(cell.text) + "" + html.escape(cell.text) + "
" - md_content += ( - "\n" + self._convert(html_table).text_content.strip() + "\n" - ) - - # Charts - if shape.has_chart: - md_content += self._convert_chart_to_markdown(shape.chart) - - # Text areas - elif shape.has_text_frame: - if shape == title: - md_content += "# " + shape.text.lstrip() + "\n" - else: - md_content += shape.text + "\n" - - md_content = md_content.strip() - - if slide.has_notes_slide: - md_content += "\n\n### Notes:\n" - notes_frame = slide.notes_slide.notes_text_frame - if notes_frame is not None: - md_content += notes_frame.text - md_content = md_content.strip() - - return DocumentConverterResult( - title=None, - text_content=md_content.strip(), - ) - - def _is_picture(self, shape): - if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE: - return True - if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER: - if hasattr(shape, "image"): - return True - return False - - def _is_table(self, shape): - if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE: - return True - return False - - def _convert_chart_to_markdown(self, chart): - md = "\n\n### Chart" - if chart.has_title: - md += f": {chart.chart_title.text_frame.text}" - md += "\n\n" - data = [] - category_names = [c.label for c in chart.plots[0].categories] - series_names = [s.name for s in chart.series] - data.append(["Category"] + series_names) - - for idx, category in enumerate(category_names): - row = [category] - for series in chart.series: - row.append(series.values[idx]) - data.append(row) - - markdown_table = [] - for row in data: - markdown_table.append("| " + " | ".join(map(str, row)) + " |") - header = markdown_table[0] - separator = "|" + "|".join(["---"] * len(data[0])) + "|" - return md + "\n".join([header, separator] + markdown_table[1:]) - - -class MediaConverter(DocumentConverter): - """ - Abstract class for multi-modal media (e.g., images and audio) - """ - - def _get_metadata(self, local_path, exiftool_path=None): - if not exiftool_path: - which_exiftool = shutil.which("exiftool") - if which_exiftool: - warn( - f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g., - - md = MarkItDown(exiftool_path="{which_exiftool}") - -This warning will be removed in future releases. -""", - DeprecationWarning, - ) - - return None - else: - try: - result = subprocess.run( - [exiftool_path, "-json", local_path], capture_output=True, text=True - ).stdout - return json.loads(result)[0] - except Exception: - return None - - -class WavConverter(MediaConverter): - """ - Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed). 
- """ - - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a WAV - extension = kwargs.get("file_extension", "") - if extension.lower() != ".wav": - return None - - md_content = "" - - # Add metadata - metadata = self._get_metadata(local_path, kwargs.get("exiftool_path")) - if metadata: - for f in [ - "Title", - "Artist", - "Author", - "Band", - "Album", - "Genre", - "Track", - "DateTimeOriginal", - "CreateDate", - "Duration", - ]: - if f in metadata: - md_content += f"{f}: {metadata[f]}\n" - - # Transcribe - if IS_AUDIO_TRANSCRIPTION_CAPABLE: - try: - transcript = self._transcribe_audio(local_path) - md_content += "\n\n### Audio Transcript:\n" + ( - "[No speech detected]" if transcript == "" else transcript - ) - except Exception: - md_content += ( - "\n\n### Audio Transcript:\nError. Could not transcribe this audio." - ) - - return DocumentConverterResult( - title=None, - text_content=md_content.strip(), - ) - - def _transcribe_audio(self, local_path) -> str: - recognizer = sr.Recognizer() - with sr.AudioFile(local_path) as source: - audio = recognizer.record(source) - return recognizer.recognize_google(audio).strip() - - -class Mp3Converter(WavConverter): - """ - Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed). - """ - - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a MP3 - extension = kwargs.get("file_extension", "") - if extension.lower() != ".mp3": - return None - - md_content = "" - - # Add metadata - metadata = self._get_metadata(local_path, kwargs.get("exiftool_path")) - if metadata: - for f in [ - "Title", - "Artist", - "Author", - "Band", - "Album", - "Genre", - "Track", - "DateTimeOriginal", - "CreateDate", - "Duration", - ]: - if f in metadata: - md_content += f"{f}: {metadata[f]}\n" - - # Transcribe - if IS_AUDIO_TRANSCRIPTION_CAPABLE: - handle, temp_path = tempfile.mkstemp(suffix=".wav") - os.close(handle) - try: - sound = pydub.AudioSegment.from_mp3(local_path) - sound.export(temp_path, format="wav") - - _args = dict() - _args.update(kwargs) - _args["file_extension"] = ".wav" - - try: - transcript = super()._transcribe_audio(temp_path).strip() - md_content += "\n\n### Audio Transcript:\n" + ( - "[No speech detected]" if transcript == "" else transcript - ) - except Exception: - md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio." - - finally: - os.unlink(temp_path) - - # Return the result - return DocumentConverterResult( - title=None, - text_content=md_content.strip(), - ) - - -class ImageConverter(MediaConverter): - """ - Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured). 
- """ - - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not an image - extension = kwargs.get("file_extension", "") - if extension.lower() not in [".jpg", ".jpeg", ".png"]: - return None - - md_content = "" - - # Add metadata - metadata = self._get_metadata(local_path, kwargs.get("exiftool_path")) - if metadata: - for f in [ - "ImageSize", - "Title", - "Caption", - "Description", - "Keywords", - "Artist", - "Author", - "DateTimeOriginal", - "CreateDate", - "GPSPosition", - ]: - if f in metadata: - md_content += f"{f}: {metadata[f]}\n" - - # Try describing the image with GPTV - llm_client = kwargs.get("llm_client") - llm_model = kwargs.get("llm_model") - if llm_client is not None and llm_model is not None: - md_content += ( - "\n# Description:\n" - + self._get_llm_description( - local_path, - extension, - llm_client, - llm_model, - prompt=kwargs.get("llm_prompt"), - ).strip() - + "\n" - ) - - return DocumentConverterResult( - title=None, - text_content=md_content, - ) - - def _get_llm_description(self, local_path, extension, client, model, prompt=None): - if prompt is None or prompt.strip() == "": - prompt = "Write a detailed caption for this image." - - data_uri = "" - with open(local_path, "rb") as image_file: - content_type, encoding = mimetypes.guess_type("_dummy" + extension) - if content_type is None: - content_type = "image/jpeg" - image_base64 = base64.b64encode(image_file.read()).decode("utf-8") - data_uri = f"data:{content_type};base64,{image_base64}" - - messages = [ - { - "role": "user", - "content": [ - {"type": "text", "text": prompt}, - { - "type": "image_url", - "image_url": { - "url": data_uri, - }, - }, - ], - } - ] - - response = client.chat.completions.create(model=model, messages=messages) - return response.choices[0].message.content - - -class OutlookMsgConverter(DocumentConverter): - """Converts Outlook .msg files to markdown by extracting email metadata and content. 
- - Uses the olefile package to parse the .msg file structure and extract: - - Email headers (From, To, Subject) - - Email body content - """ - - def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - # Bail if not a MSG file - extension = kwargs.get("file_extension", "") - if extension.lower() != ".msg": - return None - - try: - msg = olefile.OleFileIO(local_path) - # Extract email metadata - md_content = "# Email Message\n\n" - - # Get headers - headers = { - "From": self._get_stream_data(msg, "__substg1.0_0C1F001F"), - "To": self._get_stream_data(msg, "__substg1.0_0E04001F"), - "Subject": self._get_stream_data(msg, "__substg1.0_0037001F"), - } - - # Add headers to markdown - for key, value in headers.items(): - if value: - md_content += f"**{key}:** {value}\n" - - md_content += "\n## Content\n\n" - - # Get email body - body = self._get_stream_data(msg, "__substg1.0_1000001F") - if body: - md_content += body - - msg.close() - - return DocumentConverterResult( - title=headers.get("Subject"), text_content=md_content.strip() - ) - - except Exception as e: - raise FileConversionException( - f"Could not convert MSG file '{local_path}': {str(e)}" - ) - - def _get_stream_data( - self, msg: olefile.OleFileIO, stream_path: str - ) -> Union[str, None]: - """Helper to safely extract and decode stream data from the MSG file.""" - try: - if msg.exists(stream_path): - data = msg.openstream(stream_path).read() - # Try UTF-16 first (common for .msg files) - try: - return data.decode("utf-16-le").strip() - except UnicodeDecodeError: - # Fall back to UTF-8 - try: - return data.decode("utf-8").strip() - except UnicodeDecodeError: - # Last resort - ignore errors - return data.decode("utf-8", errors="ignore").strip() - except Exception: - pass - return None - - -class ZipConverter(DocumentConverter): - """Converts ZIP files to markdown by extracting and converting all contained files. - - The converter extracts the ZIP contents to a temporary directory, processes each file - using appropriate converters based on file extensions, and then combines the results - into a single markdown document. The temporary directory is cleaned up after processing. 
- - Example output format: - ```markdown - Content from the zip file `example.zip`: - - ## File: docs/readme.txt - - This is the content of readme.txt - Multiple lines are preserved - - ## File: images/example.jpg - - ImageSize: 1920x1080 - DateTimeOriginal: 2024-02-15 14:30:00 - Description: A beautiful landscape photo - - ## File: data/report.xlsx - - ## Sheet1 - | Column1 | Column2 | Column3 | - |---------|---------|---------| - | data1 | data2 | data3 | - | data4 | data5 | data6 | - ``` - - Key features: - - Maintains original file structure in headings - - Processes nested files recursively - - Uses appropriate converters for each file type - - Preserves formatting of converted content - - Cleans up temporary files after processing - """ - - def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - # Bail if not a ZIP - extension = kwargs.get("file_extension", "") - if extension.lower() != ".zip": - return None - - # Get parent converters list if available - parent_converters = kwargs.get("_parent_converters", []) - if not parent_converters: - return DocumentConverterResult( - title=None, - text_content=f"[ERROR] No converters available to process zip contents from: {local_path}", - ) - - extracted_zip_folder_name = ( - f"extracted_{os.path.basename(local_path).replace('.zip', '_zip')}" - ) - extraction_dir = os.path.normpath( - os.path.join(os.path.dirname(local_path), extracted_zip_folder_name) - ) - md_content = f"Content from the zip file `{os.path.basename(local_path)}`:\n\n" - - try: - # Extract the zip file safely - with zipfile.ZipFile(local_path, "r") as zipObj: - # Safeguard against path traversal - for member in zipObj.namelist(): - member_path = os.path.normpath(os.path.join(extraction_dir, member)) - if ( - not os.path.commonprefix([extraction_dir, member_path]) - == extraction_dir - ): - raise ValueError( - f"Path traversal detected in zip file: {member}" - ) - - # Extract all files safely - zipObj.extractall(path=extraction_dir) - - # Process each extracted file - for root, dirs, files in os.walk(extraction_dir): - for name in files: - file_path = os.path.join(root, name) - relative_path = os.path.relpath(file_path, extraction_dir) - - # Get file extension - _, file_extension = os.path.splitext(name) - - # Update kwargs for the file - file_kwargs = kwargs.copy() - file_kwargs["file_extension"] = file_extension - file_kwargs["_parent_converters"] = parent_converters - - # Try converting the file using available converters - for converter in parent_converters: - # Skip the zip converter to avoid infinite recursion - if isinstance(converter, ZipConverter): - continue - - result = converter.convert(file_path, **file_kwargs) - if result is not None: - md_content += f"\n## File: {relative_path}\n\n" - md_content += result.text_content + "\n\n" - break - - # Clean up extracted files if specified - if kwargs.get("cleanup_extracted", True): - shutil.rmtree(extraction_dir) - - return DocumentConverterResult(title=None, text_content=md_content.strip()) - - except zipfile.BadZipFile: - return DocumentConverterResult( - title=None, - text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}", - ) - except ValueError as ve: - return DocumentConverterResult( - title=None, - text_content=f"[ERROR] Security error in zip file {local_path}: {str(ve)}", - ) - except Exception as e: - return DocumentConverterResult( - title=None, - text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}", - ) - - -class 
FileConversionException(BaseException): - pass - - -class UnsupportedFormatException(BaseException): - pass - - -class MarkItDown: - """(In preview) An extremely simple text-based document reader, suitable for LLM use. - This reader will convert common file-types or webpages to Markdown.""" - - def __init__( - self, - requests_session: Optional[requests.Session] = None, - llm_client: Optional[Any] = None, - llm_model: Optional[str] = None, - style_map: Optional[str] = None, - exiftool_path: Optional[str] = None, - # Deprecated - mlm_client: Optional[Any] = None, - mlm_model: Optional[str] = None, - ): - if requests_session is None: - self._requests_session = requests.Session() - else: - self._requests_session = requests_session - - if exiftool_path is None: - exiftool_path = os.environ.get("EXIFTOOL_PATH") - - # Handle deprecation notices - ############################# - if mlm_client is not None: - if llm_client is None: - warn( - "'mlm_client' is deprecated, and was renamed 'llm_client'.", - DeprecationWarning, - ) - llm_client = mlm_client - mlm_client = None - else: - raise ValueError( - "'mlm_client' is deprecated, and was renamed 'llm_client'. Do not use both at the same time. Just use 'llm_client' instead." - ) - - if mlm_model is not None: - if llm_model is None: - warn( - "'mlm_model' is deprecated, and was renamed 'llm_model'.", - DeprecationWarning, - ) - llm_model = mlm_model - mlm_model = None - else: - raise ValueError( - "'mlm_model' is deprecated, and was renamed 'llm_model'. Do not use both at the same time. Just use 'llm_model' instead." - ) - ############################# - - self._llm_client = llm_client - self._llm_model = llm_model - self._style_map = style_map - self._exiftool_path = exiftool_path - - self._page_converters: List[DocumentConverter] = [] - - # Register converters for successful browsing operations - # Later registrations are tried first / take higher priority than earlier registrations - # To this end, the most specific converters should appear below the most generic converters - self.register_page_converter(PlainTextConverter()) - self.register_page_converter(HtmlConverter()) - self.register_page_converter(RSSConverter()) - self.register_page_converter(WikipediaConverter()) - self.register_page_converter(YouTubeConverter()) - self.register_page_converter(BingSerpConverter()) - self.register_page_converter(DocxConverter()) - self.register_page_converter(XlsxConverter()) - self.register_page_converter(XlsConverter()) - self.register_page_converter(PptxConverter()) - self.register_page_converter(WavConverter()) - self.register_page_converter(Mp3Converter()) - self.register_page_converter(ImageConverter()) - self.register_page_converter(IpynbConverter()) - self.register_page_converter(PdfConverter()) - self.register_page_converter(ZipConverter()) - self.register_page_converter(OutlookMsgConverter()) - - def convert( - self, source: Union[str, requests.Response, Path], **kwargs: Any - ) -> DocumentConverterResult: # TODO: deal with kwargs - """ - Args: - - source: can be a string representing a path either as string pathlib path object or url, or a requests.response object - - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.) 
- """ - - # Local path or url - if isinstance(source, str): - if ( - source.startswith("http://") - or source.startswith("https://") - or source.startswith("file://") - ): - return self.convert_url(source, **kwargs) - else: - return self.convert_local(source, **kwargs) - # Request response - elif isinstance(source, requests.Response): - return self.convert_response(source, **kwargs) - elif isinstance(source, Path): - return self.convert_local(source, **kwargs) - - def convert_local( - self, path: Union[str, Path], **kwargs: Any - ) -> DocumentConverterResult: # TODO: deal with kwargs - if isinstance(path, Path): - path = str(path) - # Prepare a list of extensions to try (in order of priority) - ext = kwargs.get("file_extension") - extensions = [ext] if ext is not None else [] - - # Get extension alternatives from the path and puremagic - base, ext = os.path.splitext(path) - self._append_ext(extensions, ext) - - for g in self._guess_ext_magic(path): - self._append_ext(extensions, g) - - # Convert - return self._convert(path, extensions, **kwargs) - - # TODO what should stream's type be? - def convert_stream( - self, stream: Any, **kwargs: Any - ) -> DocumentConverterResult: # TODO: deal with kwargs - # Prepare a list of extensions to try (in order of priority) - ext = kwargs.get("file_extension") - extensions = [ext] if ext is not None else [] - - # Save the file locally to a temporary file. It will be deleted before this method exits - handle, temp_path = tempfile.mkstemp() - fh = os.fdopen(handle, "wb") - result = None - try: - # Write to the temporary file - content = stream.read() - if isinstance(content, str): - fh.write(content.encode("utf-8")) - else: - fh.write(content) - fh.close() - - # Use puremagic to check for more extension options - for g in self._guess_ext_magic(temp_path): - self._append_ext(extensions, g) - - # Convert - result = self._convert(temp_path, extensions, **kwargs) - # Clean up - finally: - try: - fh.close() - except Exception: - pass - os.unlink(temp_path) - - return result - - def convert_url( - self, url: str, **kwargs: Any - ) -> DocumentConverterResult: # TODO: fix kwargs type - # Send a HTTP request to the URL - response = self._requests_session.get(url, stream=True) - response.raise_for_status() - return self.convert_response(response, **kwargs) - - def convert_response( - self, response: requests.Response, **kwargs: Any - ) -> DocumentConverterResult: # TODO fix kwargs type - # Prepare a list of extensions to try (in order of priority) - ext = kwargs.get("file_extension") - extensions = [ext] if ext is not None else [] - - # Guess from the mimetype - content_type = response.headers.get("content-type", "").split(";")[0] - self._append_ext(extensions, mimetypes.guess_extension(content_type)) - - # Read the content disposition if there is one - content_disposition = response.headers.get("content-disposition", "") - m = re.search(r"filename=([^;]+)", content_disposition) - if m: - base, ext = os.path.splitext(m.group(1).strip("\"'")) - self._append_ext(extensions, ext) - - # Read from the extension from the path - base, ext = os.path.splitext(urlparse(response.url).path) - self._append_ext(extensions, ext) - - # Save the file locally to a temporary file. 
It will be deleted before this method exits - handle, temp_path = tempfile.mkstemp() - fh = os.fdopen(handle, "wb") - result = None - try: - # Download the file - for chunk in response.iter_content(chunk_size=512): - fh.write(chunk) - fh.close() - - # Use puremagic to check for more extension options - for g in self._guess_ext_magic(temp_path): - self._append_ext(extensions, g) - - # Convert - result = self._convert(temp_path, extensions, url=response.url, **kwargs) - # Clean up - finally: - try: - fh.close() - except Exception: - pass - os.unlink(temp_path) - - return result - - def _convert( - self, local_path: str, extensions: List[Union[str, None]], **kwargs - ) -> DocumentConverterResult: - error_trace = "" - for ext in extensions + [None]: # Try last with no extension - for converter in self._page_converters: - _kwargs = copy.deepcopy(kwargs) - - # Overwrite file_extension appropriately - if ext is None: - if "file_extension" in _kwargs: - del _kwargs["file_extension"] - else: - _kwargs.update({"file_extension": ext}) - - # Copy any additional global options - if "llm_client" not in _kwargs and self._llm_client is not None: - _kwargs["llm_client"] = self._llm_client - - if "llm_model" not in _kwargs and self._llm_model is not None: - _kwargs["llm_model"] = self._llm_model - - if "style_map" not in _kwargs and self._style_map is not None: - _kwargs["style_map"] = self._style_map - - if "exiftool_path" not in _kwargs and self._exiftool_path is not None: - _kwargs["exiftool_path"] = self._exiftool_path - - # Add the list of converters for nested processing - _kwargs["_parent_converters"] = self._page_converters - - # If we hit an error log it and keep trying - try: - res = converter.convert(local_path, **_kwargs) - except Exception: - error_trace = ("\n\n" + traceback.format_exc()).strip() - - if res is not None: - # Normalize the content - res.text_content = "\n".join( - [line.rstrip() for line in re.split(r"\r?\n", res.text_content)] - ) - res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content) - - # Todo - return res - - # If we got this far without success, report any exceptions - if len(error_trace) > 0: - raise FileConversionException( - f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}" - ) - - # Nothing can handle it! - raise UnsupportedFormatException( - f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported." - ) - - def _append_ext(self, extensions, ext): - """Append a unique non-None, non-empty extension to a list of extensions.""" - if ext is None: - return - ext = ext.strip() - if ext == "": - return - # if ext not in extensions: - extensions.append(ext) - - def _guess_ext_magic(self, path): - """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes.""" - # Use puremagic to guess - try: - guesses = puremagic.magic_file(path) - - # Fix for: https://github.com/microsoft/markitdown/issues/222 - # If there are no guesses, then try again after trimming leading ASCII whitespaces. - # ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f' - # (space, tab, newline, carriage return, vertical tab, form feed). 
-            if len(guesses) == 0:
-                with open(path, "rb") as file:
-                    while True:
-                        char = file.read(1)
-                        if not char:  # End of file
-                            break
-                        if not char.isspace():
-                            file.seek(file.tell() - 1)
-                            break
-                    try:
-                        guesses = puremagic.magic_stream(file)
-                    except puremagic.main.PureError:
-                        pass
-
-            extensions = list()
-            for g in guesses:
-                ext = g.extension.strip()
-                if len(ext) > 0:
-                    if not ext.startswith("."):
-                        ext = "." + ext
-                    if ext not in extensions:
-                        extensions.append(ext)
-            return extensions
-        except FileNotFoundError:
-            pass
-        except IsADirectoryError:
-            pass
-        except PermissionError:
-            pass
-        return []
-
-    def register_page_converter(self, converter: DocumentConverter) -> None:
-        """Register a page text converter."""
-        self._page_converters.insert(0, converter)
diff --git a/src/markitdown/index.ts b/src/markitdown/index.ts
new file mode 100644
index 0000000..0aac794
--- /dev/null
+++ b/src/markitdown/index.ts
@@ -0,0 +1,55 @@
+import * as fs from 'fs';
+import * as readline from 'readline';
+import { MarkItDown, DocumentConverterResult } from './markitdown';
+
+const markitdown = new MarkItDown();
+
+function convertFile(filePath: string, outputFilePath?: string): void {
+  // convert() is synchronous for local paths but may return a Promise for
+  // URLs, so normalize through Promise.resolve().
+  Promise.resolve(markitdown.convert(filePath)).then((result: DocumentConverterResult) => {
+    handleOutput(result, outputFilePath);
+  });
+}
+
+function convertStdin(inputData: string, outputFilePath?: string): void {
+  const result: DocumentConverterResult = markitdown.convert_stream(inputData);
+  handleOutput(result, outputFilePath);
+}
+
+function handleOutput(result: DocumentConverterResult, outputFilePath?: string): void {
+  if (outputFilePath) {
+    fs.writeFileSync(outputFilePath, result.text_content, 'utf-8');
+  } else {
+    console.log(result.text_content);
+  }
+}
+
+function main(): void {
+  const args = process.argv.slice(2);
+  const inputFilePath = args[0];
+  const outputFilePath = args[1];
+
+  if (inputFilePath) {
+    convertFile(inputFilePath, outputFilePath);
+  } else {
+    const rl = readline.createInterface({
+      input: process.stdin,
+      output: process.stdout,
+      terminal: false
+    });
+
+    let inputData = '';
+    rl.on('line', (line) => {
+      inputData += line + '\n';
+    });
+
+    rl.on('close', () => {
+      // Convert the buffered stdin text directly; the previous version passed
+      // the text to fs.createReadStream() as if it were a file path.
+      convertStdin(inputData, outputFilePath);
+    });
+  }
+}
+
+main();
diff --git a/src/markitdown/markitdown.ts b/src/markitdown/markitdown.ts
new file mode 100644
index 0000000..45739c4
--- /dev/null
+++ b/src/markitdown/markitdown.ts
@@ -0,0 +1,1072 @@
+import * as fs from 'fs';
+import * as os from 'os';
+import * as path from 'path';
+import * as stream from 'stream';
+import { execFileSync } from 'child_process';
+import * as mime from 'mime-types';
+import which from 'which';
+import AdmZip from 'adm-zip';
+import { DOMParser } from '@xmldom/xmldom';
+// NOTE: the bindings below assume npm ports that expose the same API surface
+// as the original Python packages; several do not exist under these names.
+import * as mammoth from 'mammoth';
+import * as markdownify from 'markdownify';
+import * as olefile from 'olefile';
+import * as pdfminer from 'pdfminer';
+import * as pptx from 'pptx';
+import * as puremagic from 'puremagic';
+import * as requests from 'requests';
+import * as BeautifulSoup from 'beautifulsoup';
+import * as pydub from 'pydub';
+import * as sr from 'speech_recognition';
+import * as YouTubeTranscriptApi from 'youtube_transcript_api';
+import * as pandas from 'pandas';
+import * as openai from 'openai';
+
+class CustomMarkdownify extends markdownify.MarkdownConverter {
+  constructor(options: any = {}) {
+    options.heading_style = options.heading_style || markdownify.ATX;
+    super(options);
+  }
+
+  convert_hn(n: number, el: any, text: string, convert_as_inline: boolean): string {
+    if (!convert_as_inline) {
+      if 
(!/^\n/.test(text)) { + return "\n" + super.convert_hn(n, el, text, convert_as_inline); + } + } + return super.convert_hn(n, el, text, convert_as_inline); + } + + convert_a(el: any, text: string, convert_as_inline: boolean): string { + const { prefix, suffix, text: chompedText } = markdownify.chomp(text); + if (!chompedText) { + return ""; + } + let href = el.get("href"); + const title = el.get("title"); + + if (href) { + try { + const parsed_url = new URL(href); + if (parsed_url.protocol && !["http:", "https:", "file:"].includes(parsed_url.protocol.toLowerCase())) { + return `${prefix}${chompedText}${suffix}`; + } + href = parsed_url.toString(); + } catch (e) { + return `${prefix}${chompedText}${suffix}`; + } + } + + if (this.options.autolinks && chompedText.replace(/\\_/g, "_") === href && !title && !this.options.default_title) { + return `<${href}>`; + } + const title_part = title ? ` "${title.replace(/"/g, '\\"')}"` : ""; + return href ? `${prefix}[${chompedText}](${href}${title_part})${suffix}` : chompedText; + } + + convert_img(el: any, text: string, convert_as_inline: boolean): string { + const alt = el.attrs.alt || ""; + let src = el.attrs.src || ""; + const title = el.attrs.title || ""; + const title_part = title ? ` "${title.replace(/"/g, '\\"')}"` : ""; + + if (convert_as_inline && !this.options.keep_inline_images_in.includes(el.parent.name)) { + return alt; + } + + if (src.startsWith("data:")) { + src = src.split(",")[0] + "..."; + } + + return `![${alt}](${src}${title_part})`; + } + + convert_soup(soup: any): string { + return super.convert_soup(soup); + } +} + +class DocumentConverterResult { + title: string | null; + text_content: string; + + constructor(title: string | null = null, text_content: string = "") { + this.title = title; + this.text_content = text_content; + } +} + +abstract class DocumentConverter { + abstract convert(local_path: string, ...args: any[]): DocumentConverterResult | null; +} + +class PlainTextConverter extends DocumentConverter { + convert(local_path: string, ...args: any[]): DocumentConverterResult | null { + const content_type = mime.lookup(local_path) || ""; + if (!content_type.startsWith("text/") && content_type !== "application/json") { + return null; + } + + const text_content = fs.readFileSync(local_path, 'utf-8'); + return new DocumentConverterResult(null, text_content); + } +} + +class HtmlConverter extends DocumentConverter { + convert(local_path: string, ...args: any[]): DocumentConverterResult | null { + const extension = path.extname(local_path).toLowerCase(); + if (![".html", ".htm"].includes(extension)) { + return null; + } + + const html_content = fs.readFileSync(local_path, 'utf-8'); + return this._convert(html_content); + } + + _convert(html_content: string): DocumentConverterResult | null { + const soup = new BeautifulSoup(html_content, "html.parser"); + + for (const script of soup(["script", "style"])) { + script.extract(); + } + + const body_elm = soup.find("body"); + const webpage_text = body_elm ? new CustomMarkdownify().convert_soup(body_elm) : new CustomMarkdownify().convert_soup(soup); + + return new DocumentConverterResult(soup.title ? 
soup.title.string : null, webpage_text); + } +} + +class RSSConverter extends DocumentConverter { + convert(local_path: string, ...args: any[]): DocumentConverterResult | null { + const extension = path.extname(local_path).toLowerCase(); + if (![".xml", ".rss", ".atom"].includes(extension)) { + return null; + } + + const doc = new DOMParser().parseFromString(fs.readFileSync(local_path, 'utf-8'), "application/xml"); + if (doc.getElementsByTagName("rss").length > 0) { + return this._parse_rss_type(doc); + } else if (doc.getElementsByTagName("feed").length > 0) { + return this._parse_atom_type(doc); + } else { + return null; + } + } + + _parse_atom_type(doc: Document): DocumentConverterResult | null { + try { + const root = doc.getElementsByTagName("feed")[0]; + const title = this._get_data_by_tag_name(root, "title"); + const subtitle = this._get_data_by_tag_name(root, "subtitle"); + const entries = root.getElementsByTagName("entry"); + let md_text = `# ${title}\n`; + if (subtitle) { + md_text += `${subtitle}\n`; + } + for (const entry of entries) { + const entry_title = this._get_data_by_tag_name(entry, "title"); + const entry_summary = this._get_data_by_tag_name(entry, "summary"); + const entry_updated = this._get_data_by_tag_name(entry, "updated"); + const entry_content = this._get_data_by_tag_name(entry, "content"); + + if (entry_title) { + md_text += `\n## ${entry_title}\n`; + } + if (entry_updated) { + md_text += `Updated on: ${entry_updated}\n`; + } + if (entry_summary) { + md_text += this._parse_content(entry_summary); + } + if (entry_content) { + md_text += this._parse_content(entry_content); + } + } + + return new DocumentConverterResult(title, md_text); + } catch (e) { + return null; + } + } + + _parse_rss_type(doc: Document): DocumentConverterResult | null { + try { + const root = doc.getElementsByTagName("rss")[0]; + const channel = root.getElementsByTagName("channel")[0]; + const channel_title = this._get_data_by_tag_name(channel, "title"); + const channel_description = this._get_data_by_tag_name(channel, "description"); + const items = channel.getElementsByTagName("item"); + let md_text = `# ${channel_title}\n`; + if (channel_description) { + md_text += `${channel_description}\n`; + } + for (const item of items) { + const title = this._get_data_by_tag_name(item, "title"); + const description = this._get_data_by_tag_name(item, "description"); + const pubDate = this._get_data_by_tag_name(item, "pubDate"); + const content = this._get_data_by_tag_name(item, "content:encoded"); + + if (title) { + md_text += `\n## ${title}\n`; + } + if (pubDate) { + md_text += `Published on: ${pubDate}\n`; + } + if (description) { + md_text += this._parse_content(description); + } + if (content) { + md_text += this._parse_content(content); + } + } + + return new DocumentConverterResult(channel_title, md_text); + } catch (e) { + return null; + } + } + + _parse_content(content: string): string { + try { + const soup = new BeautifulSoup(content, "html.parser"); + return new CustomMarkdownify().convert_soup(soup); + } catch (e) { + return content; + } + } + + _get_data_by_tag_name(element: Element, tag_name: string): string | null { + const nodes = element.getElementsByTagName(tag_name); + if (nodes.length === 0) { + return null; + } + const fc = nodes[0].firstChild; + return fc ? 
fc.nodeValue : null; + } +} + +class WikipediaConverter extends HtmlConverter { + convert(local_path: string, ...args: any[]): DocumentConverterResult | null { + const extension = path.extname(local_path).toLowerCase(); + const url = args[0]?.url || ""; + if (![".html", ".htm"].includes(extension) || !/^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\//.test(url)) { + return null; + } + + const html_content = fs.readFileSync(local_path, 'utf-8'); + const soup = new BeautifulSoup(html_content, "html.parser"); + + for (const script of soup(["script", "style"])) { + script.extract(); + } + + const body_elm = soup.find("div", { id: "mw-content-text" }); + const title_elm = soup.find("span", { class: "mw-page-title-main" }); + + let webpage_text = ""; + let main_title = soup.title ? soup.title.string : null; + + if (body_elm) { + if (title_elm && title_elm.length > 0) { + main_title = title_elm.string; + } + webpage_text = `# ${main_title}\n\n` + new CustomMarkdownify().convert_soup(body_elm); + } else { + webpage_text = new CustomMarkdownify().convert_soup(soup); + } + + return new DocumentConverterResult(main_title, webpage_text); + } +} + +class YouTubeConverter extends HtmlConverter { + convert(local_path: string, ...args: any[]): DocumentConverterResult | null { + const extension = path.extname(local_path).toLowerCase(); + const url = args[0]?.url || ""; + if (![".html", ".htm"].includes(extension) || !url.startsWith("https://www.youtube.com/watch?")) { + return null; + } + + const html_content = fs.readFileSync(local_path, 'utf-8'); + const soup = new BeautifulSoup(html_content, "html.parser"); + + const metadata: { [key: string]: string } = { title: soup.title ? soup.title.string : "" }; + for (const meta of soup(["meta"])) { + for (const a in meta.attrs) { + if (["itemprop", "property", "name"].includes(a)) { + metadata[meta[a]] = meta.get("content", ""); + break; + } + } + } + + try { + for (const script of soup(["script"])) { + const content = script.text; + if (content.includes("ytInitialData")) { + const lines = content.split(/\r?\n/); + const obj_start = lines[0].indexOf("{"); + const obj_end = lines[0].lastIndexOf("}"); + if (obj_start >= 0 && obj_end >= 0) { + const data = JSON.parse(lines[0].substring(obj_start, obj_end + 1)); + const attrdesc = this._findKey(data, "attributedDescriptionBodyText"); + if (attrdesc) { + metadata["description"] = attrdesc.content; + } + } + break; + } + } + } catch (e) {} + + let webpage_text = "# YouTube\n"; + const title = this._get(metadata, ["title", "og:title", "name"]) || ""; + if (title) { + webpage_text += `\n## ${title}\n`; + } + + let stats = ""; + const views = this._get(metadata, ["interactionCount"]); + if (views) { + stats += `- **Views:** ${views}\n`; + } + + const keywords = this._get(metadata, ["keywords"]); + if (keywords) { + stats += `- **Keywords:** ${keywords}\n`; + } + + const runtime = this._get(metadata, ["duration"]); + if (runtime) { + stats += `- **Runtime:** ${runtime}\n`; + } + + if (stats) { + webpage_text += `\n### Video Metadata\n${stats}\n`; + } + + const description = this._get(metadata, ["description", "og:description"]); + if (description) { + webpage_text += `\n### Description\n${description}\n`; + } + + if (YouTubeTranscriptApi) { + let transcript_text = ""; + const parsed_url = new URL(url); + const params = new URLSearchParams(parsed_url.search); + if (params.has("v")) { + const video_id = params.get("v"); + try { + const youtube_transcript_languages = args[0]?.youtube_transcript_languages || ["en"]; + const 
transcript = YouTubeTranscriptApi.getTranscript(video_id, { languages: youtube_transcript_languages }); + transcript_text = transcript.map((part: any) => part.text).join(" "); + } catch (e) {} + } + if (transcript_text) { + webpage_text += `\n### Transcript\n${transcript_text}\n`; + } + } + + return new DocumentConverterResult(title, webpage_text); + } + + _get(metadata: { [key: string]: string }, keys: string[], defaultValue: string | null = null): string | null { + for (const key of keys) { + if (metadata[key]) { + return metadata[key]; + } + } + return defaultValue; + } + + _findKey(json: any, key: string): any { + if (Array.isArray(json)) { + for (const elm of json) { + const ret = this._findKey(elm, key); + if (ret !== null) { + return ret; + } + } + } else if (typeof json === "object") { + for (const k in json) { + if (k === key) { + return json[k]; + } else { + const ret = this._findKey(json[k], key); + if (ret !== null) { + return ret; + } + } + } + } + return null; + } +} + +class IpynbConverter extends DocumentConverter { + convert(local_path: string, ...args: any[]): DocumentConverterResult | null { + const extension = path.extname(local_path).toLowerCase(); + if (extension !== ".ipynb") { + return null; + } + + const notebook_content = JSON.parse(fs.readFileSync(local_path, 'utf-8')); + return this._convert(notebook_content); + } + + _convert(notebook_content: any): DocumentConverterResult | null { + try { + const md_output: string[] = []; + let title: string | null = null; + + for (const cell of notebook_content.cells) { + const cell_type = cell.cell_type; + const source_lines = cell.source; + + if (cell_type === "markdown") { + md_output.push(source_lines.join("")); + + if (!title) { + for (const line of source_lines) { + if (line.startsWith("# ")) { + title = line.replace(/^# /, "").trim(); + break; + } + } + } + } else if (cell_type === "code") { + md_output.push(`\`\`\`python\n${source_lines.join("")}\n\`\`\``); + } else if (cell_type === "raw") { + md_output.push(`\`\`\`\n${source_lines.join("")}\n\`\`\``); + } + } + + const md_text = md_output.join("\n\n"); + title = notebook_content.metadata.title || title; + + return new DocumentConverterResult(title, md_text); + } catch (e) { + throw new Error(`Error converting .ipynb file: ${e.message}`); + } + } +} + +class BingSerpConverter extends HtmlConverter { + convert(local_path: string, ...args: any[]): DocumentConverterResult | null { + const extension = path.extname(local_path).toLowerCase(); + const url = args[0]?.url || ""; + if (![".html", ".htm"].includes(extension) || !/^https:\/\/www\.bing\.com\/search\?q=/.test(url)) { + return null; + } + + const html_content = fs.readFileSync(local_path, 'utf-8'); + const soup = new BeautifulSoup(html_content, "html.parser"); + + for (const tptt of soup.find_all({ class: "tptt" })) { + if (tptt.string) { + tptt.string += " "; + } + } + for (const slug of soup.find_all({ class: "algoSlug_icon" })) { + slug.extract(); + } + + const results: string[] = []; + for (const result of soup.find_all({ class: "b_algo" })) { + for (const a of result.find_all("a", { href: true })) { + const parsed_href = new URL(a.attrs.href); + const qs = new URLSearchParams(parsed_href.search); + + if (qs.has("u")) { + let u = qs.get("u") || ""; + u = u.slice(2).trim() + "=="; + + try { + a.attrs.href = Buffer.from(u, "base64").toString("utf-8"); + } catch (e) {} + } + } + + const md_result = new CustomMarkdownify().convert_soup(result).trim(); + const lines = md_result.split(/\n+/).map(line => line.trim()); 
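+      // Keep only non-empty lines so each Bing result renders as one compact block.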
+      results.push(lines.filter(line => line.length > 0).join("\n"));
+    }
+
+    const query = new URLSearchParams(new URL(url).search).get("q") || "";
+    const webpage_text = `## A Bing search for '${query}' found the following results:\n\n${results.join("\n\n")}`;
+
+    return new DocumentConverterResult(soup.title ? soup.title.string : null, webpage_text);
+  }
+}
+
+// Minimal replacement for Python's html.escape, used when rebuilding
+// PowerPoint tables as HTML before converting them to Markdown.
+function escapeHtml(text: string): string {
+  return text
+    .replace(/&/g, "&amp;")
+    .replace(/</g, "&lt;")
+    .replace(/>/g, "&gt;")
+    .replace(/"/g, "&quot;");
+}
+
+class PdfConverter extends DocumentConverter {
+  convert(local_path: string, ...args: any[]): DocumentConverterResult | null {
+    const extension = path.extname(local_path).toLowerCase();
+    if (extension !== ".pdf") {
+      return null;
+    }
+
+    const text_content = pdfminer.high_level.extract_text(local_path);
+    return new DocumentConverterResult(null, text_content);
+  }
+}
+
+class DocxConverter extends HtmlConverter {
+  convert(local_path: string, ...args: any[]): DocumentConverterResult | null {
+    const extension = path.extname(local_path).toLowerCase();
+    if (extension !== ".docx") {
+      return null;
+    }
+
+    const result = mammoth.convert_to_html({ path: local_path });
+    return this._convert(result.value);
+  }
+}
+
+class XlsxConverter extends HtmlConverter {
+  convert(local_path: string, ...args: any[]): DocumentConverterResult | null {
+    const extension = path.extname(local_path).toLowerCase();
+    if (extension !== ".xlsx") {
+      return null;
+    }
+
+    const sheets = pandas.read_excel(local_path, { sheet_name: null, engine: "openpyxl" });
+    let md_content = "";
+    for (const sheet in sheets) {
+      md_content += `## ${sheet}\n`;
+      const html_content = sheets[sheet].to_html({ index: false });
+      md_content += this._convert(html_content).text_content.trim() + "\n\n";
+    }
+
+    return new DocumentConverterResult(null, md_content.trim());
+  }
+}
+
+class XlsConverter extends HtmlConverter {
+  convert(local_path: string, ...args: any[]): DocumentConverterResult | null {
+    const extension = path.extname(local_path).toLowerCase();
+    if (extension !== ".xls") {
+      return null;
+    }
+
+    const sheets = pandas.read_excel(local_path, { sheet_name: null, engine: "xlrd" });
+    let md_content = "";
+    for (const sheet in sheets) {
+      md_content += `## ${sheet}\n`;
+      const html_content = sheets[sheet].to_html({ index: false });
+      md_content += this._convert(html_content).text_content.trim() + "\n\n";
+    }
+
+    return new DocumentConverterResult(null, md_content.trim());
+  }
+}
+
+class PptxConverter extends HtmlConverter {
+  convert(local_path: string, ...args: any[]): DocumentConverterResult | null {
+    const extension = path.extname(local_path).toLowerCase();
+    if (extension !== ".pptx") {
+      return null;
+    }
+
+    const presentation = new pptx.Presentation(local_path);
+    let md_content = "";
+    let slide_num = 0;
+    for (const slide of presentation.slides) {
+      slide_num += 1;
+      md_content += `\n\n<!-- Slide number: ${slide_num} -->\n`;
+
+      const title = slide.shapes.title;
+      for (const shape of slide.shapes) {
+        if (this._is_picture(shape)) {
+          let alt_text = "";
+          try {
+            alt_text = shape._element._nvXxPr.cNvPr.attrib.descr || "";
+          } catch (e) {}
+
+          const filename = shape.name.replace(/\W/g, "") + ".jpg";
+          md_content += `\n![${alt_text || shape.name}](${filename})\n`;
+        }
+
+        if (this._is_table(shape)) {
+          // Rebuild the table as HTML, then reuse the HTML-to-Markdown path.
+          let html_table = "<html><body><table>";
+          let first_row = true;
+          for (const row of shape.table.rows) {
+            html_table += "<tr>";
+            for (const cell of row.cells) {
+              if (first_row) {
+                html_table += `<th>${escapeHtml(cell.text)}</th>`;
+              } else {
+                html_table += `<td>${escapeHtml(cell.text)}</td>`;
+              }
+            }
+            html_table += "</tr>";
+            first_row = false;
+          }
+          html_table += "</table></body></html>";
+          md_content += `\n${this._convert(html_table).text_content.trim()}\n`;
+        }
+
+        if (shape.has_chart) {
+          md_content += this._convert_chart_to_markdown(shape.chart);
+        } else if (shape.has_text_frame) {
+          if (shape === title) {
+            md_content += `# ${shape.text.trim()}\n`;
+          } else {
+            md_content += `${shape.text.trim()}\n`;
+          }
+        }
+      }
+
+      if (slide.has_notes_slide) {
+        md_content += "\n\n### Notes:\n";
+        const notes_frame = slide.notes_slide.notes_text_frame;
+        if (notes_frame) {
+          md_content += notes_frame.text.trim();
+        }
+      }
+    }
+
+    return new DocumentConverterResult(null, md_content.trim());
+  }
+
+  _is_picture(shape: any): boolean {
+    return shape.shape_type === pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE || (shape.shape_type === pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER && shape.image);
+  }
+
+  _is_table(shape: any): boolean {
+    return shape.shape_type === pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE;
+  }
+
+  _convert_chart_to_markdown(chart: any): string {
+    let md = "\n\n### Chart";
+    if (chart.has_title) {
+      md += `: ${chart.chart_title.text_frame.text}`;
+    }
+    md += "\n\n";
+    const data: any[] = [];
+    const category_names = chart.plots[0].categories.map((c: any) => c.label);
+    const series_names = chart.series.map((s: any) => s.name);
+    data.push(["Category", ...series_names]);
+
+    for (let i = 0; i < category_names.length; i++) {
+      const row = [category_names[i]];
+      for (const series of chart.series) {
+        row.push(series.values[i]);
+      }
+      data.push(row);
+    }
+
+    const markdown_table = data.map(row => `| ${row.join(" | ")} |`);
+    const header = markdown_table[0];
+    // A valid Markdown header separator needs one --- cell per column.
+    const separator = `| ${data[0].map(() => "---").join(" | ")} |`;
+    return md + [header, separator, ...markdown_table.slice(1)].join("\n");
+  }
+}
+
+abstract class MediaConverter extends DocumentConverter {
+  _get_metadata(local_path: string, exiftool_path?: string): any {
+    if (!exiftool_path) {
+      const which_exiftool = which.sync("exiftool", { nothrow: true });
+      if (which_exiftool) {
+        console.warn(`Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown constructor. E.g., md = new MarkItDown({ exiftool_path: "${which_exiftool}" })`);
+      }
+      return null;
+    } else {
+      try {
+        // execFileSync avoids shell interpolation issues with paths containing spaces.
+        const result = execFileSync(exiftool_path, ["-json", local_path]).toString();
+        return JSON.parse(result)[0];
+      } catch (e) {
+        return null;
+      }
+    }
+  }
+}
+
+class WavConverter extends MediaConverter {
+  convert(local_path: string, ...args: any[]): DocumentConverterResult | null {
+    const extension = path.extname(local_path).toLowerCase();
+    if (extension !== ".wav") {
+      return null;
+    }
+
+    let md_content = "";
+
+    const metadata = this._get_metadata(local_path, args[0]?.exiftool_path);
+    if (metadata) {
+      for (const f of ["Title", "Artist", "Author", "Band", "Album", "Genre", "Track", "DateTimeOriginal", "CreateDate", "Duration"]) {
+        if (metadata[f]) {
+          md_content += `${f}: ${metadata[f]}\n`;
+        }
+      }
+    }
+
+    if (sr) {
+      try {
+        const transcript = this._transcribe_audio(local_path);
+        md_content += `\n\n### Audio Transcript:\n${transcript || "[No speech detected]"}`;
+      } catch (e) {
+        md_content += "\n\n### Audio Transcript:\nError. 
Could not transcribe this audio."; + } + } + + return new DocumentConverterResult(null, md_content.trim()); + } + + _transcribe_audio(local_path: string): string { + const recognizer = new sr.Recognizer(); + const audio = recognizer.record(new sr.AudioFile(local_path)); + return recognizer.recognize_google(audio).trim(); + } +} + +class Mp3Converter extends WavConverter { + convert(local_path: string, ...args: any[]): DocumentConverterResult | null { + const extension = path.extname(local_path).toLowerCase(); + if (extension !== ".mp3") { + return null; + } + + let md_content = ""; + + const metadata = this._get_metadata(local_path, args[0]?.exiftool_path); + if (metadata) { + for (const f of ["Title", "Artist", "Author", "Band", "Album", "Genre", "Track", "DateTimeOriginal", "CreateDate", "Duration"]) { + if (metadata[f]) { + md_content += `${f}: ${metadata[f]}\n`; + } + } + } + + if (sr && pydub) { + const temp_path = path.join(os.tmpdir(), `${path.basename(local_path, ".mp3")}.wav`); + try { + const sound = pydub.AudioSegment.from_mp3(local_path); + sound.export(temp_path, { format: "wav" }); + + const transcript = this._transcribe_audio(temp_path).trim(); + md_content += `\n\n### Audio Transcript:\n${transcript || "[No speech detected]"}`; + } catch (e) { + md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."; + } finally { + fs.unlinkSync(temp_path); + } + } + + return new DocumentConverterResult(null, md_content.trim()); + } +} + +class ImageConverter extends MediaConverter { + convert(local_path: string, ...args: any[]): DocumentConverterResult | null { + const extension = path.extname(local_path).toLowerCase(); + if (![".jpg", ".jpeg", ".png"].includes(extension)) { + return null; + } + + let md_content = ""; + + const metadata = this._get_metadata(local_path, args[0]?.exiftool_path); + if (metadata) { + for (const f of ["ImageSize", "Title", "Caption", "Description", "Keywords", "Artist", "Author", "DateTimeOriginal", "CreateDate", "GPSPosition"]) { + if (metadata[f]) { + md_content += `${f}: ${metadata[f]}\n`; + } + } + } + + const llm_client = args[0]?.llm_client; + const llm_model = args[0]?.llm_model; + if (llm_client && llm_model) { + md_content += `\n# Description:\n${this._get_llm_description(local_path, extension, llm_client, llm_model, args[0]?.llm_prompt).trim()}\n`; + } + + return new DocumentConverterResult(null, md_content); + } + + _get_llm_description(local_path: string, extension: string, client: any, model: string, prompt: string = "Write a detailed caption for this image."): string { + const content_type = mime.lookup(extension) || "image/jpeg"; + const image_base64 = fs.readFileSync(local_path, 'base64'); + const data_uri = `data:${content_type};base64,${image_base64}`; + + const messages = [ + { + role: "user", + content: [ + { type: "text", text: prompt }, + { type: "image_url", image_url: { url: data_uri } } + ] + } + ]; + + const response = client.chat.completions.create({ model, messages }); + return response.choices[0].message.content; + } +} + +class OutlookMsgConverter extends DocumentConverter { + convert(local_path: string, ...args: any[]): DocumentConverterResult | null { + const extension = path.extname(local_path).toLowerCase(); + if (extension !== ".msg") { + return null; + } + + try { + const msg = new olefile.OleFileIO(local_path); + let md_content = "# Email Message\n\n"; + + const headers = { + "From": this._get_stream_data(msg, "__substg1.0_0C1F001F"), + "To": this._get_stream_data(msg, "__substg1.0_0E04001F"), + 
"Subject": this._get_stream_data(msg, "__substg1.0_0037001F") + }; + + for (const key in headers) { + if (headers[key]) { + md_content += `**${key}:** ${headers[key]}\n`; + } + } + + md_content += "\n## Content\n\n"; + const body = this._get_stream_data(msg, "__substg1.0_1000001F"); + if (body) { + md_content += body; + } + + msg.close(); + return new DocumentConverterResult(headers.Subject, md_content.trim()); + } catch (e) { + throw new Error(`Could not convert MSG file '${local_path}': ${e.message}`); + } + } + + _get_stream_data(msg: any, stream_path: string): string | null { + try { + if (msg.exists(stream_path)) { + const data = msg.openstream(stream_path).read(); + try { + return data.toString("utf-16le").trim(); + } catch (e) { + try { + return data.toString("utf-8").trim(); + } catch (e) { + return data.toString("utf-8", { errors: "ignore" }).trim(); + } + } + } + } catch (e) {} + return null; + } +} + +class ZipConverter extends DocumentConverter { + convert(local_path: string, ...args: any[]): DocumentConverterResult | null { + const extension = path.extname(local_path).toLowerCase(); + if (extension !== ".zip") { + return null; + } + + const parent_converters = args[0]?._parent_converters || []; + if (!parent_converters.length) { + return new DocumentConverterResult(null, `[ERROR] No converters available to process zip contents from: ${local_path}`); + } + + const extracted_zip_folder_name = `extracted_${path.basename(local_path, ".zip")}_zip`; + const extraction_dir = path.join(path.dirname(local_path), extracted_zip_folder_name); + let md_content = `Content from the zip file \`${path.basename(local_path)}\`:\n\n`; + + try { + const zip = new AdmZip(local_path); + zip.extractAllTo(extraction_dir, true); + + for (const file of zip.getEntries()) { + const file_path = path.join(extraction_dir, file.entryName); + const relative_path = path.relative(extraction_dir, file_path); + const file_extension = path.extname(file_path); + + const file_args = { ...args[0], file_extension, _parent_converters: parent_converters }; + for (const converter of parent_converters) { + if (converter instanceof ZipConverter) { + continue; + } + + const result = converter.convert(file_path, file_args); + if (result) { + md_content += `\n## File: ${relative_path}\n\n${result.text_content}\n\n`; + break; + } + } + } + + if (args[0]?.cleanup_extracted !== false) { + fs.rmdirSync(extraction_dir, { recursive: true }); + } + + return new DocumentConverterResult(null, md_content.trim()); + } catch (e) { + return new DocumentConverterResult(null, `[ERROR] Failed to process zip file ${local_path}: ${e.message}`); + } + } +} + +class MarkItDown { + private _requests_session: any; + private _llm_client: any; + private _llm_model: string | null; + private _style_map: string | null; + private _exiftool_path: string | null; + private _page_converters: DocumentConverter[]; + + constructor(options: any = {}) { + this._requests_session = options.requests_session || new requests.Session(); + this._llm_client = options.llm_client || null; + this._llm_model = options.llm_model || null; + this._style_map = options.style_map || null; + this._exiftool_path = options.exiftool_path || process.env.EXIFTOOL_PATH || null; + + this._page_converters = []; + + this.register_page_converter(new PlainTextConverter()); + this.register_page_converter(new HtmlConverter()); + this.register_page_converter(new RSSConverter()); + this.register_page_converter(new WikipediaConverter()); + this.register_page_converter(new YouTubeConverter()); + 
this.register_page_converter(new BingSerpConverter());
+    this.register_page_converter(new DocxConverter());
+    this.register_page_converter(new XlsxConverter());
+    this.register_page_converter(new XlsConverter());
+    this.register_page_converter(new PptxConverter());
+    this.register_page_converter(new WavConverter());
+    this.register_page_converter(new Mp3Converter());
+    this.register_page_converter(new ImageConverter());
+    this.register_page_converter(new IpynbConverter());
+    this.register_page_converter(new PdfConverter());
+    this.register_page_converter(new ZipConverter());
+    this.register_page_converter(new OutlookMsgConverter());
+  }
+
+  convert(source: string | requests.Response, ...args: any[]): DocumentConverterResult | Promise<DocumentConverterResult> {
+    if (typeof source === "string") {
+      if (/^https?:\/\//.test(source) || /^file:\/\//.test(source)) {
+        return this.convert_url(source, ...args);
+      }
+      return this.convert_local(source, ...args);
+    }
+    // Anything else is treated as an already-fetched HTTP response.
+    return this.convert_response(source, ...args);
+  }
+
+  convert_local(local_path: string, ...args: any[]): DocumentConverterResult {
+    // Collect candidate extensions in priority order: the explicit hint, then
+    // the path's own extension, then magic-number guesses.
+    const extensions: (string | null)[] = [];
+    if (args[0]?.file_extension) {
+      extensions.push(args[0].file_extension);
+    }
+
+    const ext = path.extname(local_path);
+    if (ext) {
+      extensions.push(ext);
+    }
+
+    for (const g of this._guess_ext_magic(local_path)) {
+      extensions.push(g);
+    }
+
+    return this._convert(local_path, extensions, ...args);
+  }
+
+  convert_stream(content: string | Buffer, ...args: any[]): DocumentConverterResult {
+    const extensions: (string | null)[] = [];
+    if (args[0]?.file_extension) {
+      extensions.push(args[0].file_extension);
+    }
+
+    // Spool the content to a temporary file so converters can work from a path.
+    const temp_path = path.join(os.tmpdir(), `temp_${Date.now()}`);
+    fs.writeFileSync(temp_path, content);
+
+    try {
+      for (const g of this._guess_ext_magic(temp_path)) {
+        extensions.push(g);
+      }
+      return this._convert(temp_path, extensions, ...args);
+    } finally {
+      fs.unlinkSync(temp_path);
+    }
+  }
+
+  async convert_url(url: string, ...args: any[]): Promise<DocumentConverterResult> {
+    const response = await this._requests_session.get(url, { responseType: 'stream' });
+    return this.convert_response(response, ...args);
+  }
+
+  convert_response(response: any, ...args: any[]): Promise<DocumentConverterResult> {
+    const extensions: (string | null)[] = [];
+    if (args[0]?.file_extension) {
+      extensions.push(args[0].file_extension);
+    }
+
+    // Guess from the mimetype.
+    const content_type = (response.headers['content-type'] || "").split(";")[0];
+    const ext = mime.extension(content_type);
+    if (ext) {
+      extensions.push(`.${ext}`);
+    }
+
+    // Read the content disposition if there is one.
+    const content_disposition = response.headers['content-disposition'] || "";
+    const filename_match = content_disposition.match(/filename=([^;]+)/);
+    if (filename_match) {
+      const filename_ext = path.extname(filename_match[1].replace(/['"]/g, ""));
+      if (filename_ext) {
+        extensions.push(filename_ext);
+      }
+    }
+
+    // Fall back to the extension on the URL path.
+    const url_ext = path.extname(new URL(response.url).pathname);
+    if (url_ext) {
+      extensions.push(url_ext);
+    }
+
+    // Save the body to a temporary file; it is deleted once conversion finishes.
+    const temp_path = path.join(os.tmpdir(), `temp_${Date.now()}`);
+    const writer = fs.createWriteStream(temp_path);
+    response.data.pipe(writer);
+
+    return new Promise((resolve, reject) => {
+      writer.on('finish', () => {
+        try {
+          for (const g of this._guess_ext_magic(temp_path)) {
+            extensions.push(g);
+          }
+          resolve(this._convert(temp_path, extensions, ...args));
+        } catch (e) {
+          reject(e);
+        } finally {
+          fs.unlinkSync(temp_path);
+        }
+      });
+      writer.on('error', reject);
+    });
+  }
+
+  _convert(local_path: string, extensions: (string | null)[], ...args: any[]): DocumentConverterResult {
+    // Try each candidate extension in priority order, ending with no extension.
+    for (const ext of [...extensions, null]) {
+      for (const converter of this._page_converters) {
+        const file_args = { ...args[0],
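+          // Override the caller's file_extension with the candidate being tried.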
file_extension: ext };
+        try {
+          const result = converter.convert(local_path, file_args);
+          if (result) {
+            // Normalize: strip trailing spaces and collapse runs of blank lines.
+            result.text_content = result.text_content.split(/\r?\n/).map(line => line.trimEnd()).join("\n");
+            result.text_content = result.text_content.replace(/\n{3,}/g, "\n\n");
+            return result;
+          }
+        } catch (e) {
+          // Log and keep trying the remaining converters.
+          console.error(e);
+        }
+      }
+    }
+
+    throw new Error(`Could not convert '${local_path}' to Markdown. The formats ${extensions} are not supported.`);
+  }
+
+  _guess_ext_magic(local_path: string): string[] {
+    // Use magic numbers to guess the file's extension from its leading bytes.
+    try {
+      const guesses = puremagic.magic_file(local_path);
+      if (!guesses.length) {
+        // Retry after trimming leading ASCII whitespace
+        // (see https://github.com/microsoft/markitdown/issues/222).
+        const file = fs.createReadStream(local_path);
+        const trimmed_file = file.pipe(new stream.Transform({
+          transform(chunk, encoding, callback) {
+            this.push(chunk.toString().trimStart());
+            callback();
+          }
+        }));
+        guesses.push(...puremagic.magic_stream(trimmed_file));
+      }
+
+      const extensions: string[] = [];
+      for (const g of guesses) {
+        const ext = g.extension.startsWith(".") ? g.extension : `.${g.extension}`;
+        if (!extensions.includes(ext)) {
+          extensions.push(ext);
+        }
+      }
+      return extensions;
+    } catch (e) {
+      return [];
+    }
+  }
+
+  register_page_converter(converter: DocumentConverter): void {
+    // Register a page text converter; later registrations take precedence.
+    this._page_converters.unshift(converter);
+  }
+}
diff --git a/src/markitdown/py.typed b/src/markitdown/py.typed
deleted file mode 100644
index e69de29..0000000
diff --git a/tsconfig.json b/tsconfig.json
new file mode 100644
index 0000000..2ea1a0d
--- /dev/null
+++ b/tsconfig.json
@@ -0,0 +1,14 @@
+{
+  "compilerOptions": {
+    "target": "ES6",
+    "module": "commonjs",
+    "strict": true,
+    "esModuleInterop": true,
+    "skipLibCheck": true,
+    "forceConsistentCasingInFileNames": true,
+    "outDir": "./dist",
+    "rootDir": "./src"
+  },
+  "include": ["src/**/*.ts"],
+  "exclude": ["node_modules", "**/*.spec.ts"]
+}
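
As a minimal sketch of how the registry is meant to be extended via `register_page_converter` (the `CsvConverter` below is a hypothetical example, not part of this patch, and assumes `DocumentConverter` and `DocumentConverterResult` are exported from the package entry point):

```typescript
import * as fs from 'fs';
import * as path from 'path';
import { MarkItDown, DocumentConverter, DocumentConverterResult } from 'markitdown';

// Hypothetical converter: renders a .csv file as a Markdown pipe table.
class CsvConverter extends DocumentConverter {
  convert(local_path: string, ...args: any[]): DocumentConverterResult | null {
    if (path.extname(local_path).toLowerCase() !== '.csv') {
      return null; // Returning null lets the next registered converter try.
    }
    const rows = fs.readFileSync(local_path, 'utf-8').trim().split(/\r?\n/).map(r => r.split(','));
    const header = `| ${rows[0].join(' | ')} |`;
    const separator = `| ${rows[0].map(() => '---').join(' | ')} |`;
    const body = rows.slice(1).map(r => `| ${r.join(' | ')} |`);
    return new DocumentConverterResult(null, [header, separator, ...body].join('\n'));
  }
}

const md = new MarkItDown();
// register_page_converter() prepends, so custom converters run before the built-ins.
md.register_page_converter(new CsvConverter());
Promise.resolve(md.convert('data.csv')).then(result => console.log(result.text_content));
```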