Merge branch 'main' into wangsrGit119-patch-1

2024-12-23 10:54:52 +08:00 · 2024-12-23 10:54:52 +08:00 · 5a3ca479f1
commit 5a3ca479f1
parent 02cc0cef84 125e206047
25 changed files with 936 additions and 119 deletions
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@ -0,0 +1,32 @@
 // For format details, see https://aka.ms/devcontainer.json. For config options, see the
 // README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile
 {
 	"name": "Existing Dockerfile",
 	"build": {
 		// Sets the run context to one level up instead of the .devcontainer folder.
 		"context": "..",
 		// Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
 		"dockerfile": "../Dockerfile",
 		"args": {
 			"INSTALL_GIT": "true"
 		}
 	},
 	// Features to add to the dev container. More info: https://containers.dev/features.
 	// "features": {},
 	"features": {
 		"ghcr.io/devcontainers-extra/features/hatch:2": {}
 	},
 	// Use 'forwardPorts' to make a list of ports inside the container available locally.
 	// "forwardPorts": [],
 	// Uncomment the next line to run commands after the container is created.
 	// "postCreateCommand": "cat /etc/os-release",
 	// Configure tool-specific properties.
 	// "customizations": {},
 	// Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root.
 	"remoteUser": "root"
 }
--- a/.dockerignore
+++ b/.dockerignore
@ -0,0 +1 @@
 *
--- a/.gitattributes
+++ b/.gitattributes
@ -0,0 +1 @@
 tests/test_files/** linguist-vendored
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@ -0,0 +1,6 @@
 version: 2
 updates:
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "weekly"
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@ -5,9 +5,9 @@ jobs:
  pre-commit:
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
      - name: Set up Python
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v5
        with:
          python-version: "3.x"
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@ -5,8 +5,8 @@ jobs:
  tests:
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
-      - uses: actions/setup-python@v4
+      - uses: actions/setup-python@v5
        with:
          python-version: |
            3.10
@ -14,7 +14,7 @@ jobs:
            3.12
      - name: Set up pip cache
        if: runner.os == 'Linux'
-        uses: actions/cache@v3
+        uses: actions/cache@v4
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ hashFiles('pyproject.toml') }}
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,5 @@
 .vscode
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@ -160,3 +162,5 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
 src/.DS_Store
 .DS_Store
--- a/23
+++ b/23
@ -0,0 +1,23 @@
 FROM python:3.13-slim-bullseye
 USER root
 ARG INSTALL_GIT=false
 RUN if [ "$INSTALL_GIT" = "true" ]; then \
    apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*; \
    fi
 # Runtime dependency
 RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    && rm -rf /var/lib/apt/lists/*
 RUN pip install markitdown
 # Default USERID and GROUPID
 ARG USERID=10000
 ARG GROUPID=10000
 USER $USERID:$GROUPID
 ENTRYPOINT [ "markitdown" ]
--- a/README.md
+++ b/README.md
@ -1,56 +1,114 @@
 > [!IMPORTANT]
 > (12/19/24) Hello! MarkItDown team members will be resting and recharging with family and friends over the holiday period. Activity/responses on the project may be delayed during the period of Dec 21-Jan 06. We will be excited to engage with you in the new year!
 # MarkItDown
-The MarkItDown library is a utility tool for converting various files to Markdown (e.g., for indexing, text analysis, etc.)
+[![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/)
 ![PyPI - Downloads](https://img.shields.io/pypi/dd/markitdown)
 [![Built by AutoGen Team](https://img.shields.io/badge/Built%20by-AutoGen%20Team-blue)](https://github.com/microsoft/autogen)
 It presently supports:
- PDF (.pdf)
+MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc).
- PowerPoint (.pptx)
+It supports:
- Word (.docx)
+- PDF
- Excel (.xlsx)
+- PowerPoint
- Images (EXIF metadata, and OCR)
+- Word
- Audio (EXIF metadata, and speech transcription)
+- Excel
- HTML (special handling of Wikipedia, etc.)
+- Images (EXIF metadata and OCR)
- Various other text-based formats (csv, json, xml, etc.)
+- Audio (EXIF metadata and speech transcription)
 - HTML
 - Text-based formats (CSV, JSON, XML)
 - ZIP files (iterates over contents)
-# Installation
+To install MarkItDown, use pip: `pip install markitdown`. Alternatively, you can install it from the source: `pip install -e .`
-You can install `markitdown` using pip:
+## Usage
-```python
+### Command-Line
-pip install markitdown
+
 ```bash
 markitdown path-to-file.pdf > document.md
 ```
-or from the source
+Or use `-o` to specify the output file:
-```sh
+```bash
-pip install -e .
+markitdown path-to-file.pdf -o document.md
 ```
 You can also pipe content:
-# Usage
+```bash
-The API is simple:
+cat path-to-file.pdf | markitdown
 ```
 ### Python API
 Basic usage in Python:
 ```python
 from markitdown import MarkItDown
-markitdown = MarkItDown()
+md = MarkItDown()
-result = markitdown.convert("test.xlsx")
+result = md.convert("test.xlsx")
 print(result.text_content)
 ```
-You can also configure markitdown to use Large Language Models to describe images. To do so you must provide mlm_client and mlm_model parameters to MarkItDown object, according to your specific client.
+To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:
 ```python
 from markitdown import MarkItDown
 from openai import OpenAI
 client = OpenAI()
-md = MarkItDown(mlm_client=client, mlm_model="gpt-4o")
+md = MarkItDown(llm_client=client, llm_model="gpt-4o")
 result = md.convert("example.jpg")
 print(result.text_content)
 ```
 ### Docker
 ```sh
 docker build -t markitdown:latest .
 docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
 ```
 <details>
 <summary>Batch Processing Multiple Files</summary>
 This example shows how to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files.
 ```python convert.py
 from markitdown import MarkItDown
 from openai import OpenAI
 import os
 client = OpenAI(api_key="your-api-key-here")
 md = MarkItDown(llm_client=client, llm_model="gpt-4o-2024-11-20")
 supported_extensions = ('.pptx', '.docx', '.pdf', '.jpg', '.jpeg', '.png')
 files_to_convert = [f for f in os.listdir('.') if f.lower().endswith(supported_extensions)]
 for file in files_to_convert:
    print(f"\nConverting {file}...")
    try:
        md_file = os.path.splitext(file)[0] + '.md'
        result = md.convert(file)
        with open(md_file, 'w') as f:
            f.write(result.text_content)
        print(f"Successfully converted {file} to {md_file}")
    except Exception as e:
        print(f"Error converting {file}: {str(e)}")
 print("\nAll conversions completed!")
 ```
 1. Place the script in the same directory as your files
 2. Install the required packages (e.g., `openai`)
 3. Run the script: `python convert.py`
 Note that original files will remain unchanged and new markdown files are created with the same base name.
 </details>
 ## Contributing
 This project welcomes contributions and suggestions.  Most contributions require you to agree to a
@ -65,21 +123,37 @@ This project has adopted the [Microsoft Open Source Code of Conduct](https://ope
 For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
 contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
-### Running Tests
+### How to Contribute
-To run the tests for this project, use the following command:
+You can help by looking at issues or helping review PRs. Any issue or PR is welcome, but we have also marked some as 'open for contribution' and 'open for reviewing' to help facilitate community contributions. These are of course just suggestions and you are welcome to contribute in any way you like.
 <div align="center">
 |                       | All                                      | Especially Needs Help from Community                                                                 |
 |-----------------------|------------------------------------------|------------------------------------------------------------------------------------------|
 | **Issues**            | [All Issues](https://github.com/microsoft/markitdown/issues) | [Issues open for contribution](https://github.com/microsoft/markitdown/issues?q=is%3Aissue+is%3Aopen+label%3A%22open+for+contribution%22) |
 | **PRs**               | [All PRs](https://github.com/microsoft/markitdown/pulls)     | [PRs open for reviewing](https://github.com/microsoft/markitdown/pulls?q=is%3Apr+is%3Aopen+label%3A%22open+for+reviewing%22)               |
 </div>
 ### Running Tests and Checks
 - Install `hatch` in your environment and run tests:
    ```sh
    pip install hatch  # Other ways of installing hatch: https://hatch.pypa.io/dev/install/
    hatch shell
    hatch test
    ```
-### Running Pre-commit Checks
+  (Alternative) Use the Devcontainer which has all the dependencies installed:
    ```sh
-pre-commit run --all-files
+    # Reopen the project in Devcontainer and run:
    hatch test
    ```
 - Run pre-commit checks before submitting a PR: `pre-commit run --all-files`
 ## Trademarks
 This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
--- a/pyproject.toml
+++ b/pyproject.toml
@ -5,7 +5,7 @@ build-backend = "hatchling.build"
 [project]
 name = "markitdown"
 dynamic = ["version"]
-description = ''
+description = 'Utility tool for converting various files to Markdown'
 readme = "README.md"
 requires-python = ">=3.10"
 license = "MIT"
@ -38,6 +38,8 @@ dependencies = [
  "youtube-transcript-api",
  "SpeechRecognition",
  "pathvalidate",
  "charset-normalizer",
  "openai",
 ]
 [project.urls]
@ -76,3 +78,6 @@ exclude_lines = [
  "if __name__ == .__main__.:",
  "if TYPE_CHECKING:",
 ]
 [tool.hatch.build.targets.sdist]
 only-include = ["src/markitdown"]
--- a/src/markitdown/about.py
+++ b/src/markitdown/about.py
@ -1,4 +1,4 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.0.1a1"
+__version__ = "0.0.1a3"
--- a/src/markitdown/main.py
+++ b/src/markitdown/main.py
@ -1,21 +1,19 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
 import argparse
 import sys
-from ._markitdown import MarkItDown
+from textwrap import dedent
 from .__about__ import __version__
 from ._markitdown import MarkItDown, DocumentConverterResult
 def main():
-    if len(sys.argv) == 1:
+    parser = argparse.ArgumentParser(
-        markitdown = MarkItDown()
+        description="Convert various file formats to markdown.",
-        result = markitdown.convert_stream(sys.stdin.buffer)
+        prog="markitdown",
-        print(result.text_content)
+        formatter_class=argparse.RawDescriptionHelpFormatter,
-    elif len(sys.argv) == 2:
+        usage=dedent(
        markitdown = MarkItDown()
        result = markitdown.convert(sys.argv[1])
        print(result.text_content)
    else:
        sys.stderr.write(
            """
            SYNTAX:
@ -33,10 +31,52 @@ EXAMPLE:
                OR
                markitdown < example.pdf
-""".strip()
+                
-            + "\n"
+                OR to save to a file use
                markitdown example.pdf -o example.md
                OR
                markitdown example.pdf > example.md
            """
        ).strip(),
    )
    parser.add_argument(
        "-v",
        "--version",
        action="version",
        version=f"%(prog)s {__version__}",
        help="show the version number and exit",
    )
    parser.add_argument("filename", nargs="?")
    parser.add_argument(
        "-o",
        "--output",
        help="Output file name. If not provided, output is written to stdout.",
    )
    args = parser.parse_args()
    if args.filename is None:
        markitdown = MarkItDown()
        result = markitdown.convert_stream(sys.stdin.buffer)
        _handle_output(args, result)
    else:
        markitdown = MarkItDown()
        result = markitdown.convert(args.filename)
        _handle_output(args, result)
 def _handle_output(args, result: DocumentConverterResult):
    """Emit the converted text: to the file named by ``args.output`` when it
    is set, otherwise to stdout."""
    if not args.output:
        print(result.text_content)
        return
    with open(args.output, "w", encoding="utf-8") as out_file:
        out_file.write(result.text_content)
 if __name__ == "__main__":
    main()
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@ -12,8 +12,12 @@ import subprocess
 import sys
 import tempfile
 import traceback
 import zipfile
 from xml.dom import minidom
 from typing import Any, Dict, List, Optional, Union
 from pathlib import Path
 from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
 from warnings import warn, resetwarnings, catch_warnings
 import mammoth
 import markdownify
@ -26,15 +30,24 @@ import pptx
 import puremagic
 import requests
 from bs4 import BeautifulSoup
 from charset_normalizer import from_path
 # Optional Transcription support
 try:
    # Using warnings' catch_warnings to catch
    # pydub's warning of ffmpeg or avconv missing
    with catch_warnings(record=True) as w:
        import pydub
        if w:
            raise ModuleNotFoundError
    import speech_recognition as sr
    IS_AUDIO_TRANSCRIPTION_CAPABLE = True
 except ModuleNotFoundError:
    pass
 finally:
    resetwarnings()
 # Optional YouTube transcription support
 try:
@ -161,9 +174,7 @@ class PlainTextConverter(DocumentConverter):
        elif "text/" not in content_type.lower():
            return None
-        text_content = ""
+        text_content = str(from_path(local_path).best())
        with open(local_path, "rt", encoding="utf-8") as fh:
            text_content = fh.read()
        return DocumentConverterResult(
            title=None,
            text_content=text_content,
@ -213,6 +224,143 @@ class HtmlConverter(DocumentConverter):
        )
 class RSSConverter(DocumentConverter):
    """Convert RSS / Atom type to markdown.

    A file is handled only when its extension is ``.xml``, ``.rss`` or
    ``.atom`` and the parsed document contains an ``<rss>`` element, or a
    ``<feed>`` element with at least one ``<entry>``.
    """

    def convert(
        self, local_path: str, **kwargs
    ) -> Union[None, DocumentConverterResult]:
        """Convert the feed stored at ``local_path``.

        Returns None when the ``file_extension`` keyword is not a feed
        extension, when the file cannot be parsed as XML, or when the
        document is neither RSS nor Atom.
        """
        # Bail if not RSS type
        extension = kwargs.get("file_extension", "")
        if extension.lower() not in [".xml", ".rss", ".atom"]:
            return None
        try:
            doc = minidom.parse(local_path)
        except Exception:
            # Unreadable or malformed XML. Only Exception is caught so that
            # KeyboardInterrupt / SystemExit still propagate.
            return None
        if doc.getElementsByTagName("rss"):
            # A RSS feed must have a root element of <rss>
            return self._parse_rss_type(doc)
        feeds = doc.getElementsByTagName("feed")
        if feeds and feeds[0].getElementsByTagName("entry"):
            # An Atom feed must have a root element of <feed> and at least one <entry>
            return self._parse_atom_type(doc)
        # not rss or atom
        return None

    def _parse_atom_type(
        self, doc: minidom.Document
    ) -> Union[None, DocumentConverterResult]:
        """Render an Atom feed as markdown.

        Returns None if the feed type is not recognized or something goes wrong.
        """
        try:
            root = doc.getElementsByTagName("feed")[0]
            title = self._get_data_by_tag_name(root, "title")
            subtitle = self._get_data_by_tag_name(root, "subtitle")
            md_text = ""
            # Only emit the heading when a title exists; formatting None
            # would produce a literal "# None" heading.
            if title:
                md_text += f"# {title}\n"
            if subtitle:
                md_text += f"{subtitle}\n"
            for entry in root.getElementsByTagName("entry"):
                entry_title = self._get_data_by_tag_name(entry, "title")
                entry_summary = self._get_data_by_tag_name(entry, "summary")
                entry_updated = self._get_data_by_tag_name(entry, "updated")
                entry_content = self._get_data_by_tag_name(entry, "content")
                if entry_title:
                    md_text += f"\n## {entry_title}\n"
                if entry_updated:
                    md_text += f"Updated on: {entry_updated}\n"
                if entry_summary:
                    md_text += self._parse_content(entry_summary)
                if entry_content:
                    md_text += self._parse_content(entry_content)
            return DocumentConverterResult(
                title=title,
                text_content=md_text,
            )
        except Exception:
            return None

    def _parse_rss_type(
        self, doc: minidom.Document
    ) -> Union[None, DocumentConverterResult]:
        """Render an RSS feed as markdown.

        Returns None if the feed has no <channel> or something goes wrong.
        """
        try:
            root = doc.getElementsByTagName("rss")[0]
            channels = root.getElementsByTagName("channel")
            if not channels:
                return None
            channel = channels[0]
            channel_title = self._get_data_by_tag_name(channel, "title")
            channel_description = self._get_data_by_tag_name(channel, "description")
            # Start from an empty string so a channel without a <title> can
            # still be rendered instead of failing on an unbound variable.
            md_text = ""
            if channel_title:
                md_text += f"# {channel_title}\n"
            if channel_description:
                md_text += f"{channel_description}\n"
            for item in channel.getElementsByTagName("item"):
                title = self._get_data_by_tag_name(item, "title")
                description = self._get_data_by_tag_name(item, "description")
                pubDate = self._get_data_by_tag_name(item, "pubDate")
                content = self._get_data_by_tag_name(item, "content:encoded")
                if title:
                    md_text += f"\n## {title}\n"
                if pubDate:
                    md_text += f"Published on: {pubDate}\n"
                if description:
                    md_text += self._parse_content(description)
                if content:
                    md_text += self._parse_content(content)
            return DocumentConverterResult(
                title=channel_title,
                text_content=md_text,
            )
        except Exception:
            # Report on stderr: stdout is where converted markdown is
            # written, so diagnostics must not be mixed into it.
            traceback.print_exc()
            return None

    def _parse_content(self, content: str) -> str:
        """Convert HTML-styled feed content to markdown.

        Falls back to returning ``content`` unchanged if the conversion fails.
        """
        try:
            # using bs4 because many RSS feeds have HTML-styled content
            soup = BeautifulSoup(content, "html.parser")
            return _CustomMarkdownify().convert_soup(soup)
        except Exception:
            return content

    def _get_data_by_tag_name(
        self, element: minidom.Element, tag_name: str
    ) -> Union[str, None]:
        """Get the text of the first descendant element with the given tag name.

        The text and CDATA children of that element are concatenated, so a
        value written as whitespace followed by a CDATA section is returned
        in full. Returns None when no such element is found or when it has
        no text/CDATA children.
        """
        nodes = element.getElementsByTagName(tag_name)
        if not nodes:
            return None
        text_parts = [
            child.data
            for child in nodes[0].childNodes
            if child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE)
        ]
        if not text_parts:
            return None
        return "".join(text_parts)
 class WikipediaConverter(DocumentConverter):
    """Handle Wikipedia pages separately, focusing only on the main document content."""
@ -344,8 +492,11 @@ class YouTubeConverter(DocumentConverter):
                assert isinstance(params["v"][0], str)
                video_id = str(params["v"][0])
                try:
                    youtube_transcript_languages = kwargs.get(
                        "youtube_transcript_languages", ("en",)
                    )
                    # Must be a single transcript.
-                    transcript = YouTubeTranscriptApi.get_transcript(video_id)  # type: ignore
+                    transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=youtube_transcript_languages)  # type: ignore
                    transcript_text = " ".join([part["text"] for part in transcript])  # type: ignore
                    # Alternative formatting:
                    # formatter = TextFormatter()
@ -391,6 +542,67 @@ class YouTubeConverter(DocumentConverter):
        return None
 class IpynbConverter(DocumentConverter):
    """Converts Jupyter Notebook (.ipynb) files to Markdown."""

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """Convert the notebook at ``local_path``.

        Returns None when the ``file_extension`` keyword is not ``.ipynb``.
        Raises FileConversionException when the notebook content cannot be
        converted.
        """
        # Bail if not ipynb
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".ipynb":
            return None
        # Parse and convert the notebook
        with open(local_path, "rt", encoding="utf-8") as fh:
            notebook_content = json.load(fh)
        return self._convert(notebook_content)

    def _convert(self, notebook_content: dict) -> Union[None, DocumentConverterResult]:
        """Helper function that converts notebook JSON content to Markdown.

        Markdown cells are emitted as-is, code cells are wrapped in
        ```python fences and raw cells in plain fences. The title is the
        notebook metadata ``title`` if present, otherwise the first ``# ``
        heading found in a markdown cell.
        """
        try:
            md_output = []
            title = None
            for cell in notebook_content.get("cells", []):
                cell_type = cell.get("cell_type", "")
                source = cell.get("source", [])
                # A cell's source may be stored as one multi-line string
                # instead of a list of lines; split it so the heading scan
                # below sees lines rather than single characters.
                if isinstance(source, str):
                    source_lines = source.splitlines(keepends=True)
                else:
                    source_lines = source
                source_text = "".join(source_lines)
                if cell_type == "markdown":
                    md_output.append(source_text)
                    # Extract the first # heading as title if not already found
                    if title is None:
                        for line in source_lines:
                            if line.startswith("# "):
                                # Drop only the "# " marker. str.lstrip("# ")
                                # strips a character set and would also eat
                                # leading '#'/' ' characters of the title.
                                title = line[2:].strip()
                                break
                elif cell_type == "code":
                    # Code cells are wrapped in Markdown code blocks
                    md_output.append(f"```python\n{source_text}\n```")
                elif cell_type == "raw":
                    md_output.append(f"```\n{source_text}\n```")
            md_text = "\n\n".join(md_output)
            # A title in the notebook metadata takes precedence over a heading
            title = notebook_content.get("metadata", {}).get("title", title)
            return DocumentConverterResult(
                title=title,
                text_content=md_text,
            )
        except Exception as e:
            raise FileConversionException(
                f"Error converting .ipynb file: {str(e)}"
            ) from e
 class BingSerpConverter(DocumentConverter):
    """
    Handle Bing results pages (only the organic search results).
@ -492,7 +704,9 @@ class DocxConverter(HtmlConverter):
        result = None
        with open(local_path, "rb") as docx_file:
-            result = mammoth.convert_to_html(docx_file)
+            style_map = kwargs.get("style_map", None)
            result = mammoth.convert_to_html(docx_file, style_map=style_map)
            html_content = result.value
            result = self._convert(html_content)
@ -582,6 +796,10 @@ class PptxConverter(HtmlConverter):
                        "\n" + self._convert(html_table).text_content.strip() + "\n"
                    )
                # Charts
                if shape.has_chart:
                    md_content += self._convert_chart_to_markdown(shape.chart)
                # Text areas
                elif shape.has_text_frame:
                    if shape == title:
@ -616,6 +834,29 @@ class PptxConverter(HtmlConverter):
            return True
        return False
    def _convert_chart_to_markdown(self, chart):
        md = "\n\n### Chart"
        if chart.has_title:
            md += f": {chart.chart_title.text_frame.text}"
        md += "\n\n"
        data = []
        category_names = [c.label for c in chart.plots[0].categories]
        series_names = [s.name for s in chart.series]
        data.append(["Category"] + series_names)
        for idx, category in enumerate(category_names):
            row = [category]
            for series in chart.series:
                row.append(series.values[idx])
            data.append(row)
        markdown_table = []
        for row in data:
            markdown_table.append("| " + " | ".join(map(str, row)) + " |")
        header = markdown_table[0]
        separator = "|" + "|".join(["---"] * len(data[0])) + "|"
        return md + "\n".join([header, separator] + markdown_table[1:])
 class MediaConverter(DocumentConverter):
    """
@ -642,7 +883,7 @@ class WavConverter(MediaConverter):
    """
    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
-        # Bail if not a XLSX
+        # Bail if not a WAV
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".wav":
            return None
@ -754,11 +995,11 @@ class Mp3Converter(WavConverter):
 class ImageConverter(MediaConverter):
    """
-    Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an mlm_client is configured).
+    Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
    """
    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
-        # Bail if not a XLSX
+        # Bail if not an image
        extension = kwargs.get("file_extension", "")
        if extension.lower() not in [".jpg", ".jpeg", ".png"]:
            return None
@ -784,17 +1025,17 @@ class ImageConverter(MediaConverter):
                    md_content += f"{f}: {metadata[f]}\n"
        # Try describing the image with GPTV
-        mlm_client = kwargs.get("mlm_client")
+        llm_client = kwargs.get("llm_client")
-        mlm_model = kwargs.get("mlm_model")
+        llm_model = kwargs.get("llm_model")
-        if mlm_client is not None and mlm_model is not None:
+        if llm_client is not None and llm_model is not None:
            md_content += (
                "\n# Description:\n"
-                + self._get_mlm_description(
+                + self._get_llm_description(
                    local_path,
                    extension,
-                    mlm_client,
+                    llm_client,
-                    mlm_model,
+                    llm_model,
-                    prompt=kwargs.get("mlm_prompt"),
+                    prompt=kwargs.get("llm_prompt"),
                ).strip()
                + "\n"
            )
@ -818,12 +1059,10 @@ class ImageConverter(MediaConverter):
            text_content=md_content,
        )
-    def _get_mlm_description(self, local_path, extension, client, model, prompt=None):
+    def _get_llm_description(self, local_path, extension, client, model, prompt=None):
        if prompt is None or prompt.strip() == "":
            prompt = "Write a detailed caption for this image."
        sys.stderr.write(f"MLM Prompt:\n{prompt}\n")
        data_uri = ""
        with open(local_path, "rb") as image_file:
            content_type, encoding = mimetypes.guess_type("_dummy" + extension)
@ -851,6 +1090,135 @@ class ImageConverter(MediaConverter):
        return response.choices[0].message.content
 class ZipConverter(DocumentConverter):
    """Converts ZIP files to markdown by extracting and converting all contained files.

    The converter extracts the ZIP contents to an ``extracted_<name>`` directory
    next to the archive, processes each file using the converters passed in
    ``_parent_converters``, and then combines the results into a single markdown
    document. The extraction directory is removed afterwards unless
    ``cleanup_extracted=False`` is passed.

    Example output format:
    ```markdown
    Content from the zip file `example.zip`:

    ## File: docs/readme.txt

    This is the content of readme.txt
    Multiple lines are preserved

    ## File: images/example.jpg

    ImageSize: 1920x1080
    DateTimeOriginal: 2024-02-15 14:30:00
    Description: A beautiful landscape photo

    ## File: data/report.xlsx

    ## Sheet1
    | Column1 | Column2 | Column3 |
    |---------|---------|---------|
    | data1   | data2   | data3   |
    | data4   | data5   | data6   |
    ```

    Key features:
    - Maintains original file structure in headings
    - Processes files in nested directories (nested zip archives are skipped)
    - Uses appropriate converters for each file type
    - Preserves formatting of converted content
    - Cleans up extracted files after processing, including when processing fails
    """

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """Convert every file inside the zip at ``local_path``.

        Returns None when the ``file_extension`` keyword is not ``.zip``.
        Failures (no converters, bad archive, path traversal, processing
        errors) are reported as a result whose text starts with ``[ERROR]``.
        """
        # Bail if not a ZIP
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".zip":
            return None

        # Get parent converters list if available
        parent_converters = kwargs.get("_parent_converters", [])
        if not parent_converters:
            return DocumentConverterResult(
                title=None,
                text_content=f"[ERROR] No converters available to process zip contents from: {local_path}",
            )

        extracted_zip_folder_name = (
            f"extracted_{os.path.basename(local_path).replace('.zip', '_zip')}"
        )
        extraction_dir = os.path.normpath(
            os.path.join(os.path.dirname(local_path), extracted_zip_folder_name)
        )
        md_content = f"Content from the zip file `{os.path.basename(local_path)}`:\n\n"

        # Set once extraction starts, so cleanup never removes a directory
        # this call did not write into.
        extracted = False
        try:
            # Extract the zip file safely
            with zipfile.ZipFile(local_path, "r") as zipObj:
                # Safeguard against path traversal
                for member in zipObj.namelist():
                    if not self._is_within_directory(extraction_dir, member):
                        raise ValueError(
                            f"Path traversal detected in zip file: {member}"
                        )
                # Extract all files safely
                extracted = True
                zipObj.extractall(path=extraction_dir)

            # Process each extracted file
            for root, dirs, files in os.walk(extraction_dir):
                for name in files:
                    file_path = os.path.join(root, name)
                    relative_path = os.path.relpath(file_path, extraction_dir)
                    # Get file extension
                    _, file_extension = os.path.splitext(name)
                    # Update kwargs for the file
                    file_kwargs = kwargs.copy()
                    file_kwargs["file_extension"] = file_extension
                    file_kwargs["_parent_converters"] = parent_converters
                    # Try converting the file using available converters
                    for converter in parent_converters:
                        # Skip the zip converter to avoid infinite recursion
                        if isinstance(converter, ZipConverter):
                            continue
                        result = converter.convert(file_path, **file_kwargs)
                        if result is not None:
                            md_content += f"\n## File: {relative_path}\n\n"
                            md_content += result.text_content + "\n\n"
                            break

            return DocumentConverterResult(title=None, text_content=md_content.strip())
        except zipfile.BadZipFile:
            return DocumentConverterResult(
                title=None,
                text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}",
            )
        except ValueError as ve:
            return DocumentConverterResult(
                title=None,
                text_content=f"[ERROR] Security error in zip file {local_path}: {str(ve)}",
            )
        except Exception as e:
            return DocumentConverterResult(
                title=None,
                text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
            )
        finally:
            # Clean up extracted files if specified. Done in ``finally`` so a
            # failure while converting does not leave the files behind.
            if extracted and kwargs.get("cleanup_extracted", True):
                shutil.rmtree(extraction_dir, ignore_errors=True)

    @staticmethod
    def _is_within_directory(directory: str, member: str) -> bool:
        """Return True when ``member`` resolves to a path inside ``directory``.

        Whole path components are compared via ``os.path.commonpath``. A
        character-wise prefix test would accept a sibling directory whose
        name merely starts with the same characters.
        """
        base = os.path.abspath(directory)
        target = os.path.abspath(os.path.join(base, member))
        try:
            return os.path.commonpath([base, target]) == base
        except ValueError:
            # commonpath raises when the paths cannot share a root
            # (e.g. different drives); such a member is not inside base.
            return False
 class FileConversionException(BaseException):
    """Signals that a file could not be converted.

    Derives from BaseException, so ``except Exception`` handlers do not
    catch it.
    """
@ -866,16 +1234,50 @@ class MarkItDown:
    def __init__(
        self,
        requests_session: Optional[requests.Session] = None,
        llm_client: Optional[Any] = None,
        llm_model: Optional[str] = None,
        style_map: Optional[str] = None,
        # Deprecated
        mlm_client: Optional[Any] = None,
-        mlm_model: Optional[Any] = None,
+        mlm_model: Optional[str] = None,
    ):
        if requests_session is None:
            self._requests_session = requests.Session()
        else:
            self._requests_session = requests_session
-        self._mlm_client = mlm_client
+        # Handle deprecation notices
-        self._mlm_model = mlm_model
+        #############################
        if mlm_client is not None:
            if llm_client is None:
                warn(
                    "'mlm_client' is deprecated, and was renamed 'llm_client'.",
                    DeprecationWarning,
                )
                llm_client = mlm_client
                mlm_client = None
            else:
                raise ValueError(
                    "'mlm_client' is deprecated, and was renamed 'llm_client'. Do not use both at the same time. Just use 'llm_client' instead."
                )
        if mlm_model is not None:
            if llm_model is None:
                warn(
                    "'mlm_model' is deprecated, and was renamed 'llm_model'.",
                    DeprecationWarning,
                )
                llm_model = mlm_model
                mlm_model = None
            else:
                raise ValueError(
                    "'mlm_model' is deprecated, and was renamed 'llm_model'. Do not use both at the same time. Just use 'llm_model' instead."
                )
        #############################
        self._llm_client = llm_client
        self._llm_model = llm_model
        self._style_map = style_map
        self._page_converters: List[DocumentConverter] = []
@ -884,6 +1286,7 @@ class MarkItDown:
        # To this end, the most specific converters should appear below the most generic converters
        self.register_page_converter(PlainTextConverter())
        self.register_page_converter(HtmlConverter())
        self.register_page_converter(RSSConverter())
        self.register_page_converter(WikipediaConverter())
        self.register_page_converter(YouTubeConverter())
        self.register_page_converter(BingSerpConverter())
@ -893,14 +1296,16 @@ class MarkItDown:
        self.register_page_converter(WavConverter())
        self.register_page_converter(Mp3Converter())
        self.register_page_converter(ImageConverter())
        self.register_page_converter(IpynbConverter())
        self.register_page_converter(PdfConverter())
        self.register_page_converter(ZipConverter())
    def convert(
-        self, source: Union[str, requests.Response], **kwargs: Any
+        self, source: Union[str, requests.Response, Path], **kwargs: Any
    ) -> DocumentConverterResult:  # TODO: deal with kwargs
        """
        Args:
-            - source: can be a string representing a path or url, or a requests.response object
+            - source: can be a local path (as a string or pathlib.Path object), a URL string, or a requests.Response object
            - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
        """
@ -917,10 +1322,14 @@ class MarkItDown:
        # Request response
        elif isinstance(source, requests.Response):
            return self.convert_response(source, **kwargs)
        elif isinstance(source, Path):
            return self.convert_local(source, **kwargs)
    def convert_local(
-        self, path: str, **kwargs: Any
+        self, path: Union[str, Path], **kwargs: Any
    ) -> DocumentConverterResult:  # TODO: deal with kwargs
        if isinstance(path, Path):
            path = str(path)
        # Prepare a list of extensions to try (in order of priority)
        ext = kwargs.get("file_extension")
        extensions = [ext] if ext is not None else []
@ -1017,7 +1426,7 @@ class MarkItDown:
                self._append_ext(extensions, g)
            # Convert
-            result = self._convert(temp_path, extensions, url=response.url)
+            result = self._convert(temp_path, extensions, url=response.url, **kwargs)
        # Clean up
        finally:
            try:
@ -1044,11 +1453,17 @@ class MarkItDown:
                    _kwargs.update({"file_extension": ext})
                # Copy any additional global options
-                if "mlm_client" not in _kwargs and self._mlm_client is not None:
+                if "llm_client" not in _kwargs and self._llm_client is not None:
-                    _kwargs["mlm_client"] = self._mlm_client
+                    _kwargs["llm_client"] = self._llm_client
-                if "mlm_model" not in _kwargs and self._mlm_model is not None:
+                if "llm_model" not in _kwargs and self._llm_model is not None:
-                    _kwargs["mlm_model"] = self._mlm_model
+                    _kwargs["llm_model"] = self._llm_model
                # Add the list of converters for nested processing
                _kwargs["_parent_converters"] = self._page_converters
                if "style_map" not in _kwargs and self._style_map is not None:
                    _kwargs["style_map"] = self._style_map
                # If we hit an error log it and keep trying
                try:
@ -1085,7 +1500,6 @@ class MarkItDown:
        if ext == "":
            return
        # if ext not in extensions:
        if True:
        extensions.append(ext)
    def _guess_ext_magic(self, path):
--- a/src/markitdown/py.typed
+++ b/src/markitdown/py.typed
--- a/tests/test_files/test.docx
+++ b/tests/test_files/test.docx
--- a/tests/test_files/test.jpg
+++ b/tests/test_files/test.jpg
--- a/tests/test_files/test.pptx
+++ b/tests/test_files/test.pptx
--- a/tests/test_files/test.xlsx
+++ b/tests/test_files/test.xlsx
--- a/tests/test_files/test_files.zip
+++ b/tests/test_files/test_files.zip
--- a/tests/test_files/test_llm.jpg
+++ b/tests/test_files/test_llm.jpg
--- a/tests/test_files/test_mskanji.csv
+++ b/tests/test_files/test_mskanji.csv
@ -0,0 +1,4 @@
 名前,年齢,住所
 佐藤太郎,30,東京
 三木英子,25,大阪
 髙橋淳,35,名古屋
--- a/tests/test_files/test_notebook.ipynb
+++ b/tests/test_files/test_notebook.ipynb
@ -0,0 +1,89 @@
 {
    "cells": [
        {
            "cell_type": "markdown",
            "id": "0f61db80",
            "metadata": {},
            "source": [
                "# Test Notebook"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 11,
            "id": "3f2a5bbd",
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "markitdown\n"
                    ]
                }
            ],
            "source": [
                "print('markitdown')"
            ]
        },
        {
            "cell_type": "markdown",
            "id": "9b9c0468",
            "metadata": {},
            "source": [
                "## Code Cell Below"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 10,
            "id": "37d8088a",
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "42\n"
                    ]
                }
            ],
            "source": [
                "# comment in code\n",
                "print(42)"
            ]
        },
        {
            "cell_type": "markdown",
            "id": "2e3177bd",
            "metadata": {},
            "source": [
                "End\n",
                "\n",
                "---"
            ]
        }
    ],
    "metadata": {
        "kernelspec": {
            "display_name": "Python 3",
            "language": "python",
            "name": "python3"
        },
        "language_info": {
            "codemirror_mode": {
                "name": "ipython",
                "version": 3
            },
            "file_extension": ".py",
            "mimetype": "text/x-python",
            "name": "python",
            "nbconvert_exporter": "python",
            "pygments_lexer": "ipython3",
            "version": "3.12.8"
        },
        "title": "Test Notebook Title"
    },
    "nbformat": 4,
    "nbformat_minor": 5
 }
--- a/tests/test_files/test_rss.xml
+++ b/tests/test_files/test_rss.xml
--- a/tests/test_files/test_with_comment.docx
+++ b/tests/test_files/test_with_comment.docx
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@ -6,11 +6,23 @@ import shutil
 import pytest
 import requests
 from warnings import catch_warnings, resetwarnings
 from markitdown import MarkItDown
 # Remote tests are not run when the GITHUB_ACTIONS variable is set (CI).
 skip_remote = bool(os.environ.get("GITHUB_ACTIONS"))
 # LLM tests need both the openai client library and an API key.
 try:
    import openai
 except ModuleNotFoundError:
    skip_llm = True
 else:
    skip_llm = not os.environ.get("OPENAI_API_KEY")
 # exiftool tests are skipped when the executable cannot be found on PATH.
 skip_exiftool = shutil.which("exiftool") is None
 # Fixture directory located next to this test module.
 TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
@ -51,12 +63,25 @@ DOCX_TEST_STRINGS = [
    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
 ]
 DOCX_COMMENT_TEST_STRINGS = [
    "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
    "49e168b7-d2ae-407f-a055-2167576f39a1",
    "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
    "# Abstract",
    "# Introduction",
    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    "This is a test comment. 12df-321a",
    "Yet another comment in the doc. 55yiyi-asd09",
 ]
 PPTX_TEST_STRINGS = [
    "2cdda5c8-e50e-4db4-b5f0-9722a649f455",
    "04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
    "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
    "1b92870d-e3b5-4e65-8153-919f4ff45592",
    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f",  # chart title
    "2003",  # chart value
 ]
 BLOG_TEST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"
@ -65,6 +90,13 @@ BLOG_TEST_STRINGS = [
    "an example where high cost can easily prevent a generic complex",
 ]
 RSS_TEST_STRINGS = [
    "The Official Microsoft Blog",
    "In the case of AI, it is absolutely true that the industry is moving incredibly fast",
 ]
 WIKIPEDIA_TEST_URL = "https://en.wikipedia.org/wiki/Microsoft"
 WIKIPEDIA_TEST_STRINGS = [
    "Microsoft entered the operating system (OS) business in 1980 with its own version of [Unix]",
@ -87,6 +119,28 @@ SERP_TEST_EXCLUDES = [
    "data:image/svg+xml,%3Csvg%20width%3D",
 ]
 CSV_CP932_TEST_STRINGS = [
    "名前,年齢,住所",
    "佐藤太郎,30,東京",
    "三木英子,25,大阪",
    "髙橋淳,35,名古屋",
 ]
 LLM_TEST_STRINGS = [
    "5bda1dd6",
 ]
 # --- Helper Functions ---
 def validate_strings(result, expected_strings, exclude_strings=None):
    """Assert which strings appear in a conversion result.

    Backslashes are removed from ``result.text_content`` before matching.
    Every entry of ``expected_strings`` must be present in the remaining
    text; every entry of ``exclude_strings`` (when given and non-empty) must
    be absent. Raises AssertionError on the first violation.
    """
    haystack = result.text_content.replace("\\", "")
    for wanted in expected_strings:
        assert wanted in haystack
    # A missing or empty exclusion list simply means there is nothing to reject.
    for unwanted in exclude_strings or ():
        assert unwanted not in haystack
@pytest.mark.skipif(
    skip_remote,
@ -120,50 +174,64 @@ def test_markitdown_local() -> None:
    # Test XLSX processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
-    for test_string in XLSX_TEST_STRINGS:
+    validate_strings(result, XLSX_TEST_STRINGS)
        text_content = result.text_content.replace("\\", "")
        assert test_string in text_content
    # Test DOCX processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))
-    for test_string in DOCX_TEST_STRINGS:
+    validate_strings(result, DOCX_TEST_STRINGS)
-        text_content = result.text_content.replace("\\", "")
+
-        assert test_string in text_content
+    # Test DOCX processing, with comments
    result = markitdown.convert(
        os.path.join(TEST_FILES_DIR, "test_with_comment.docx"),
        style_map="comment-reference => ",
    )
    validate_strings(result, DOCX_COMMENT_TEST_STRINGS)
    # Test DOCX processing, with comments and setting style_map on init
    markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
    result = markitdown_with_style_map.convert(
        os.path.join(TEST_FILES_DIR, "test_with_comment.docx")
    )
    validate_strings(result, DOCX_COMMENT_TEST_STRINGS)
    # Test PPTX processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx"))
-    for test_string in PPTX_TEST_STRINGS:
+    validate_strings(result, PPTX_TEST_STRINGS)
        text_content = result.text_content.replace("\\", "")
        assert test_string in text_content
    # Test HTML processing
    result = markitdown.convert(
        os.path.join(TEST_FILES_DIR, "test_blog.html"), url=BLOG_TEST_URL
    )
-    for test_string in BLOG_TEST_STRINGS:
+    validate_strings(result, BLOG_TEST_STRINGS)
-        text_content = result.text_content.replace("\\", "")
+
-        assert test_string in text_content
+    # Test ZIP file processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip"))
    validate_strings(result, XLSX_TEST_STRINGS)
    # Test Wikipedia processing
    result = markitdown.convert(
        os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL
    )
    text_content = result.text_content.replace("\\", "")
-    for test_string in WIKIPEDIA_TEST_EXCLUDES:
+    validate_strings(result, WIKIPEDIA_TEST_STRINGS, WIKIPEDIA_TEST_EXCLUDES)
        assert test_string not in text_content
    for test_string in WIKIPEDIA_TEST_STRINGS:
        assert test_string in text_content
    # Test Bing processing
    result = markitdown.convert(
        os.path.join(TEST_FILES_DIR, "test_serp.html"), url=SERP_TEST_URL
    )
    text_content = result.text_content.replace("\\", "")
-    for test_string in SERP_TEST_EXCLUDES:
+    validate_strings(result, SERP_TEST_STRINGS, SERP_TEST_EXCLUDES)
-        assert test_string not in text_content
+
-    for test_string in SERP_TEST_STRINGS:
+    # Test RSS processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_rss.xml"))
    text_content = result.text_content.replace("\\", "")
    for test_string in RSS_TEST_STRINGS:
        assert test_string in text_content
    ## Test non-UTF-8 encoding
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
    validate_strings(result, CSV_CP932_TEST_STRINGS)
@pytest.mark.skipif(
    skip_exiftool,
@ -179,8 +247,63 @@ def test_markitdown_exiftool() -> None:
        assert target in result.text_content
 def test_markitdown_deprecation() -> None:
    """Check the deprecated ``mlm_*`` constructor arguments of MarkItDown.

    Passing only ``mlm_client`` or only ``mlm_model`` must emit exactly one
    DeprecationWarning and store the value under the matching ``_llm_*``
    attribute. Passing a deprecated argument together with its replacement
    must raise ValueError.
    """
    # Imported locally so this test does not depend on the module's import list.
    from warnings import simplefilter

    test_client = object()
    # catch_warnings restores the previous filter state on exit, so no global
    # resetwarnings() call is needed (it would also discard user-set filters).
    with catch_warnings(record=True) as caught:
        # Force every warning to be recorded; otherwise the result depends on
        # the ambient filters, and DeprecationWarning is ignored by default
        # when it is triggered outside of __main__.
        simplefilter("always")
        markitdown = MarkItDown(mlm_client=test_client)
        assert len(caught) == 1
        assert caught[0].category is DeprecationWarning
        assert markitdown._llm_client is test_client
    with catch_warnings(record=True) as caught:
        simplefilter("always")
        markitdown = MarkItDown(mlm_model="gpt-4o")
        assert len(caught) == 1
        assert caught[0].category is DeprecationWarning
        assert markitdown._llm_model == "gpt-4o"
    # Supplying both the old and the new name is rejected outright.
    with pytest.raises(ValueError):
        MarkItDown(mlm_client=test_client, llm_client=test_client)
    with pytest.raises(ValueError):
        MarkItDown(mlm_model="gpt-4o", llm_model="gpt-4o")
@pytest.mark.skipif(
    skip_llm,
    reason="do not run llm tests without a key",
 )
 def test_markitdown_llm() -> None:
    """Convert the LLM test image with an OpenAI-backed MarkItDown instance
    and check the text it produces."""
    llm_client = openai.OpenAI()
    converter = MarkItDown(llm_client=llm_client, llm_model="gpt-4o")
    image_path = os.path.join(TEST_FILES_DIR, "test_llm.jpg")
    result = converter.convert(image_path)
    for expected in LLM_TEST_STRINGS:
        assert expected in result.text_content
    # This is not super precise. It would also accept "red square", "blue circle",
    # "the square is not blue", etc. But it's sufficient for this test.
    for keyword in ("red", "circle", "blue", "square"):
        assert keyword in result.text_content.lower()
 if __name__ == "__main__":
    # Runs this file's tests from the command line, in the order listed.
    for run_test in (
        test_markitdown_remote,
        test_markitdown_local,
        test_markitdown_exiftool,
        test_markitdown_deprecation,
        test_markitdown_llm,
    ):
        run_test()