Most converters are now working.

2025-03-05 00:24:54 -08:00 · 2025-03-05 00:24:54 -08:00 · c426cb81b3
commit c426cb81b3
parent 4a034da269
15 changed files with 422 additions and 286 deletions
--- a/packages/markitdown/src/markitdown/_base_converter.py
+++ b/packages/markitdown/src/markitdown/_base_converter.py
@ -41,7 +41,7 @@ class DocumentConverterResult:
        self.markdown = markdown
    def __str__(self) -> str:
-        """Return the Markdown content."""
+        """Return the converted Markdown text."""
        return self.markdown
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@ -130,7 +130,7 @@ class MarkItDown:
            # Later registrations are tried first / take higher priority than earlier registrations
            # To this end, the most specific converters should appear below the most generic converters
            self.register_converter(PlainTextConverter())
-            self.register_converter(ZipConverter())
+            self.register_converter(ZipConverter(markitdown=self))
            self.register_converter(HtmlConverter())
            self.register_converter(RssConverter())
            self.register_converter(WikipediaConverter())
@ -464,16 +464,16 @@ class MarkItDown:
                # Attempt the conversion
                if _accepts:
-                    # try:
+                    try:
-                    res = converter.convert(file_stream, stream_info, **_kwargs)
+                        res = converter.convert(file_stream, stream_info, **_kwargs)
-                    # except Exception:
+                    except Exception:
-                    #    failed_attempts.append(
+                        failed_attempts.append(
-                    #        FailedConversionAttempt(
+                            FailedConversionAttempt(
-                    #            converter=converter, exc_info=sys.exc_info()
+                                converter=converter, exc_info=sys.exc_info()
-                    #        )
+                            )
-                    #    )
+                        )
-                    # finally:
+                    finally:
-                    file_stream.seek(cur_pos)
+                        file_stream.seek(cur_pos)
                if res is not None:
                    # Normalize the content
--- a/packages/markitdown/src/markitdown/_stream_info.py
+++ b/packages/markitdown/src/markitdown/_stream_info.py
@ -1,4 +1,6 @@
 import puremagic
 import mimetypes
 import os
 from dataclasses import dataclass, asdict
 from typing import Optional, BinaryIO, List, TypeVar, Type
@ -56,6 +58,18 @@ class StreamInfo:
        """
        guesses: List[StreamInfo] = []
        # Add a guess purely based on the filename hint
        if filename_hint:
            try:
                mimetype, _ = mimetypes.guess_file_type(filename_hint)
            except AttributeError:
                mimetype, _ = mimetypes.guess_type(filename_hint)
            if mimetype:
                guesses.append(
                    cls(mimetype=mimetype, extension=os.path.splitext(filename_hint)[1])
                )
        def _puremagic(
            file_stream, filename_hint
        ) -> puremagic.main.PureMagicWithConfidence:
--- a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
@ -41,7 +41,7 @@ class BingSerpConverter(DocumentConverter):
        Make sure we're dealing with HTML content *from* Bing.
        """
-        url = (stream_info.url or "").lower()
+        url = stream_info.url or ""
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()
--- a/packages/markitdown/src/markitdown/converters/_image_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_image_converter.py
@ -1,4 +1,4 @@
-from typing import BinaryIO, Any
+from typing import BinaryIO, Any, Union
 import base64
 import mimetypes
 from ._exiftool import exiftool_metadata
@ -71,53 +71,73 @@ class ImageConverter(DocumentConverter):
                if f in metadata:
                    md_content += f"{f}: {metadata[f]}\n"
-        #        # Try describing the image with GPTV
+        # Try describing the image with GPT
-        #        llm_client = kwargs.get("llm_client")
+        llm_client = kwargs.get("llm_client")
-        #        llm_model = kwargs.get("llm_model")
+        llm_model = kwargs.get("llm_model")
-        #        if llm_client is not None and llm_model is not None:
+        if llm_client is not None and llm_model is not None:
-        #            md_content += (
+            md_content += (
-        #                "\n# Description:\n"
+                "\n# Description:\n"
-        #                + self._get_llm_description(
+                + self._get_llm_description(
-        #                    local_path,
+                    file_stream,
-        #                    extension,
+                    stream_info,
-        #                    llm_client,
+                    client=llm_client,
-        #                    llm_model,
+                    model=llm_model,
-        #                    prompt=kwargs.get("llm_prompt"),
+                    prompt=kwargs.get("llm_prompt"),
-        #                ).strip()
+                ).strip()
-        #                + "\n"
+                + "\n"
-        #            )
+            )
        return DocumentConverterResult(
            markdown=md_content,
        )
    def _get_llm_description(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        *,
        client,
        model,
        prompt=None,
    ) -> Union[None, str]:
        if prompt is None or prompt.strip() == "":
            prompt = "Write a detailed caption for this image."
-#    def _get_llm_description(self, local_path, extension, client, model, prompt=None):
+        # Get the content type
-#        if prompt is None or prompt.strip() == "":
+        content_type = stream_info.mimetype
-#            prompt = "Write a detailed caption for this image."
+        if not content_type:
-#
+            content_type, _ = mimetypes.guess_type("_dummy" + stream_info.extension)
-#        data_uri = ""
+        if not content_type:
-#        with open(local_path, "rb") as image_file:
+            content_type = "application/octet-stream"
-#            content_type, encoding = mimetypes.guess_type("_dummy" + extension)
+
-#            if content_type is None:
+        # Convert to base64
-#                content_type = "image/jpeg"
+        cur_pos = file_stream.tell()
-#            image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
+        try:
-#            data_uri = f"data:{content_type};base64,{image_base64}"
+            base64_image = base64.b64encode(file_stream.read()).decode("utf-8")
-#
+        except Exception as e:
-#        messages = [
+            return None
-#            {
+        finally:
-#                "role": "user",
+            file_stream.seek(cur_pos)
-#                "content": [
+
-#                    {"type": "text", "text": prompt},
+        # Prepare the data-uri
-#                    {
+        data_uri = f"data:{content_type};base64,{base64_image}"
-#                        "type": "image_url",
+
-#                        "image_url": {
+        # Prepare the OpenAI API request
-#                            "url": data_uri,
+        messages = [
-#                        },
+            {
-#                    },
+                "role": "user",
-#                ],
+                "content": [
-#            }
+                    {"type": "text", "text": prompt},
-#        ]
+                    {
-#
+                        "type": "image_url",
-#        response = client.chat.completions.create(model=model, messages=messages)
+                        "image_url": {
-#        return response.choices[0].message.content
+                            "url": data_uri,
                        },
                    },
                ],
            }
        ]
        # Call the OpenAI API
        response = client.chat.completions.create(model=model, messages=messages)
        return response.choices[0].message.content
--- a/packages/markitdown/src/markitdown/converters/_llm_caption.py
+++ b/packages/markitdown/src/markitdown/converters/_llm_caption.py
@ -0,0 +1,50 @@
 from typing import BinaryIO, Any, Union
 import base64
 import mimetypes
 from .._stream_info import StreamInfo
 def llm_caption(
    file_stream: BinaryIO, stream_info: StreamInfo, *, client, model, prompt=None
 ) -> Union[None, str]:
    """Ask an OpenAI-style chat client to caption the image in ``file_stream``.

    Args:
        file_stream: Binary stream holding the image. It is read from its
            current position to the end, then rewound to that position.
        stream_info: Supplies ``mimetype`` and ``extension`` for the data URI.
        client: Object exposing ``chat.completions.create(model=, messages=)``.
        model: Model name forwarded to the client.
        prompt: Instruction sent alongside the image; a default caption
            request is used when it is ``None`` or blank.

    Returns:
        The content of the first choice in the response, or ``None`` if the
        stream could not be read and encoded.
    """
    if prompt is None or prompt.strip() == "":
        prompt = "Write a detailed caption for this image."

    # Get the content type: explicit mimetype, then a guess from the
    # extension, then a generic binary type.
    content_type = stream_info.mimetype
    if not content_type:
        # The extension may be None (e.g. an embedded image with no filename);
        # concatenating None would raise TypeError.
        content_type, _ = mimetypes.guess_type(
            "_dummy" + (stream_info.extension or "")
        )
    if not content_type:
        content_type = "application/octet-stream"

    # Convert to base64, leaving the stream where the caller had it
    cur_pos = file_stream.tell()
    try:
        base64_image = base64.b64encode(file_stream.read()).decode("utf-8")
    except Exception:
        # Best effort: an unreadable stream simply yields no caption
        return None
    finally:
        file_stream.seek(cur_pos)

    # Prepare the data-uri
    data_uri = f"data:{content_type};base64,{base64_image}"

    # Prepare the OpenAI API request
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": data_uri,
                    },
                },
            ],
        }
    ]

    # Call the OpenAI API
    response = client.chat.completions.create(model=model, messages=messages)
    return response.choices[0].message.content
--- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@ -1,8 +1,13 @@
 import sys
-from typing import Union
+
 from typing import BinaryIO, Any
 from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 # Try loading optional (but in this case, required) dependencies
 # Save reporting of any exceptions for later
 _dependency_exc_info = None
@ -14,6 +19,14 @@ except ImportError:
    _dependency_exc_info = sys.exc_info()
 ACCEPTED_MIME_TYPE_PREFIXES = [
    "application/pdf",
    "application/x-pdf",
 ]
 ACCEPTED_FILE_EXTENSIONS = [".pdf"]
 class PdfConverter(DocumentConverter):
    """
    Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
@ -24,12 +37,30 @@ class PdfConverter(DocumentConverter):
    ):
        super().__init__(priority=priority)
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+    def accepts(
-        # Bail if not a PDF
+        self,
-        extension = kwargs.get("file_extension", "")
+        file_stream: BinaryIO,
-        if extension.lower() != ".pdf":
+        stream_info: StreamInfo,
-            return None
+        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()
        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True
        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return True
        return False
    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        # Check the dependencies
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
@ -43,5 +74,5 @@ class PdfConverter(DocumentConverter):
            )  # Restore the original traceback
        return DocumentConverterResult(
-            markdown=pdfminer.high_level.extract_text(local_path)
+            markdown=pdfminer.high_level.extract_text(file_stream),
        )
--- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
@ -1,13 +1,26 @@
-import mimetypes
+import sys
 from charset_normalizer import from_path
 from typing import Any, Union
 from typing import BinaryIO, Any
 from charset_normalizer import from_bytes
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
 # Try loading optional (but in this case, required) dependencies
 # Save reporting of any exceptions for later
 _dependency_exc_info = None
 try:
    import mammoth
 except ImportError:
    # Preserve the error and stack trace for later
    _dependency_exc_info = sys.exc_info()
 ACCEPTED_MIME_TYPE_PREFIXES = [
    "text/",
    "application/json",
 ]
 # Mimetypes to ignore (commonly confused extensions)
-IGNORE_MIMETYPES = [
+IGNORE_MIME_TYPE_PREFIXES = [
    "text/vnd.in3d.spot",  # .spo which is confused with xls, doc, etc.
    "text/vnd.graphviz",  # .dot which is confused with xls, doc, etc.
 ]
@ -21,26 +34,34 @@ class PlainTextConverter(DocumentConverter):
    ):
        super().__init__(priority=priority)
    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """Report whether the stream's MIME type marks it as plain text.

        A MIME type starting with any entry of IGNORE_MIME_TYPE_PREFIXES is
        rejected before ACCEPTED_MIME_TYPE_PREFIXES is consulted. The file
        extension does not influence the result.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()  # not consulted below

        # Deny-list wins over the allow-list
        if any(mimetype.startswith(ignored) for ignored in IGNORE_MIME_TYPE_PREFIXES):
            return False

        return any(
            mimetype.startswith(accepted) for accepted in ACCEPTED_MIME_TYPE_PREFIXES
        )
    def convert(
-        self, local_path: str, **kwargs: Any
+        self,
-    ) -> Union[None, DocumentConverterResult]:
+        file_stream: BinaryIO,
-        # Guess the content type from any file extension that might be around
+        stream_info: StreamInfo,
-        content_type, _ = mimetypes.guess_type(
+        **kwargs: Any,  # Options to pass to the converter
-            "__placeholder" + kwargs.get("file_extension", "")
+    ) -> DocumentConverterResult:
-        )
+        if stream_info.charset:
            text_content = file_stream.read().decode(stream_info.charset)
        else:
            text_content = str(from_bytes(file_stream.read()).best())
        # Ignore common false positives
        if content_type in IGNORE_MIMETYPES:
            content_type = None
        # Only accept text files
        if content_type is None:
            return None
        elif all(
            not content_type.lower().startswith(type_prefix)
            for type_prefix in ["text/", "application/json"]
        ):
            return None
        text_content = str(from_path(local_path).best())
        return DocumentConverterResult(markdown=text_content)
--- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
@ -1,11 +1,14 @@
 import sys
 import base64
 import os
 import io
 import re
 import html
 from typing import BinaryIO, Any
 from ._html_converter import HtmlConverter
 from ._llm_caption import llm_caption
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
@ -38,35 +41,6 @@ class PptxConverter(DocumentConverter):
        super().__init__(priority=priority)
        self._html_converter = HtmlConverter()
    def _get_llm_description(
        self, llm_client, llm_model, image_blob, content_type, prompt=None
    ):
        if prompt is None or prompt.strip() == "":
            prompt = "Write a detailed alt text for this image with less than 50 words."
        image_base64 = base64.b64encode(image_blob).decode("utf-8")
        data_uri = f"data:{content_type};base64,{image_base64}"
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": data_uri,
                        },
                    },
                    {"type": "text", "text": prompt},
                ],
            }
        ]
        response = llm_client.chat.completions.create(
            model=llm_model, messages=messages
        )
        return response.choices[0].message.content
    def accepts(
        self,
        file_stream: BinaryIO,
@ -120,41 +94,54 @@ class PptxConverter(DocumentConverter):
                if self._is_picture(shape):
                    # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
-                    llm_description = None
+                    llm_description = ""
-                    alt_text = None
+                    alt_text = ""
                    # Potentially generate a description using an LLM
                    llm_client = kwargs.get("llm_client")
                    llm_model = kwargs.get("llm_model")
                    if llm_client is not None and llm_model is not None:
                        # Prepare a file_stream and stream_info for the image data
                        image_filename = shape.image.filename
                        image_extension = None
                        if image_filename:
                            image_extension = os.path.splitext(image_filename)[1]
                        image_stream_info = StreamInfo(
                            mimetype=shape.image.content_type,
                            extension=image_extension,
                            filename=image_filename,
                        )
                        image_stream = io.BytesIO(shape.image.blob)
                        # Caption the image
                        try:
-                            llm_description = self._get_llm_description(
+                            llm_description = llm_caption(
-                                llm_client,
+                                image_stream,
-                                llm_model,
+                                image_stream_info,
-                                shape.image.blob,
+                                client=llm_client,
-                                shape.image.content_type,
+                                model=llm_model,
                                prompt=kwargs.get("llm_prompt"),
                            )
                        except Exception:
-                            # Unable to describe with LLM
+                            # Unable to generate a description
                            pass
-                    if not llm_description:
+                    # Also grab any description embedded in the deck
-                        try:
+                    try:
-                            alt_text = shape._element._nvXxPr.cNvPr.attrib.get(
+                        alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
-                                "descr", ""
+                    except Exception:
-                            )
+                        # Unable to get alt text
-                        except Exception:
+                        pass
-                            # Unable to get alt text
+
-                            pass
+                    # Prepare the alt, escaping any special characters
                    alt_text = "\n".join([llm_description, alt_text]) or shape.name
                    alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text)
                    alt_text = re.sub(r"\s+", " ", alt_text).strip()
                    # A placeholder name
                    filename = re.sub(r"\W", "", shape.name) + ".jpg"
-                    md_content += (
+                    md_content += "\n![" + alt_text + "](" + filename + ")\n"
                        "\n!["
                        + (llm_description or alt_text or shape.name)
                        + "]("
                        + filename
                        + ")\n"
                    )
                # Tables
                if self._is_table(shape):
--- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
@ -36,7 +36,7 @@ class WikipediaConverter(DocumentConverter):
        Make sure we're dealing with HTML content *from* Wikipedia.
        """
-        url = (stream_info.url or "").lower()
+        url = stream_info.url or ""
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()
--- a/packages/markitdown/src/markitdown/converters/_youtube_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_youtube_converter.py
@ -1,14 +1,15 @@
-import re
+import sys
 import json
 import urllib.parse
 import time
-
+import io
-from typing import Any, Union, Dict, List
+import re
-from urllib.parse import parse_qs, urlparse
+from typing import Any, BinaryIO, Optional, Dict, List, Union
 from urllib.parse import parse_qs, urlparse, unquote
 from bs4 import BeautifulSoup
 from .._base_converter import DocumentConverter, DocumentConverterResult
-
+from .._stream_info import StreamInfo
 from ._markdownify import _CustomMarkdownify
 # Optional YouTube transcription support
 try:
@ -19,6 +20,17 @@ except ModuleNotFoundError:
    IS_YOUTUBE_TRANSCRIPT_CAPABLE = False
 ACCEPTED_MIME_TYPE_PREFIXES = [
    "text/html",
    "application/xhtml",
 ]
 ACCEPTED_FILE_EXTENSIONS = [
    ".html",
    ".htm",
 ]
 class YouTubeConverter(DocumentConverter):
    """Handle YouTube specially, focusing on the video title, description, and transcript."""
@ -27,45 +39,45 @@ class YouTubeConverter(DocumentConverter):
    ):
        super().__init__(priority=priority)
-    def retry_operation(self, operation, retries=3, delay=2):
+    def accepts(
-        """Retries the operation if it fails."""
+        self,
-        attempt = 0
+        file_stream: BinaryIO,
-        while attempt < retries:
+        stream_info: StreamInfo,
-            try:
+        **kwargs: Any,  # Options to pass to the converter
-                return operation()  # Attempt the operation
+    ) -> bool:
-            except Exception as e:
+        """
-                print(f"Attempt {attempt + 1} failed: {e}")
+        Make sure we're dealing with HTML content *from* YouTube.
-                if attempt < retries - 1:
+        """
-                    time.sleep(delay)  # Wait before retrying
+        url = stream_info.url or ""
-                attempt += 1
+        mimetype = (stream_info.mimetype or "").lower()
-        # If all attempts fail, raise the last exception
+        extension = (stream_info.extension or "").lower()
        raise Exception(f"Operation failed after {retries} attempts.")
-    def convert(
+        url = unquote(url)
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not YouTube
        extension = kwargs.get("file_extension", "")
        if extension.lower() not in [".html", ".htm"]:
            return None
        url = kwargs.get("url", "")
        url = urllib.parse.unquote(url)
        url = url.replace(r"\?", "?").replace(r"\=", "=")
        if not url.startswith("https://www.youtube.com/watch?"):
-            return None
+            # Not a YouTube URL
            return False
-        # Parse the file with error handling
+        if extension in ACCEPTED_FILE_EXTENSIONS:
-        try:
+            return True
            with open(local_path, "rt", encoding="utf-8") as fh:
                soup = BeautifulSoup(fh.read(), "html.parser")
        except Exception as e:
            print(f"Error reading YouTube page: {e}")
            return None
-        if not soup.title or not soup.title.string:
+        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
-            return None
+            if mimetype.startswith(prefix):
                return True
        # Not HTML content
        return False
    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        # Parse the stream
        encoding = "utf-8" if stream_info.charset is None else stream_info.charset
        soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
        # Read the meta tags
        metadata: Dict[str, str] = {"title": soup.title.string}
@ -126,7 +138,7 @@ class YouTubeConverter(DocumentConverter):
        if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
            transcript_text = ""
-            parsed_url = urlparse(url)  # type: ignore
+            parsed_url = urlparse(stream_info.url)  # type: ignore
            params = parse_qs(parsed_url.query)  # type: ignore
            if "v" in params and params["v"][0]:
                video_id = str(params["v"][0])
@ -135,7 +147,7 @@ class YouTubeConverter(DocumentConverter):
                        "youtube_transcript_languages", ("en",)
                    )
                    # Retry the transcript fetching operation
-                    transcript = self.retry_operation(
+                    transcript = self._retry_operation(
                        lambda: YouTubeTranscriptApi.get_transcript(
                            video_id, languages=youtube_transcript_languages
                        ),
@ -188,3 +200,17 @@ class YouTubeConverter(DocumentConverter):
                if result := self._findKey(v, key):
                    return result
        return None
    def _retry_operation(self, operation, retries=3, delay=2):
        """Retries the operation if it fails."""
        attempt = 0
        while attempt < retries:
            try:
                return operation()  # Attempt the operation
            except Exception as e:
                print(f"Attempt {attempt + 1} failed: {e}")
                if attempt < retries - 1:
                    time.sleep(delay)  # Wait before retrying
                attempt += 1
        # If all attempts fail, raise the last exception
        raise Exception(f"Operation failed after {retries} attempts.")
--- a/packages/markitdown/src/markitdown/converters/_zip_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_zip_converter.py
@ -1,9 +1,19 @@
-import os
+import sys
 import zipfile
-import shutil
+import io
-from typing import Any, Union
+import os
 from typing import BinaryIO, Any
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
 from .._exceptions import UnsupportedFormatException, FileConversionException
 ACCEPTED_MIME_TYPE_PREFIXES = [
    "application/zip",
 ]
 ACCEPTED_FILE_EXTENSIONS = [".zip"]
 class ZipConverter(DocumentConverter):
@ -46,95 +56,59 @@ class ZipConverter(DocumentConverter):
    """
    def __init__(
-        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+        self,
        priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
        *,
        markitdown: Any,
    ):
        super().__init__(priority=priority)
        self._markitdown = markitdown
    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """Accept streams whose extension or MIME type identifies a ZIP archive.

        The lower-cased extension is checked against ACCEPTED_FILE_EXTENSIONS
        first; failing that, the lower-cased MIME type must start with one of
        ACCEPTED_MIME_TYPE_PREFIXES.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        return extension in ACCEPTED_FILE_EXTENSIONS or any(
            mimetype.startswith(accepted) for accepted in ACCEPTED_MIME_TYPE_PREFIXES
        )
    def convert(
-        self, local_path: str, **kwargs: Any
+        self,
-    ) -> Union[None, DocumentConverterResult]:
+        file_stream: BinaryIO,
-        # Bail if not a ZIP
+        stream_info: StreamInfo,
-        extension = kwargs.get("file_extension", "")
+        **kwargs: Any,  # Options to pass to the converter
-        if extension.lower() != ".zip":
+    ) -> DocumentConverterResult:
-            return None
+        file_path = stream_info.url or stream_info.local_path or stream_info.file_name
        md_content = f"Content from the zip file `{file_path}`:\n\n"
-        # Get parent converters list if available
+        with zipfile.ZipFile(file_stream, "r") as zipObj:
-        parent_converters = kwargs.get("_parent_converters", [])
+            for name in zipObj.namelist():
-        if not parent_converters:
+                try:
-            return DocumentConverterResult(
+                    z_file_stream = io.BytesIO(zipObj.read(name))
-                markdown=f"[ERROR] No converters available to process zip contents from: {local_path}",
+                    z_file_stream_info = StreamInfo(
-            )
+                        extension=os.path.splitext(name)[1],
                        filename=os.path.basename(name),
                    )
                    result = self._markitdown.convert_stream(
                        stream=z_file_stream,
                        stream_info=z_file_stream_info,
                    )
                    if result is not None:
                        md_content += f"## File: {name}\n\n"
                        md_content += result.markdown + "\n\n"
                except UnsupportedFormatException:
                    pass
                except FileConversionException:
                    pass
-        extracted_zip_folder_name = (
+        return DocumentConverterResult(markdown=md_content.strip())
            f"extracted_{os.path.basename(local_path).replace('.zip', '_zip')}"
        )
        extraction_dir = os.path.normpath(
            os.path.join(os.path.dirname(local_path), extracted_zip_folder_name)
        )
        md_content = f"Content from the zip file `{os.path.basename(local_path)}`:\n\n"
        try:
            # Extract the zip file safely
            with zipfile.ZipFile(local_path, "r") as zipObj:
                # Bail if we discover it's an Office OOXML file
                if "[Content_Types].xml" in zipObj.namelist():
                    return None
                # Safeguard against path traversal
                for member in zipObj.namelist():
                    member_path = os.path.normpath(os.path.join(extraction_dir, member))
                    if (
                        not os.path.commonprefix([extraction_dir, member_path])
                        == extraction_dir
                    ):
                        raise ValueError(
                            f"Path traversal detected in zip file: {member}"
                        )
                # Extract all files safely
                zipObj.extractall(path=extraction_dir)
            # Process each extracted file
            for root, dirs, files in os.walk(extraction_dir):
                for name in files:
                    file_path = os.path.join(root, name)
                    relative_path = os.path.relpath(file_path, extraction_dir)
                    # Get file extension
                    _, file_extension = os.path.splitext(name)
                    # Update kwargs for the file
                    file_kwargs = kwargs.copy()
                    file_kwargs["file_extension"] = file_extension
                    file_kwargs["_parent_converters"] = parent_converters
                    # Try converting the file using available converters
                    for converter in parent_converters:
                        # Skip the zip converter to avoid infinite recursion
                        if isinstance(converter, ZipConverter):
                            continue
                        result = converter.convert(file_path, **file_kwargs)
                        if result is not None:
                            md_content += f"\n## File: {relative_path}\n\n"
                            md_content += result.markdown + "\n\n"
                            break
            # Clean up extracted files if specified
            if kwargs.get("cleanup_extracted", True):
                shutil.rmtree(extraction_dir)
            return DocumentConverterResult(markdown=md_content.strip())
        except zipfile.BadZipFile:
            return DocumentConverterResult(
                markdown=f"[ERROR] Invalid or corrupted zip file: {local_path}",
            )
        except ValueError as ve:
            return DocumentConverterResult(
                markdown=f"[ERROR] Security error in zip file {local_path}: {str(ve)}",
            )
        except Exception as e:
            return DocumentConverterResult(
                markdown=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
            )
--- a/packages/markitdown/tests/test_files/test.pdf
+++ b/packages/markitdown/tests/test_files/test.pdf
--- a/packages/markitdown/tests/test_files/test.pptx
+++ b/packages/markitdown/tests/test_files/test.pptx
--- a/packages/markitdown/tests/test_markitdown.py
+++ b/packages/markitdown/tests/test_markitdown.py
@ -2,6 +2,7 @@
 import io
 import os
 import shutil
 import openai
 import pytest
 import requests
@ -289,7 +290,6 @@ def test_markitdown_remote() -> None:
        assert test_string in result.text_content
    # Youtube
    # TODO: This test randomly fails for some reason. Haven't been able to repro it yet. Disabling until I can debug the issue
    result = markitdown.convert(YOUTUBE_TEST_URL)
    for test_string in YOUTUBE_TEST_STRINGS:
        assert test_string in result.text_content
@ -298,6 +298,10 @@ def test_markitdown_remote() -> None:
 def test_markitdown_local() -> None:
    markitdown = MarkItDown()
    # Test PDF processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pdf"))
    validate_strings(result, PDF_TEST_STRINGS)
    # Test XLSX processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
    validate_strings(result, XLSX_TEST_STRINGS)
@ -336,10 +340,6 @@ def test_markitdown_local() -> None:
    )
    validate_strings(result, BLOG_TEST_STRINGS)
    # Test ZIP file processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip"))
    validate_strings(result, XLSX_TEST_STRINGS)
    # Test Wikipedia processing
    result = markitdown.convert(
        os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL
@ -360,18 +360,24 @@ def test_markitdown_local() -> None:
    for test_string in RSS_TEST_STRINGS:
        assert test_string in text_content
    ## Test non-UTF-8 encoding
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
    validate_strings(result, CSV_CP932_TEST_STRINGS)
    # Test MSG (Outlook email) processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg"))
    validate_strings(result, MSG_TEST_STRINGS)
    # Test non-UTF-8 encoding
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
    validate_strings(result, CSV_CP932_TEST_STRINGS)
    # Test JSON processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.json"))
    validate_strings(result, JSON_TEST_STRINGS)
    # Test ZIP file processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip"))
    validate_strings(result, DOCX_TEST_STRINGS)
    validate_strings(result, XLSX_TEST_STRINGS)
    validate_strings(result, BLOG_TEST_STRINGS)
    # Test input from a stream
    input_data = b"<html><body><h1>Test</h1></body></html>"
    result = markitdown.convert_stream(io.BytesIO(input_data))
@ -441,7 +447,6 @@ def test_markitdown_llm() -> None:
    markitdown = MarkItDown(llm_client=client, llm_model="gpt-4o")
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_llm.jpg"))
    for test_string in LLM_TEST_STRINGS:
        assert test_string in result.text_content
@ -450,6 +455,14 @@ def test_markitdown_llm() -> None:
    for test_string in ["red", "circle", "blue", "square"]:
        assert test_string in result.text_content.lower()
    # Images embedded in PPTX files
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx"))
    # LLM Captions are included
    for test_string in LLM_TEST_STRINGS:
        assert test_string in result.text_content
    # Standard alt text is included
    validate_strings(result, PPTX_TEST_STRINGS)
 if __name__ == "__main__":
    """Runs this file's tests from the command line."""
@ -457,7 +470,7 @@ if __name__ == "__main__":
    test_stream_info_guesses()
    test_markitdown_remote()
    test_markitdown_local()
-    # test_exceptions()
+    test_exceptions()
-    # test_markitdown_exiftool()
+    test_markitdown_exiftool()
-    # test_markitdown_llm()
+    test_markitdown_llm()
    print("All tests passed!")