diff --git a/packages/markitdown/src/markitdown/_base_converter.py b/packages/markitdown/src/markitdown/_base_converter.py index 5919295..89506b1 100644 --- a/packages/markitdown/src/markitdown/_base_converter.py +++ b/packages/markitdown/src/markitdown/_base_converter.py @@ -41,7 +41,7 @@ class DocumentConverterResult: self.markdown = markdown def __str__(self) -> str: - """Return the Markdown content.""" + """Return the converted Markdown text.""" return self.markdown diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index db5d378..f1d0c1f 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -130,7 +130,7 @@ class MarkItDown: # Later registrations are tried first / take higher priority than earlier registrations # To this end, the most specific converters should appear below the most generic converters self.register_converter(PlainTextConverter()) - self.register_converter(ZipConverter()) + self.register_converter(ZipConverter(markitdown=self)) self.register_converter(HtmlConverter()) self.register_converter(RssConverter()) self.register_converter(WikipediaConverter()) @@ -464,16 +464,16 @@ class MarkItDown: # Attempt the conversion if _accepts: - # try: - res = converter.convert(file_stream, stream_info, **_kwargs) - # except Exception: - # failed_attempts.append( - # FailedConversionAttempt( - # converter=converter, exc_info=sys.exc_info() - # ) - # ) - # finally: - file_stream.seek(cur_pos) + try: + res = converter.convert(file_stream, stream_info, **_kwargs) + except Exception: + failed_attempts.append( + FailedConversionAttempt( + converter=converter, exc_info=sys.exc_info() + ) + ) + finally: + file_stream.seek(cur_pos) if res is not None: # Normalize the content diff --git a/packages/markitdown/src/markitdown/_stream_info.py b/packages/markitdown/src/markitdown/_stream_info.py index 7a30dcc..cf36875 100644 --- a/packages/markitdown/src/markitdown/_stream_info.py +++ b/packages/markitdown/src/markitdown/_stream_info.py @@ -1,4 +1,6 @@ import puremagic +import mimetypes +import os from dataclasses import dataclass, asdict from typing import Optional, BinaryIO, List, TypeVar, Type @@ -56,6 +58,18 @@ class StreamInfo: """ guesses: List[StreamInfo] = [] + # Add a guess purely based on the filename hint + if filename_hint: + try: + mimetype, _ = mimetypes.guess_file_type(filename_hint) + except AttributeError: + mimetype, _ = mimetypes.guess_type(filename_hint) + + if mimetype: + guesses.append( + cls(mimetype=mimetype, extension=os.path.splitext(filename_hint)[1]) + ) + def _puremagic( file_stream, filename_hint ) -> puremagic.main.PureMagicWithConfidence: diff --git a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py index 2adcdb9..2e9913c 100644 --- a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py +++ b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py @@ -41,7 +41,7 @@ class BingSerpConverter(DocumentConverter): Make sure we're dealing with HTML content *from* Bing. """ - url = (stream_info.url or "").lower() + url = stream_info.url or "" mimetype = (stream_info.mimetype or "").lower() extension = (stream_info.extension or "").lower() diff --git a/packages/markitdown/src/markitdown/converters/_image_converter.py b/packages/markitdown/src/markitdown/converters/_image_converter.py index c51124f..d0d7e66 100644 --- a/packages/markitdown/src/markitdown/converters/_image_converter.py +++ b/packages/markitdown/src/markitdown/converters/_image_converter.py @@ -1,4 +1,4 @@ -from typing import BinaryIO, Any +from typing import BinaryIO, Any, Union import base64 import mimetypes from ._exiftool import exiftool_metadata @@ -71,53 +71,73 @@ class ImageConverter(DocumentConverter): if f in metadata: md_content += f"{f}: {metadata[f]}\n" - # # Try describing the image with GPTV - # llm_client = kwargs.get("llm_client") - # llm_model = kwargs.get("llm_model") - # if llm_client is not None and llm_model is not None: - # md_content += ( - # "\n# Description:\n" - # + self._get_llm_description( - # local_path, - # extension, - # llm_client, - # llm_model, - # prompt=kwargs.get("llm_prompt"), - # ).strip() - # + "\n" - # ) + # Try describing the image with GPT + llm_client = kwargs.get("llm_client") + llm_model = kwargs.get("llm_model") + if llm_client is not None and llm_model is not None: + md_content += ( + "\n# Description:\n" + + self._get_llm_description( + file_stream, + stream_info, + client=llm_client, + model=llm_model, + prompt=kwargs.get("llm_prompt"), + ).strip() + + "\n" + ) return DocumentConverterResult( markdown=md_content, ) + def _get_llm_description( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + *, + client, + model, + prompt=None, + ) -> Union[None, str]: + if prompt is None or prompt.strip() == "": + prompt = "Write a detailed caption for this image." -# def _get_llm_description(self, local_path, extension, client, model, prompt=None): -# if prompt is None or prompt.strip() == "": -# prompt = "Write a detailed caption for this image." -# -# data_uri = "" -# with open(local_path, "rb") as image_file: -# content_type, encoding = mimetypes.guess_type("_dummy" + extension) -# if content_type is None: -# content_type = "image/jpeg" -# image_base64 = base64.b64encode(image_file.read()).decode("utf-8") -# data_uri = f"data:{content_type};base64,{image_base64}" -# -# messages = [ -# { -# "role": "user", -# "content": [ -# {"type": "text", "text": prompt}, -# { -# "type": "image_url", -# "image_url": { -# "url": data_uri, -# }, -# }, -# ], -# } -# ] -# -# response = client.chat.completions.create(model=model, messages=messages) -# return response.choices[0].message.content + # Get the content type + content_type = stream_info.mimetype + if not content_type: + content_type, _ = mimetypes.guess_type("_dummy" + stream_info.extension) + if not content_type: + content_type = "application/octet-stream" + + # Convert to base64 + cur_pos = file_stream.tell() + try: + base64_image = base64.b64encode(file_stream.read()).decode("utf-8") + except Exception as e: + return None + finally: + file_stream.seek(cur_pos) + + # Prepare the data-uri + data_uri = f"data:{content_type};base64,{base64_image}" + + # Prepare the OpenAI API request + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + { + "type": "image_url", + "image_url": { + "url": data_uri, + }, + }, + ], + } + ] + + # Call the OpenAI API + response = client.chat.completions.create(model=model, messages=messages) + return response.choices[0].message.content diff --git a/packages/markitdown/src/markitdown/converters/_llm_caption.py b/packages/markitdown/src/markitdown/converters/_llm_caption.py new file mode 100644 index 0000000..44756e0 --- /dev/null +++ b/packages/markitdown/src/markitdown/converters/_llm_caption.py @@ -0,0 +1,50 @@ +from typing import BinaryIO, Any, Union +import base64 +import mimetypes +from .._stream_info import StreamInfo + + +def llm_caption( + file_stream: BinaryIO, stream_info: StreamInfo, *, client, model, prompt=None +) -> Union[None, str]: + if prompt is None or prompt.strip() == "": + prompt = "Write a detailed caption for this image." + + # Get the content type + content_type = stream_info.mimetype + if not content_type: + content_type, _ = mimetypes.guess_type("_dummy" + stream_info.extension) + if not content_type: + content_type = "application/octet-stream" + + # Convert to base64 + cur_pos = file_stream.tell() + try: + base64_image = base64.b64encode(file_stream.read()).decode("utf-8") + except Exception as e: + return None + finally: + file_stream.seek(cur_pos) + + # Prepare the data-uri + data_uri = f"data:{content_type};base64,{base64_image}" + + # Prepare the OpenAI API request + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + { + "type": "image_url", + "image_url": { + "url": data_uri, + }, + }, + ], + } + ] + + # Call the OpenAI API + response = client.chat.completions.create(model=model, messages=messages) + return response.choices[0].message.content diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py index 00228b5..48fc499 100644 --- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py @@ -1,8 +1,13 @@ import sys -from typing import Union + +from typing import BinaryIO, Any + +from ._html_converter import HtmlConverter from .._base_converter import DocumentConverter, DocumentConverterResult +from .._stream_info import StreamInfo from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE + # Try loading optional (but in this case, required) dependencies # Save reporting of any exceptions for later _dependency_exc_info = None @@ -14,6 +19,14 @@ except ImportError: _dependency_exc_info = sys.exc_info() +ACCEPTED_MIME_TYPE_PREFIXES = [ + "application/pdf", + "application/x-pdf", +] + +ACCEPTED_FILE_EXTENSIONS = [".pdf"] + + class PdfConverter(DocumentConverter): """ Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text. @@ -24,12 +37,30 @@ class PdfConverter(DocumentConverter): ): super().__init__(priority=priority) - def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: - # Bail if not a PDF - extension = kwargs.get("file_extension", "") - if extension.lower() != ".pdf": - return None + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + if extension in ACCEPTED_FILE_EXTENSIONS: + return True + + for prefix in ACCEPTED_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: # Check the dependencies if _dependency_exc_info is not None: raise MissingDependencyException( @@ -43,5 +74,5 @@ class PdfConverter(DocumentConverter): ) # Restore the original traceback return DocumentConverterResult( - markdown=pdfminer.high_level.extract_text(local_path) + markdown=pdfminer.high_level.extract_text(file_stream), ) diff --git a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py index a9f1902..92da511 100644 --- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py +++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py @@ -1,13 +1,26 @@ -import mimetypes - -from charset_normalizer import from_path -from typing import Any, Union +import sys +from typing import BinaryIO, Any +from charset_normalizer import from_bytes from .._base_converter import DocumentConverter, DocumentConverterResult +from .._stream_info import StreamInfo +# Try loading optional (but in this case, required) dependencies +# Save reporting of any exceptions for later +_dependency_exc_info = None +try: + import mammoth +except ImportError: + # Preserve the error and stack trace for later + _dependency_exc_info = sys.exc_info() + +ACCEPTED_MIME_TYPE_PREFIXES = [ + "text/", + "application/json", +] # Mimetypes to ignore (commonly confused extensions) -IGNORE_MIMETYPES = [ +IGNORE_MIME_TYPE_PREFIXES = [ "text/vnd.in3d.spot", # .spo wich is confused with xls, doc, etc. "text/vnd.graphviz", # .dot which is confused with xls, doc, etc. ] @@ -21,26 +34,34 @@ class PlainTextConverter(DocumentConverter): ): super().__init__(priority=priority) + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + for prefix in IGNORE_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return False + + for prefix in ACCEPTED_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + return False + def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - # Guess the content type from any file extension that might be around - content_type, _ = mimetypes.guess_type( - "__placeholder" + kwargs.get("file_extension", "") - ) + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: + if stream_info.charset: + text_content = file_stream.read().decode(stream_info.charset) + else: + text_content = str(from_bytes(file_stream.read()).best()) - # Ignore common false positives - if content_type in IGNORE_MIMETYPES: - content_type = None - - # Only accept text files - if content_type is None: - return None - elif all( - not content_type.lower().startswith(type_prefix) - for type_prefix in ["text/", "application/json"] - ): - return None - - text_content = str(from_path(local_path).best()) return DocumentConverterResult(markdown=text_content) diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py index e250848..2cbd158 100644 --- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py @@ -1,11 +1,14 @@ import sys import base64 +import os +import io import re import html from typing import BinaryIO, Any from ._html_converter import HtmlConverter +from ._llm_caption import llm_caption from .._base_converter import DocumentConverter, DocumentConverterResult from .._stream_info import StreamInfo from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE @@ -38,35 +41,6 @@ class PptxConverter(DocumentConverter): super().__init__(priority=priority) self._html_converter = HtmlConverter() - def _get_llm_description( - self, llm_client, llm_model, image_blob, content_type, prompt=None - ): - if prompt is None or prompt.strip() == "": - prompt = "Write a detailed alt text for this image with less than 50 words." - - image_base64 = base64.b64encode(image_blob).decode("utf-8") - data_uri = f"data:{content_type};base64,{image_base64}" - - messages = [ - { - "role": "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": data_uri, - }, - }, - {"type": "text", "text": prompt}, - ], - } - ] - - response = llm_client.chat.completions.create( - model=llm_model, messages=messages - ) - return response.choices[0].message.content - def accepts( self, file_stream: BinaryIO, @@ -120,41 +94,54 @@ class PptxConverter(DocumentConverter): if self._is_picture(shape): # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069 - llm_description = None - alt_text = None + llm_description = "" + alt_text = "" + # Potentially generate a description using an LLM llm_client = kwargs.get("llm_client") llm_model = kwargs.get("llm_model") if llm_client is not None and llm_model is not None: + # Prepare a file_stream and stream_info for the image data + image_filename = shape.image.filename + image_extension = None + if image_filename: + image_extension = os.path.splitext(image_filename)[1] + image_stream_info = StreamInfo( + mimetype=shape.image.content_type, + extension=image_extension, + filename=image_filename, + ) + + image_stream = io.BytesIO(shape.image.blob) + + # Caption the image try: - llm_description = self._get_llm_description( - llm_client, - llm_model, - shape.image.blob, - shape.image.content_type, + llm_description = llm_caption( + image_stream, + image_stream_info, + client=llm_client, + model=llm_model, + prompt=kwargs.get("llm_prompt"), ) except Exception: - # Unable to describe with LLM + # Unable to generate a description pass - if not llm_description: - try: - alt_text = shape._element._nvXxPr.cNvPr.attrib.get( - "descr", "" - ) - except Exception: - # Unable to get alt text - pass + # Also grab any description embedded in the deck + try: + alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "") + except Exception: + # Unable to get alt text + pass + + # Prepare the alt, escaping any special characters + alt_text = "\n".join([llm_description, alt_text]) or shape.name + alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text) + alt_text = re.sub(r"\s+", " ", alt_text).strip() # A placeholder name filename = re.sub(r"\W", "", shape.name) + ".jpg" - md_content += ( - "\n\n" - ) + md_content += "\n\n" # Tables if self._is_table(shape): diff --git a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py index 0eedaec..c9176f6 100644 --- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py +++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py @@ -36,7 +36,7 @@ class WikipediaConverter(DocumentConverter): Make sure we're dealing with HTML content *from* Wikipedia. """ - url = (stream_info.url or "").lower() + url = stream_info.url or "" mimetype = (stream_info.mimetype or "").lower() extension = (stream_info.extension or "").lower() diff --git a/packages/markitdown/src/markitdown/converters/_youtube_converter.py b/packages/markitdown/src/markitdown/converters/_youtube_converter.py index 975d668..2efc6ea 100644 --- a/packages/markitdown/src/markitdown/converters/_youtube_converter.py +++ b/packages/markitdown/src/markitdown/converters/_youtube_converter.py @@ -1,14 +1,15 @@ -import re +import sys import json -import urllib.parse import time - -from typing import Any, Union, Dict, List -from urllib.parse import parse_qs, urlparse +import io +import re +from typing import Any, BinaryIO, Optional, Dict, List, Union +from urllib.parse import parse_qs, urlparse, unquote from bs4 import BeautifulSoup from .._base_converter import DocumentConverter, DocumentConverterResult - +from .._stream_info import StreamInfo +from ._markdownify import _CustomMarkdownify # Optional YouTube transcription support try: @@ -19,6 +20,17 @@ except ModuleNotFoundError: IS_YOUTUBE_TRANSCRIPT_CAPABLE = False +ACCEPTED_MIME_TYPE_PREFIXES = [ + "text/html", + "application/xhtml", +] + +ACCEPTED_FILE_EXTENSIONS = [ + ".html", + ".htm", +] + + class YouTubeConverter(DocumentConverter): """Handle YouTube specially, focusing on the video title, description, and transcript.""" @@ -27,45 +39,45 @@ class YouTubeConverter(DocumentConverter): ): super().__init__(priority=priority) - def retry_operation(self, operation, retries=3, delay=2): - """Retries the operation if it fails.""" - attempt = 0 - while attempt < retries: - try: - return operation() # Attempt the operation - except Exception as e: - print(f"Attempt {attempt + 1} failed: {e}") - if attempt < retries - 1: - time.sleep(delay) # Wait before retrying - attempt += 1 - # If all attempts fail, raise the last exception - raise Exception(f"Operation failed after {retries} attempts.") + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + """ + Make sure we're dealing with HTML content *from* YouTube. + """ + url = stream_info.url or "" + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() - def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - # Bail if not YouTube - extension = kwargs.get("file_extension", "") - if extension.lower() not in [".html", ".htm"]: - return None - url = kwargs.get("url", "") - - url = urllib.parse.unquote(url) + url = unquote(url) url = url.replace(r"\?", "?").replace(r"\=", "=") if not url.startswith("https://www.youtube.com/watch?"): - return None + # Not a YouTube URL + return False - # Parse the file with error handling - try: - with open(local_path, "rt", encoding="utf-8") as fh: - soup = BeautifulSoup(fh.read(), "html.parser") - except Exception as e: - print(f"Error reading YouTube page: {e}") - return None + if extension in ACCEPTED_FILE_EXTENSIONS: + return True - if not soup.title or not soup.title.string: - return None + for prefix in ACCEPTED_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + # Not HTML content + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: + # Parse the stream + encoding = "utf-8" if stream_info.charset is None else stream_info.charset + soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding) # Read the meta tags metadata: Dict[str, str] = {"title": soup.title.string} @@ -126,7 +138,7 @@ class YouTubeConverter(DocumentConverter): if IS_YOUTUBE_TRANSCRIPT_CAPABLE: transcript_text = "" - parsed_url = urlparse(url) # type: ignore + parsed_url = urlparse(stream_info.url) # type: ignore params = parse_qs(parsed_url.query) # type: ignore if "v" in params and params["v"][0]: video_id = str(params["v"][0]) @@ -135,7 +147,7 @@ class YouTubeConverter(DocumentConverter): "youtube_transcript_languages", ("en",) ) # Retry the transcript fetching operation - transcript = self.retry_operation( + transcript = self._retry_operation( lambda: YouTubeTranscriptApi.get_transcript( video_id, languages=youtube_transcript_languages ), @@ -188,3 +200,17 @@ class YouTubeConverter(DocumentConverter): if result := self._findKey(v, key): return result return None + + def _retry_operation(self, operation, retries=3, delay=2): + """Retries the operation if it fails.""" + attempt = 0 + while attempt < retries: + try: + return operation() # Attempt the operation + except Exception as e: + print(f"Attempt {attempt + 1} failed: {e}") + if attempt < retries - 1: + time.sleep(delay) # Wait before retrying + attempt += 1 + # If all attempts fail, raise the last exception + raise Exception(f"Operation failed after {retries} attempts.") diff --git a/packages/markitdown/src/markitdown/converters/_zip_converter.py b/packages/markitdown/src/markitdown/converters/_zip_converter.py index 7c8c6db..3717fb2 100644 --- a/packages/markitdown/src/markitdown/converters/_zip_converter.py +++ b/packages/markitdown/src/markitdown/converters/_zip_converter.py @@ -1,9 +1,19 @@ -import os +import sys import zipfile -import shutil -from typing import Any, Union +import io +import os + +from typing import BinaryIO, Any from .._base_converter import DocumentConverter, DocumentConverterResult +from .._stream_info import StreamInfo +from .._exceptions import UnsupportedFormatException, FileConversionException + +ACCEPTED_MIME_TYPE_PREFIXES = [ + "application/zip", +] + +ACCEPTED_FILE_EXTENSIONS = [".zip"] class ZipConverter(DocumentConverter): @@ -46,95 +56,59 @@ class ZipConverter(DocumentConverter): """ def __init__( - self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT + self, + priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT, + *, + markitdown: Any, ): super().__init__(priority=priority) + self._markitdown = markitdown + + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + if extension in ACCEPTED_FILE_EXTENSIONS: + return True + + for prefix in ACCEPTED_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + return False def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - # Bail if not a ZIP - extension = kwargs.get("file_extension", "") - if extension.lower() != ".zip": - return None + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, # Options to pass to the converter + ) -> DocumentConverterResult: + file_path = stream_info.url or stream_info.local_path or stream_info.file_name + md_content = f"Content from the zip file `{file_path}`:\n\n" - # Get parent converters list if available - parent_converters = kwargs.get("_parent_converters", []) - if not parent_converters: - return DocumentConverterResult( - markdown=f"[ERROR] No converters available to process zip contents from: {local_path}", - ) + with zipfile.ZipFile(file_stream, "r") as zipObj: + for name in zipObj.namelist(): + try: + z_file_stream = io.BytesIO(zipObj.read(name)) + z_file_stream_info = StreamInfo( + extension=os.path.splitext(name)[1], + filename=os.path.basename(name), + ) + result = self._markitdown.convert_stream( + stream=z_file_stream, + stream_info=z_file_stream_info, + ) + if result is not None: + md_content += f"## File: {name}\n\n" + md_content += result.markdown + "\n\n" + except UnsupportedFormatException: + pass + except FileConversionException: + pass - extracted_zip_folder_name = ( - f"extracted_{os.path.basename(local_path).replace('.zip', '_zip')}" - ) - extraction_dir = os.path.normpath( - os.path.join(os.path.dirname(local_path), extracted_zip_folder_name) - ) - md_content = f"Content from the zip file `{os.path.basename(local_path)}`:\n\n" - - try: - # Extract the zip file safely - with zipfile.ZipFile(local_path, "r") as zipObj: - # Bail if we discover it's an Office OOXML file - if "[Content_Types].xml" in zipObj.namelist(): - return None - - # Safeguard against path traversal - for member in zipObj.namelist(): - member_path = os.path.normpath(os.path.join(extraction_dir, member)) - if ( - not os.path.commonprefix([extraction_dir, member_path]) - == extraction_dir - ): - raise ValueError( - f"Path traversal detected in zip file: {member}" - ) - - # Extract all files safely - zipObj.extractall(path=extraction_dir) - - # Process each extracted file - for root, dirs, files in os.walk(extraction_dir): - for name in files: - file_path = os.path.join(root, name) - relative_path = os.path.relpath(file_path, extraction_dir) - - # Get file extension - _, file_extension = os.path.splitext(name) - - # Update kwargs for the file - file_kwargs = kwargs.copy() - file_kwargs["file_extension"] = file_extension - file_kwargs["_parent_converters"] = parent_converters - - # Try converting the file using available converters - for converter in parent_converters: - # Skip the zip converter to avoid infinite recursion - if isinstance(converter, ZipConverter): - continue - - result = converter.convert(file_path, **file_kwargs) - if result is not None: - md_content += f"\n## File: {relative_path}\n\n" - md_content += result.markdown + "\n\n" - break - - # Clean up extracted files if specified - if kwargs.get("cleanup_extracted", True): - shutil.rmtree(extraction_dir) - - return DocumentConverterResult(markdown=md_content.strip()) - - except zipfile.BadZipFile: - return DocumentConverterResult( - markdown=f"[ERROR] Invalid or corrupted zip file: {local_path}", - ) - except ValueError as ve: - return DocumentConverterResult( - markdown=f"[ERROR] Security error in zip file {local_path}: {str(ve)}", - ) - except Exception as e: - return DocumentConverterResult( - markdown=f"[ERROR] Failed to process zip file {local_path}: {str(e)}", - ) + return DocumentConverterResult(markdown=md_content.strip()) diff --git a/packages/markitdown/tests/test_files/test.pdf b/packages/markitdown/tests/test_files/test.pdf new file mode 100644 index 0000000..e82861e Binary files /dev/null and b/packages/markitdown/tests/test_files/test.pdf differ diff --git a/packages/markitdown/tests/test_files/test.pptx b/packages/markitdown/tests/test_files/test.pptx index e6d16f3..fb66302 100644 Binary files a/packages/markitdown/tests/test_files/test.pptx and b/packages/markitdown/tests/test_files/test.pptx differ diff --git a/packages/markitdown/tests/test_markitdown.py b/packages/markitdown/tests/test_markitdown.py index 61c9ff7..b44ba00 100644 --- a/packages/markitdown/tests/test_markitdown.py +++ b/packages/markitdown/tests/test_markitdown.py @@ -2,6 +2,7 @@ import io import os import shutil +import openai import pytest import requests @@ -289,7 +290,6 @@ def test_markitdown_remote() -> None: assert test_string in result.text_content # Youtube - # TODO: This test randomly fails for some reason. Haven't been able to repro it yet. Disabling until I can debug the issue result = markitdown.convert(YOUTUBE_TEST_URL) for test_string in YOUTUBE_TEST_STRINGS: assert test_string in result.text_content @@ -298,6 +298,10 @@ def test_markitdown_remote() -> None: def test_markitdown_local() -> None: markitdown = MarkItDown() + # Test PDF processing + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pdf")) + validate_strings(result, PDF_TEST_STRINGS) + # Test XLSX processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx")) validate_strings(result, XLSX_TEST_STRINGS) @@ -336,10 +340,6 @@ def test_markitdown_local() -> None: ) validate_strings(result, BLOG_TEST_STRINGS) - # Test ZIP file processing - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip")) - validate_strings(result, XLSX_TEST_STRINGS) - # Test Wikipedia processing result = markitdown.convert( os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL @@ -360,18 +360,24 @@ def test_markitdown_local() -> None: for test_string in RSS_TEST_STRINGS: assert test_string in text_content - ## Test non-UTF-8 encoding - result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv")) - validate_strings(result, CSV_CP932_TEST_STRINGS) - # Test MSG (Outlook email) processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg")) validate_strings(result, MSG_TEST_STRINGS) + # Test non-UTF-8 encoding + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv")) + validate_strings(result, CSV_CP932_TEST_STRINGS) + # Test JSON processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.json")) validate_strings(result, JSON_TEST_STRINGS) + # # Test ZIP file processing + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip")) + validate_strings(result, DOCX_TEST_STRINGS) + validate_strings(result, XLSX_TEST_STRINGS) + validate_strings(result, BLOG_TEST_STRINGS) + # Test input from a stream input_data = b"