From e43632b048fba06c2b6672089b9aa1ac86db0848 Mon Sep 17 00:00:00 2001 From: Adam Fourney Date: Mon, 3 Mar 2025 13:16:15 -0800 Subject: [PATCH] Initial work updating signatures. --- .../markitdown/src/markitdown/__init__.py | 4 +- .../src/markitdown/_base_converter.py | 116 ++++++++++++++++++ .../markitdown/src/markitdown/_markitdown.py | 3 +- .../src/markitdown/converters/__init__.py | 3 +- .../src/markitdown/converters/_base.py | 9 +- .../converters/_bing_serp_converter.py | 5 +- .../converters/_doc_intel_converter.py | 8 +- .../markitdown/converters/_docx_converter.py | 5 +- .../markitdown/converters/_html_converter.py | 5 +- .../markitdown/converters/_image_converter.py | 6 +- .../markitdown/converters/_ipynb_converter.py | 8 +- .../markitdown/converters/_mp3_converter.py | 8 +- .../converters/_outlook_msg_converter.py | 6 +- .../markitdown/converters/_pdf_converter.py | 6 +- .../converters/_plain_text_converter.py | 8 +- .../markitdown/converters/_pptx_converter.py | 8 +- .../markitdown/converters/_rss_converter.py | 7 +- .../markitdown/converters/_wav_converter.py | 8 +- .../converters/_wikipedia_converter.py | 5 +- .../markitdown/converters/_xlsx_converter.py | 13 +- .../converters/_youtube_converter.py | 5 +- .../markitdown/converters/_zip_converter.py | 19 ++- 22 files changed, 180 insertions(+), 85 deletions(-) create mode 100644 packages/markitdown/src/markitdown/_base_converter.py diff --git a/packages/markitdown/src/markitdown/__init__.py b/packages/markitdown/src/markitdown/__init__.py index 9f7db16..620e2b0 100644 --- a/packages/markitdown/src/markitdown/__init__.py +++ b/packages/markitdown/src/markitdown/__init__.py @@ -4,6 +4,7 @@ from .__about__ import __version__ from ._markitdown import MarkItDown +from ._base_converter import DocumentConverterResult, BaseDocumentConverter from ._exceptions import ( MarkItDownException, MissingDependencyException, @@ -11,12 +12,13 @@ from ._exceptions import ( FileConversionException, 
UnsupportedFormatException, ) -from .converters import DocumentConverter, DocumentConverterResult +from .converters import DocumentConverter __all__ = [ "__version__", "MarkItDown", "DocumentConverter", + "BaseDocumentConverter", "DocumentConverterResult", "MarkItDownException", "MissingDependencyException", diff --git a/packages/markitdown/src/markitdown/_base_converter.py b/packages/markitdown/src/markitdown/_base_converter.py new file mode 100644 index 0000000..470ff74 --- /dev/null +++ b/packages/markitdown/src/markitdown/_base_converter.py @@ -0,0 +1,116 @@ +from typing import Any, Union, BinaryIO, Optional + + +class DocumentConverterResult: + """The result of converting a document to Markdown.""" + + def __init__( + self, + markdown: str, + *, + title: Optional[str] = None, + ): + """ + Initialize the DocumentConverterResult. + + Parameters: + - markdown: The converted Markdown text. + - title: Optional title of the document. + """ + self.markdown = markdown + self.title = title + + @property + def text_content(self) -> str: + """Soft-deprecated alias for `markdown`. New code should migrate to using `markdown` or __str__.""" + return self.markdown + + @text_content.setter + def text_content(self, markdown: str): + """Soft-deprecated alias for `markdown`. New code should migrate to using `markdown` or __str__.""" + self.markdown = markdown + + def __str__(self) -> str: + """Return the Markdown content.""" + return self.markdown + + +class BaseDocumentConverter: + """Abstract superclass of all DocumentConverters.""" + + # Lower priority values are tried first. + PRIORITY_SPECIFIC_FILE_FORMAT = ( + 0.0 # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia + ) + PRIORITY_GENERIC_FILE_FORMAT = ( + 10.0 # Near catch-all converters for mimetypes like text/*, etc. + ) + + def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT): + """ + Initialize the DocumentConverter with a given priority. 
+ + Priorities work as follows: By default, most converters get priority + DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception + is the PlainTextConverter, which gets priority PRIORITY_GENERIC_FILE_FORMAT (== 10), + with lower values being tried first (i.e., higher priority). + + Just prior to conversion, the converters are sorted by priority, using + a stable sort. This means that converters with the same priority will + remain in the same order, with the most recently registered converters + appearing first. + + We have tight control over the order of built-in converters, but + plugins can register converters in any order. A converter's priority + field reasserts some control over the order of converters. + + Plugins can register converters with any priority, to appear before or + after the built-ins. For example, a plugin with priority 9 will run + before the PlainTextConverter, but after the built-in converters. + """ + self._priority = priority + + def convert( + self, + file_stream, + *, + mime_type: str = "application/octet-stream", + file_extension: Optional[str] = None, + charset: Optional[str] = None, + **kwargs: Any, + ) -> Union[None, DocumentConverterResult]: + """ + Convert a document to Markdown text, or return None if the converter + cannot handle the document (causing the next converter to be tried). + + The determination of whether a converter can handle a document is primarily based on + the provided MIME type. The file extension can serve as a secondary check if the + MIME type is not sufficiently specific (e.g., application/octet-stream). Finally, the + charset is used to determine the encoding of the file content in cases of text/*. + + Parameters: + - file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods. + - mime_type: The MIME type of the file. Default is "application/octet-stream". + - file_extension: The file extension of the file. Default is None. 
+ - charset: The character set of the file. Default is None. + - kwargs: Additional keyword arguments for the converter. + + Returns: + - DocumentConverterResult: The result of the conversion, which includes the title and markdown content. + or + - None: If the converter cannot handle the document. + + Raises: + - FileConversionException: If the mimetype is recognized, but the conversion fails for some other reason. + - MissingDependencyException: If the converter requires a dependency that is not installed. + """ + raise NotImplementedError("Subclasses must implement this method") + + @property + def priority(self) -> float: + """Priority of the converter in markitdown's converter list. Lower priority values are tried first.""" + return self._priority + + @priority.setter + def priority(self, value: float): + self._priority = value diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index 8f1bd46..50b64b4 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -18,7 +18,6 @@ import requests from .converters import ( DocumentConverter, - DocumentConverterResult, PlainTextConverter, HtmlConverter, RssConverter, @@ -39,6 +38,8 @@ from .converters import ( DocumentIntelligenceConverter, ) +from ._base_converter import DocumentConverterResult + from ._exceptions import ( FileConversionException, UnsupportedFormatException, diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py index 1e5afe4..996b78b 100644 --- a/packages/markitdown/src/markitdown/converters/__init__.py +++ b/packages/markitdown/src/markitdown/converters/__init__.py @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: MIT -from ._base import DocumentConverter, DocumentConverterResult +from ._base import DocumentConverter from ._plain_text_converter import PlainTextConverter from ._html_converter import 
HtmlConverter from ._rss_converter import RssConverter @@ -23,7 +23,6 @@ from ._doc_intel_converter import DocumentIntelligenceConverter __all__ = [ "DocumentConverter", - "DocumentConverterResult", "PlainTextConverter", "HtmlConverter", "RssConverter", diff --git a/packages/markitdown/src/markitdown/converters/_base.py b/packages/markitdown/src/markitdown/converters/_base.py index 0f351fc..e1a544a 100644 --- a/packages/markitdown/src/markitdown/converters/_base.py +++ b/packages/markitdown/src/markitdown/converters/_base.py @@ -1,12 +1,5 @@ from typing import Any, Union - - -class DocumentConverterResult: - """The result of converting a document to text.""" - - def __init__(self, title: Union[str, None] = None, text_content: str = ""): - self.title: Union[str, None] = title - self.text_content: str = text_content +from .._base_converter import DocumentConverterResult class DocumentConverter: diff --git a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py index d1b11a6..bdb15bf 100644 --- a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py +++ b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py @@ -6,7 +6,8 @@ from typing import Union from urllib.parse import parse_qs, urlparse from bs4 import BeautifulSoup -from ._base import DocumentConverter, DocumentConverterResult +from ._base import DocumentConverter +from .._base_converter import DocumentConverterResult from ._markdownify import _CustomMarkdownify @@ -81,6 +82,6 @@ class BingSerpConverter(DocumentConverter): ) return DocumentConverterResult( + markdown=webpage_text, title=None if soup.title is None else soup.title.string, - text_content=webpage_text, ) diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py index 6fe79c0..1ad8981 100644 --- 
a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py +++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py @@ -2,7 +2,8 @@ from typing import Any, Union import re import sys -from ._base import DocumentConverter, DocumentConverterResult +from ._base import DocumentConverter +from .._base_converter import DocumentConverterResult from .._exceptions import MissingDependencyException # Try loading optional (but in this case, required) dependencies @@ -103,7 +104,4 @@ class DocumentIntelligenceConverter(DocumentConverter): # remove comments from the markdown content generated by Doc Intelligence and append to markdown string markdown_text = re.sub(r"", "", result.content, flags=re.DOTALL) - return DocumentConverterResult( - title=None, - text_content=markdown_text, - ) + return DocumentConverterResult(markdown=markdown_text) diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py index 0866e59..ea2550b 100644 --- a/packages/markitdown/src/markitdown/converters/_docx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py @@ -2,11 +2,8 @@ import sys from typing import Union -from ._base import ( - DocumentConverterResult, -) - from ._base import DocumentConverter +from .._base_converter import DocumentConverterResult from ._html_converter import HtmlConverter from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE diff --git a/packages/markitdown/src/markitdown/converters/_html_converter.py b/packages/markitdown/src/markitdown/converters/_html_converter.py index 68c2536..64efb9a 100644 --- a/packages/markitdown/src/markitdown/converters/_html_converter.py +++ b/packages/markitdown/src/markitdown/converters/_html_converter.py @@ -1,7 +1,8 @@ from typing import Any, Union from bs4 import BeautifulSoup -from ._base import DocumentConverter, DocumentConverterResult +from ._base import 
DocumentConverter +from .._base_converter import DocumentConverterResult from ._markdownify import _CustomMarkdownify @@ -51,6 +52,6 @@ class HtmlConverter(DocumentConverter): webpage_text = webpage_text.strip() return DocumentConverterResult( + markdown=webpage_text, title=None if soup.title is None else soup.title.string, - text_content=webpage_text, ) diff --git a/packages/markitdown/src/markitdown/converters/_image_converter.py b/packages/markitdown/src/markitdown/converters/_image_converter.py index 4eb6155..5923103 100644 --- a/packages/markitdown/src/markitdown/converters/_image_converter.py +++ b/packages/markitdown/src/markitdown/converters/_image_converter.py @@ -1,5 +1,6 @@ from typing import Union -from ._base import DocumentConverter, DocumentConverterResult +from ._base import DocumentConverter +from .._base_converter import DocumentConverterResult from ._media_converter import MediaConverter import base64 import mimetypes @@ -59,8 +60,7 @@ class ImageConverter(MediaConverter): ) return DocumentConverterResult( - title=None, - text_content=md_content, + markdown=md_content, ) def _get_llm_description(self, local_path, extension, client, model, prompt=None): diff --git a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py index b487f41..cc40d4e 100644 --- a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py +++ b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py @@ -1,10 +1,8 @@ import json from typing import Any, Union -from ._base import ( - DocumentConverter, - DocumentConverterResult, -) +from ._base import DocumentConverter +from .._base_converter import DocumentConverterResult from .._exceptions import FileConversionException @@ -65,8 +63,8 @@ class IpynbConverter(DocumentConverter): title = notebook_content.get("metadata", {}).get("title", title) return DocumentConverterResult( + markdown=md_text, title=title, - text_content=md_text, 
) except Exception as e: diff --git a/packages/markitdown/src/markitdown/converters/_mp3_converter.py b/packages/markitdown/src/markitdown/converters/_mp3_converter.py index 91fd270..a2276b6 100644 --- a/packages/markitdown/src/markitdown/converters/_mp3_converter.py +++ b/packages/markitdown/src/markitdown/converters/_mp3_converter.py @@ -1,6 +1,7 @@ import tempfile from typing import Union -from ._base import DocumentConverter, DocumentConverterResult +from ._base import DocumentConverter +from .._base_converter import DocumentConverterResult from ._wav_converter import WavConverter from warnings import resetwarnings, catch_warnings @@ -83,7 +84,4 @@ class Mp3Converter(WavConverter): os.unlink(temp_path) # Return the result - return DocumentConverterResult( - title=None, - text_content=md_content.strip(), - ) + return DocumentConverterResult(markdown=md_content.strip()) diff --git a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py index eb7a065..4abc860 100644 --- a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py +++ b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py @@ -1,6 +1,7 @@ import sys from typing import Any, Union -from ._base import DocumentConverter, DocumentConverterResult +from ._base import DocumentConverter +from .._base_converter import DocumentConverterResult from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE # Try loading optional (but in this case, required) dependencies @@ -73,7 +74,8 @@ class OutlookMsgConverter(DocumentConverter): msg.close() return DocumentConverterResult( - title=headers.get("Subject"), text_content=md_content.strip() + markdown=md_content.strip(), + title=headers.get("Subject"), ) except Exception as e: diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py index 
3c5ecad..2767954 100644 --- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py @@ -1,6 +1,7 @@ import sys from typing import Union -from ._base import DocumentConverter, DocumentConverterResult +from ._base import DocumentConverter +from .._base_converter import DocumentConverterResult from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE # Try loading optional (but in this case, required) dependencies @@ -43,6 +44,5 @@ class PdfConverter(DocumentConverter): ) # Restore the original traceback return DocumentConverterResult( - title=None, - text_content=pdfminer.high_level.extract_text(local_path), + markdown=pdfminer.high_level.extract_text(local_path) ) diff --git a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py index b4c9282..5905851 100644 --- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py +++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py @@ -3,7 +3,8 @@ import mimetypes from charset_normalizer import from_path from typing import Any, Union -from ._base import DocumentConverter, DocumentConverterResult +from ._base import DocumentConverter +from .._base_converter import DocumentConverterResult # Mimetypes to ignore (commonly confused extensions) @@ -43,7 +44,4 @@ class PlainTextConverter(DocumentConverter): return None text_content = str(from_path(local_path).best()) - return DocumentConverterResult( - title=None, - text_content=text_content, - ) + return DocumentConverterResult(markdown=text_content) diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py index 431b6a0..99e4337 100644 --- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py +++ 
b/packages/markitdown/src/markitdown/converters/_pptx_converter.py @@ -5,7 +5,8 @@ import sys from typing import Union -from ._base import DocumentConverterResult, DocumentConverter +from ._base import DocumentConverter +from .._base_converter import DocumentConverterResult from ._html_converter import HtmlConverter from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE @@ -174,10 +175,7 @@ class PptxConverter(HtmlConverter): md_content += notes_frame.text md_content = md_content.strip() - return DocumentConverterResult( - title=None, - text_content=md_content.strip(), - ) + return DocumentConverterResult(markdown=md_content.strip()) def _is_picture(self, shape): if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE: diff --git a/packages/markitdown/src/markitdown/converters/_rss_converter.py b/packages/markitdown/src/markitdown/converters/_rss_converter.py index b279c85..2471799 100644 --- a/packages/markitdown/src/markitdown/converters/_rss_converter.py +++ b/packages/markitdown/src/markitdown/converters/_rss_converter.py @@ -3,7 +3,8 @@ from typing import Union from bs4 import BeautifulSoup from ._markdownify import _CustomMarkdownify -from ._base import DocumentConverter, DocumentConverterResult +from ._base import DocumentConverter +from .._base_converter import DocumentConverterResult class RssConverter(DocumentConverter): @@ -73,8 +74,8 @@ class RssConverter(DocumentConverter): md_text += self._parse_content(entry_content) return DocumentConverterResult( + markdown=md_text, title=title, - text_content=md_text, ) except BaseException as _: return None @@ -117,8 +118,8 @@ class RssConverter(DocumentConverter): md_text += self._parse_content(content) return DocumentConverterResult( + markdown=md_text, title=channel_title, - text_content=md_text, ) except BaseException as _: print(traceback.format_exc()) diff --git a/packages/markitdown/src/markitdown/converters/_wav_converter.py 
b/packages/markitdown/src/markitdown/converters/_wav_converter.py index 3c8d842..4278f6f 100644 --- a/packages/markitdown/src/markitdown/converters/_wav_converter.py +++ b/packages/markitdown/src/markitdown/converters/_wav_converter.py @@ -1,5 +1,6 @@ from typing import Union -from ._base import DocumentConverter, DocumentConverterResult +from ._base import DocumentConverter +from .._base_converter import DocumentConverterResult from ._media_converter import MediaConverter # Optional Transcription support @@ -60,10 +61,7 @@ class WavConverter(MediaConverter): "\n\n### Audio Transcript:\nError. Could not transcribe this audio." ) - return DocumentConverterResult( - title=None, - text_content=md_content.strip(), - ) + return DocumentConverterResult(markdown=md_content.strip()) def _transcribe_audio(self, local_path) -> str: recognizer = sr.Recognizer() diff --git a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py index f27fe23..b4665c0 100644 --- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py +++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py @@ -3,7 +3,8 @@ import re from typing import Any, Union from bs4 import BeautifulSoup -from ._base import DocumentConverter, DocumentConverterResult +from ._base import DocumentConverter +from .._base_converter import DocumentConverterResult from ._markdownify import _CustomMarkdownify @@ -56,6 +57,6 @@ class WikipediaConverter(DocumentConverter): webpage_text = _CustomMarkdownify().convert_soup(soup) return DocumentConverterResult( + markdown=webpage_text, title=main_title, - text_content=webpage_text, ) diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py index 56398ca..7257768 100644 --- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py +++ 
b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py @@ -2,7 +2,8 @@ import sys from typing import Union -from ._base import DocumentConverter, DocumentConverterResult +from ._base import DocumentConverter +from .._base_converter import DocumentConverterResult from ._html_converter import HtmlConverter from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE @@ -58,10 +59,7 @@ class XlsxConverter(HtmlConverter): html_content = sheets[s].to_html(index=False) md_content += self._convert(html_content).text_content.strip() + "\n\n" - return DocumentConverterResult( - title=None, - text_content=md_content.strip(), - ) + return DocumentConverterResult(markdown=md_content.strip()) class XlsConverter(HtmlConverter): @@ -94,7 +92,4 @@ class XlsConverter(HtmlConverter): html_content = sheets[s].to_html(index=False) md_content += self._convert(html_content).text_content.strip() + "\n\n" - return DocumentConverterResult( - title=None, - text_content=md_content.strip(), - ) + return DocumentConverterResult(markdown=md_content.strip()) diff --git a/packages/markitdown/src/markitdown/converters/_youtube_converter.py b/packages/markitdown/src/markitdown/converters/_youtube_converter.py index e61b208..485b095 100644 --- a/packages/markitdown/src/markitdown/converters/_youtube_converter.py +++ b/packages/markitdown/src/markitdown/converters/_youtube_converter.py @@ -7,7 +7,8 @@ from typing import Any, Union, Dict, List from urllib.parse import parse_qs, urlparse from bs4 import BeautifulSoup -from ._base import DocumentConverter, DocumentConverterResult +from ._base import DocumentConverter +from .._base_converter import DocumentConverterResult # Optional YouTube transcription support @@ -158,8 +159,8 @@ class YouTubeConverter(DocumentConverter): assert isinstance(title, str) return DocumentConverterResult( + markdown=webpage_text, title=title, - text_content=webpage_text, ) def _get( diff --git 
a/packages/markitdown/src/markitdown/converters/_zip_converter.py b/packages/markitdown/src/markitdown/converters/_zip_converter.py index e2b5fe6..d8f2951 100644 --- a/packages/markitdown/src/markitdown/converters/_zip_converter.py +++ b/packages/markitdown/src/markitdown/converters/_zip_converter.py @@ -3,7 +3,8 @@ import zipfile import shutil from typing import Any, Union -from ._base import DocumentConverter, DocumentConverterResult +from ._base import DocumentConverter +from .._base_converter import DocumentConverterResult class ZipConverter(DocumentConverter): @@ -62,8 +63,7 @@ class ZipConverter(DocumentConverter): parent_converters = kwargs.get("_parent_converters", []) if not parent_converters: return DocumentConverterResult( - title=None, - text_content=f"[ERROR] No converters available to process zip contents from: {local_path}", + markdown=f"[ERROR] No converters available to process zip contents from: {local_path}", ) extracted_zip_folder_name = ( @@ -118,27 +118,24 @@ class ZipConverter(DocumentConverter): result = converter.convert(file_path, **file_kwargs) if result is not None: md_content += f"\n## File: {relative_path}\n\n" - md_content += result.text_content + "\n\n" + md_content += result.markdown + "\n\n" break # Clean up extracted files if specified if kwargs.get("cleanup_extracted", True): shutil.rmtree(extraction_dir) - return DocumentConverterResult(title=None, text_content=md_content.strip()) + return DocumentConverterResult(markdown=md_content.strip()) except zipfile.BadZipFile: return DocumentConverterResult( - title=None, - text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}", + markdown=f"[ERROR] Invalid or corrupted zip file: {local_path}", ) except ValueError as ve: return DocumentConverterResult( - title=None, - text_content=f"[ERROR] Security error in zip file {local_path}: {str(ve)}", + markdown=f"[ERROR] Security error in zip file {local_path}: {str(ve)}", ) except Exception as e: return DocumentConverterResult( - 
title=None, - text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}", + markdown=f"[ERROR] Failed to process zip file {local_path}: {str(e)}", )