Initial work updating signatures.

2025-03-03 13:16:15 -08:00 · 2025-03-03 13:16:15 -08:00 · e43632b048
commit e43632b048
parent 1d2f231146
22 changed files with 180 additions and 85 deletions
--- a/packages/markitdown/src/markitdown/init.py
+++ b/packages/markitdown/src/markitdown/init.py
@ -4,6 +4,7 @@
 from .__about__ import __version__
 from ._markitdown import MarkItDown
 from ._base_converter import DocumentConverterResult, BaseDocumentConverter
 from ._exceptions import (
    MarkItDownException,
    MissingDependencyException,
@ -11,12 +12,13 @@ from ._exceptions import (
    FileConversionException,
    UnsupportedFormatException,
 )
-from .converters import DocumentConverter, DocumentConverterResult
+from .converters import DocumentConverter
 __all__ = [
    "__version__",
    "MarkItDown",
    "DocumentConverter",
    "BaseDocumentConverter",
    "DocumentConverterResult",
    "MarkItDownException",
    "MissingDependencyException",
--- a/packages/markitdown/src/markitdown/_base_converter.py
+++ b/packages/markitdown/src/markitdown/_base_converter.py
@ -0,0 +1,116 @@
 from typing import Any, Union, BinaryIO, Optional
 class DocumentConverterResult:
    """Container for the Markdown produced by a document conversion."""
    def __init__(
        self,
        markdown: str,
        *,
        title: Union[str, None] = None,
    ):
        """
        Build a result around the converted Markdown.
        Parameters:
        - markdown: The Markdown text produced by the conversion.
        - title: The document title, if any (keyword-only; defaults to None).
        """
        self.title = title
        self.markdown = markdown
    def __str__(self) -> str:
        """Render the result as its Markdown text."""
        return self.markdown
    @property
    def text_content(self) -> str:
        """Legacy (soft-deprecated) name for `markdown`; prefer `markdown` or str(result) in new code."""
        return self.markdown
    @text_content.setter
    def text_content(self, markdown: str):
        """Legacy (soft-deprecated) name for `markdown`; assigning here overwrites `markdown`."""
        self.markdown = markdown
 class BaseDocumentConverter:
    """Abstract superclass of all DocumentConverters."""
    # Lower priority values are tried first.
    PRIORITY_SPECIFIC_FILE_FORMAT = (
        0.0  # e.g., .docx, .pdf, .xlsx, or specific pages, e.g., wikipedia
    )
    PRIORITY_GENERIC_FILE_FORMAT = (
        10.0  # Near catch-all converters for mimetypes like text/*, etc.
    )
    def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT):
        """
        Initialize the converter with a given priority.
        Priorities work as follows: By default, most converters get priority
        PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception is the
        PlainTextConverter, which gets priority PRIORITY_GENERIC_FILE_FORMAT (== 10),
        with lower values being tried first (i.e., higher priority).
        Just prior to conversion, the converters are sorted by priority, using
        a stable sort. This means that converters with the same priority will
        remain in the same order, with the most recently registered converters
        appearing first.
        We have tight control over the order of built-in converters, but
        plugins can register converters in any order. A converter's priority
        field reasserts some control over the order of converters.
        Plugins can register converters with any priority, to appear before or
        after the built-ins. For example, a plugin with priority 9 will run
        before the PlainTextConverter, but after the built-in converters.
        Parameters:
        - priority: Sort key for this converter; lower values are tried first.
          Defaults to PRIORITY_SPECIFIC_FILE_FORMAT.
        """
        self._priority = priority
    def convert(
        self,
        file_stream: BinaryIO,
        *,
        mime_type: str = "application/octet-stream",
        file_extension: Optional[str] = None,
        charset: Optional[str] = None,
        **kwargs: Any,
    ) -> "Union[None, DocumentConverterResult]":
        """
        Convert a document to Markdown text, or return None if the converter
        cannot handle the document (causing the next converter to be tried).
        The determination of whether a converter can handle a document is primarily based on
        the provided MIME type. The file extension can serve as a secondary check if the
        MIME type is not sufficiently specific (e.g., application/octet-stream). Finally, the
        charset is used to determine the encoding of the file content in cases of text/*
        Parameters:
        - file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.
        - mime_type: The MIME type of the file. Default is "application/octet-stream".
        - file_extension: The file extension of the file. Default is None.
        - charset: The character set of the file. Default is None.
        - kwargs: Additional keyword arguments for the converter.
        Returns:
        - DocumentConverterResult: The result of the conversion, which includes the title and markdown content.
        or
        - None: If the converter cannot handle the document.
        Raises:
        - FileConversionException: If the mimetype is recognized, but the conversion fails for some other reason.
        - MissingDependencyException: If the converter requires a dependency that is not installed.
        - NotImplementedError: Always, in this base class; subclasses must override this method.
        """
        raise NotImplementedError("Subclasses must implement this method")
    @property
    def priority(self) -> float:
        """Priority of the converter in markitdown's converter list. Lower priority values are tried first."""
        return self._priority
    @priority.setter
    def priority(self, value: float):
        """Replace the converter's priority; lower values are tried first."""
        self._priority = value
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@ -18,7 +18,6 @@ import requests
 from .converters import (
    DocumentConverter,
    DocumentConverterResult,
    PlainTextConverter,
    HtmlConverter,
    RssConverter,
@ -39,6 +38,8 @@ from .converters import (
    DocumentIntelligenceConverter,
 )
 from ._base_converter import DocumentConverterResult
 from ._exceptions import (
    FileConversionException,
    UnsupportedFormatException,
--- a/packages/markitdown/src/markitdown/converters/init.py
+++ b/packages/markitdown/src/markitdown/converters/init.py
@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: MIT
-from ._base import DocumentConverter, DocumentConverterResult
+from ._base import DocumentConverter
 from ._plain_text_converter import PlainTextConverter
 from ._html_converter import HtmlConverter
 from ._rss_converter import RssConverter
@ -23,7 +23,6 @@ from ._doc_intel_converter import DocumentIntelligenceConverter
 __all__ = [
    "DocumentConverter",
    "DocumentConverterResult",
    "PlainTextConverter",
    "HtmlConverter",
    "RssConverter",
--- a/packages/markitdown/src/markitdown/converters/_base.py
+++ b/packages/markitdown/src/markitdown/converters/_base.py
@ -1,12 +1,5 @@
 from typing import Any, Union
-
+from .._base_converter import DocumentConverterResult
 class DocumentConverterResult:
    """The result of converting a document to text."""
    def __init__(self, title: Union[str, None] = None, text_content: str = ""):
        self.title: Union[str, None] = title
        self.text_content: str = text_content
 class DocumentConverter:
--- a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
@ -6,7 +6,8 @@ from typing import Union
 from urllib.parse import parse_qs, urlparse
 from bs4 import BeautifulSoup
-from ._base import DocumentConverter, DocumentConverterResult
+from ._base import DocumentConverter
 from .._base_converter import DocumentConverterResult
 from ._markdownify import _CustomMarkdownify
@ -81,6 +82,6 @@ class BingSerpConverter(DocumentConverter):
        )
        return DocumentConverterResult(
            markdown=webpage_text,
            title=None if soup.title is None else soup.title.string,
            text_content=webpage_text,
        )
--- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
@ -2,7 +2,8 @@ from typing import Any, Union
 import re
 import sys
-from ._base import DocumentConverter, DocumentConverterResult
+from ._base import DocumentConverter
 from .._base_converter import DocumentConverterResult
 from .._exceptions import MissingDependencyException
 # Try loading optional (but in this case, required) dependencies
@ -103,7 +104,4 @@ class DocumentIntelligenceConverter(DocumentConverter):
        # remove comments from the markdown content generated by Doc Intelligence and append to markdown string
        markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL)
-        return DocumentConverterResult(
+        return DocumentConverterResult(markdown=markdown_text)
            title=None,
            text_content=markdown_text,
        )
--- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@ -2,11 +2,8 @@ import sys
 from typing import Union
 from ._base import (
    DocumentConverterResult,
 )
 from ._base import DocumentConverter
 from .._base_converter import DocumentConverterResult
 from ._html_converter import HtmlConverter
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
--- a/packages/markitdown/src/markitdown/converters/_html_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_html_converter.py
@ -1,7 +1,8 @@
 from typing import Any, Union
 from bs4 import BeautifulSoup
-from ._base import DocumentConverter, DocumentConverterResult
+from ._base import DocumentConverter
 from .._base_converter import DocumentConverterResult
 from ._markdownify import _CustomMarkdownify
@ -51,6 +52,6 @@ class HtmlConverter(DocumentConverter):
        webpage_text = webpage_text.strip()
        return DocumentConverterResult(
            markdown=webpage_text,
            title=None if soup.title is None else soup.title.string,
            text_content=webpage_text,
        )
--- a/packages/markitdown/src/markitdown/converters/_image_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_image_converter.py
@ -1,5 +1,6 @@
 from typing import Union
-from ._base import DocumentConverter, DocumentConverterResult
+from ._base import DocumentConverter
 from .._base_converter import DocumentConverterResult
 from ._media_converter import MediaConverter
 import base64
 import mimetypes
@ -59,8 +60,7 @@ class ImageConverter(MediaConverter):
            )
        return DocumentConverterResult(
-            title=None,
+            markdown=md_content,
            text_content=md_content,
        )
    def _get_llm_description(self, local_path, extension, client, model, prompt=None):
--- a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py
@ -1,10 +1,8 @@
 import json
 from typing import Any, Union
-from ._base import (
+from ._base import DocumentConverter
-    DocumentConverter,
+from .._base_converter import DocumentConverterResult
    DocumentConverterResult,
 )
 from .._exceptions import FileConversionException
@ -65,8 +63,8 @@ class IpynbConverter(DocumentConverter):
            title = notebook_content.get("metadata", {}).get("title", title)
            return DocumentConverterResult(
                markdown=md_text,
                title=title,
                text_content=md_text,
            )
        except Exception as e:
--- a/packages/markitdown/src/markitdown/converters/_mp3_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_mp3_converter.py
@ -1,6 +1,7 @@
 import tempfile
 from typing import Union
-from ._base import DocumentConverter, DocumentConverterResult
+from ._base import DocumentConverter
 from .._base_converter import DocumentConverterResult
 from ._wav_converter import WavConverter
 from warnings import resetwarnings, catch_warnings
@ -83,7 +84,4 @@ class Mp3Converter(WavConverter):
                os.unlink(temp_path)
        # Return the result
-        return DocumentConverterResult(
+        return DocumentConverterResult(markdown=md_content.strip())
            title=None,
            text_content=md_content.strip(),
        )
--- a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py
@ -1,6 +1,7 @@
 import sys
 from typing import Any, Union
-from ._base import DocumentConverter, DocumentConverterResult
+from ._base import DocumentConverter
 from .._base_converter import DocumentConverterResult
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 # Try loading optional (but in this case, required) dependencies
@ -73,7 +74,8 @@ class OutlookMsgConverter(DocumentConverter):
            msg.close()
            return DocumentConverterResult(
-                title=headers.get("Subject"), text_content=md_content.strip()
+                markdown=md_content.strip(),
                title=headers.get("Subject"),
            )
        except Exception as e:
--- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@ -1,6 +1,7 @@
 import sys
 from typing import Union
-from ._base import DocumentConverter, DocumentConverterResult
+from ._base import DocumentConverter
 from .._base_converter import DocumentConverterResult
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 # Try loading optional (but in this case, required) dependencies
@ -43,6 +44,5 @@ class PdfConverter(DocumentConverter):
            )  # Restore the original traceback
        return DocumentConverterResult(
-            title=None,
+            markdown=pdfminer.high_level.extract_text(local_path)
            text_content=pdfminer.high_level.extract_text(local_path),
        )
--- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
@ -3,7 +3,8 @@ import mimetypes
 from charset_normalizer import from_path
 from typing import Any, Union
-from ._base import DocumentConverter, DocumentConverterResult
+from ._base import DocumentConverter
 from .._base_converter import DocumentConverterResult
 # Mimetypes to ignore (commonly confused extensions)
@ -43,7 +44,4 @@ class PlainTextConverter(DocumentConverter):
            return None
        text_content = str(from_path(local_path).best())
-        return DocumentConverterResult(
+        return DocumentConverterResult(markdown=text_content)
            title=None,
            text_content=text_content,
        )
--- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
@ -5,7 +5,8 @@ import sys
 from typing import Union
-from ._base import DocumentConverterResult, DocumentConverter
+from ._base import DocumentConverter
 from .._base_converter import DocumentConverterResult
 from ._html_converter import HtmlConverter
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
@ -174,10 +175,7 @@ class PptxConverter(HtmlConverter):
                    md_content += notes_frame.text
                md_content = md_content.strip()
-        return DocumentConverterResult(
+        return DocumentConverterResult(markdown=md_content.strip())
            title=None,
            text_content=md_content.strip(),
        )
    def _is_picture(self, shape):
        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
--- a/packages/markitdown/src/markitdown/converters/_rss_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_rss_converter.py
@ -3,7 +3,8 @@ from typing import Union
 from bs4 import BeautifulSoup
 from ._markdownify import _CustomMarkdownify
-from ._base import DocumentConverter, DocumentConverterResult
+from ._base import DocumentConverter
 from .._base_converter import DocumentConverterResult
 class RssConverter(DocumentConverter):
@ -73,8 +74,8 @@ class RssConverter(DocumentConverter):
                    md_text += self._parse_content(entry_content)
            return DocumentConverterResult(
                markdown=md_text,
                title=title,
                text_content=md_text,
            )
        except BaseException as _:
            return None
@ -117,8 +118,8 @@ class RssConverter(DocumentConverter):
                    md_text += self._parse_content(content)
            return DocumentConverterResult(
                markdown=md_text,
                title=channel_title,
                text_content=md_text,
            )
        except BaseException as _:
            print(traceback.format_exc())
--- a/packages/markitdown/src/markitdown/converters/_wav_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_wav_converter.py
@ -1,5 +1,6 @@
 from typing import Union
-from ._base import DocumentConverter, DocumentConverterResult
+from ._base import DocumentConverter
 from .._base_converter import DocumentConverterResult
 from ._media_converter import MediaConverter
 # Optional Transcription support
@ -60,10 +61,7 @@ class WavConverter(MediaConverter):
                    "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
                )
-        return DocumentConverterResult(
+        return DocumentConverterResult(markdown=md_content.strip())
            title=None,
            text_content=md_content.strip(),
        )
    def _transcribe_audio(self, local_path) -> str:
        recognizer = sr.Recognizer()
--- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
@ -3,7 +3,8 @@ import re
 from typing import Any, Union
 from bs4 import BeautifulSoup
-from ._base import DocumentConverter, DocumentConverterResult
+from ._base import DocumentConverter
 from .._base_converter import DocumentConverterResult
 from ._markdownify import _CustomMarkdownify
@ -56,6 +57,6 @@ class WikipediaConverter(DocumentConverter):
            webpage_text = _CustomMarkdownify().convert_soup(soup)
        return DocumentConverterResult(
            markdown=webpage_text,
            title=main_title,
            text_content=webpage_text,
        )
--- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@ -2,7 +2,8 @@ import sys
 from typing import Union
-from ._base import DocumentConverter, DocumentConverterResult
+from ._base import DocumentConverter
 from .._base_converter import DocumentConverterResult
 from ._html_converter import HtmlConverter
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
@ -58,10 +59,7 @@ class XlsxConverter(HtmlConverter):
            html_content = sheets[s].to_html(index=False)
            md_content += self._convert(html_content).text_content.strip() + "\n\n"
-        return DocumentConverterResult(
+        return DocumentConverterResult(markdown=md_content.strip())
            title=None,
            text_content=md_content.strip(),
        )
 class XlsConverter(HtmlConverter):
@ -94,7 +92,4 @@ class XlsConverter(HtmlConverter):
            html_content = sheets[s].to_html(index=False)
            md_content += self._convert(html_content).text_content.strip() + "\n\n"
-        return DocumentConverterResult(
+        return DocumentConverterResult(markdown=md_content.strip())
            title=None,
            text_content=md_content.strip(),
        )
--- a/packages/markitdown/src/markitdown/converters/_youtube_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_youtube_converter.py
@ -7,7 +7,8 @@ from typing import Any, Union, Dict, List
 from urllib.parse import parse_qs, urlparse
 from bs4 import BeautifulSoup
-from ._base import DocumentConverter, DocumentConverterResult
+from ._base import DocumentConverter
 from .._base_converter import DocumentConverterResult
 # Optional YouTube transcription support
@ -158,8 +159,8 @@ class YouTubeConverter(DocumentConverter):
        assert isinstance(title, str)
        return DocumentConverterResult(
            markdown=webpage_text,
            title=title,
            text_content=webpage_text,
        )
    def _get(
--- a/packages/markitdown/src/markitdown/converters/_zip_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_zip_converter.py
@ -3,7 +3,8 @@ import zipfile
 import shutil
 from typing import Any, Union
-from ._base import DocumentConverter, DocumentConverterResult
+from ._base import DocumentConverter
 from .._base_converter import DocumentConverterResult
 class ZipConverter(DocumentConverter):
@ -62,8 +63,7 @@ class ZipConverter(DocumentConverter):
        parent_converters = kwargs.get("_parent_converters", [])
        if not parent_converters:
            return DocumentConverterResult(
-                title=None,
+                markdown=f"[ERROR] No converters available to process zip contents from: {local_path}",
                text_content=f"[ERROR] No converters available to process zip contents from: {local_path}",
            )
        extracted_zip_folder_name = (
@ -118,27 +118,24 @@ class ZipConverter(DocumentConverter):
                        result = converter.convert(file_path, **file_kwargs)
                        if result is not None:
                            md_content += f"\n## File: {relative_path}\n\n"
-                            md_content += result.text_content + "\n\n"
+                            md_content += result.markdown + "\n\n"
                            break
            # Clean up extracted files if specified
            if kwargs.get("cleanup_extracted", True):
                shutil.rmtree(extraction_dir)
-            return DocumentConverterResult(title=None, text_content=md_content.strip())
+            return DocumentConverterResult(markdown=md_content.strip())
        except zipfile.BadZipFile:
            return DocumentConverterResult(
-                title=None,
+                markdown=f"[ERROR] Invalid or corrupted zip file: {local_path}",
                text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}",
            )
        except ValueError as ve:
            return DocumentConverterResult(
-                title=None,
+                markdown=f"[ERROR] Security error in zip file {local_path}: {str(ve)}",
                text_content=f"[ERROR] Security error in zip file {local_path}: {str(ve)}",
            )
        except Exception as e:
            return DocumentConverterResult(
-                title=None,
+                markdown=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
                text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
            )