Initial work updating signatures.

This commit is contained in:
Adam Fourney 2025-03-03 13:16:15 -08:00
parent 1d2f231146
commit e43632b048
22 changed files with 180 additions and 85 deletions

View file

@ -4,6 +4,7 @@
from .__about__ import __version__
from ._markitdown import MarkItDown
from ._base_converter import DocumentConverterResult, BaseDocumentConverter
from ._exceptions import (
MarkItDownException,
MissingDependencyException,
@ -11,12 +12,13 @@ from ._exceptions import (
FileConversionException,
UnsupportedFormatException,
)
from .converters import DocumentConverter, DocumentConverterResult
from .converters import DocumentConverter
__all__ = [
"__version__",
"MarkItDown",
"DocumentConverter",
"BaseDocumentConverter",
"DocumentConverterResult",
"MarkItDownException",
"MissingDependencyException",

View file

@ -0,0 +1,116 @@
from typing import Any, Union, BinaryIO, Optional
class DocumentConverterResult:
    """The result of converting a document to Markdown."""

    def __init__(
        self,
        markdown: str,
        *,
        title: Optional[str] = None,
    ) -> None:
        """
        Initialize the DocumentConverterResult.

        Parameters:
        - markdown: The converted Markdown text.
        - title: Optional title of the document (keyword-only).
        """
        self.markdown = markdown
        self.title = title

    @property
    def text_content(self) -> str:
        """Soft-deprecated alias for `markdown`. New code should migrate to using `markdown` or __str__."""
        return self.markdown

    @text_content.setter
    def text_content(self, markdown: str) -> None:
        """Soft-deprecated alias for `markdown`. Assigning here overwrites `markdown`."""
        self.markdown = markdown

    def __str__(self) -> str:
        """Return the Markdown content."""
        return self.markdown
class BaseDocumentConverter:
    """Abstract superclass of all DocumentConverters."""

    # Lower priority values are tried first.
    PRIORITY_SPECIFIC_FILE_FORMAT = (
        0.0  # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia
    )
    PRIORITY_GENERIC_FILE_FORMAT = (
        10.0  # Near catch-all converters for mimetypes like text/*, etc.
    )

    def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT) -> None:
        """
        Initialize the DocumentConverter with a given priority.

        Priorities work as follows: By default, most converters get priority
        PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception is the
        PlainTextConverter, which gets priority PRIORITY_GENERIC_FILE_FORMAT (== 10),
        with lower values being tried first (i.e., higher priority).

        Just prior to conversion, the converters are sorted by priority, using
        a stable sort. This means that converters with the same priority will
        remain in the same order, with the most recently registered converters
        appearing first.

        We have tight control over the order of built-in converters, but
        plugins can register converters in any order. A converter's priority
        field reasserts some control over the order of converters.

        Plugins can register converters with any priority, to appear before or
        after the built-ins. For example, a plugin with priority 9 will run
        before the PlainTextConverter, but after the built-in converters.

        Parameters:
        - priority: The converter's sort key; lower values are tried first.
        """
        self._priority = priority

    def convert(
        self,
        file_stream: BinaryIO,
        *,
        mime_type: str = "application/octet-stream",
        file_extension: Optional[str] = None,
        charset: Optional[str] = None,
        **kwargs: Any,
    ) -> Union[None, DocumentConverterResult]:
        """
        Convert a document to Markdown text, or return None if the converter
        cannot handle the document (causing the next converter to be tried).

        The determination of whether a converter can handle a document is primarily based on
        the provided MIME type. The file extension can serve as a secondary check if the
        MIME type is not sufficiently specific (e.g., application/octet-stream). Finally, the
        charset is used to determine the encoding of the file content in cases of text/*.

        Parameters:
        - file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.
        - mime_type: The MIME type of the file. Default is "application/octet-stream".
        - file_extension: The file extension of the file. Default is None.
        - charset: The character set of the file. Default is None.
        - kwargs: Additional keyword arguments for the converter.

        Returns:
        - DocumentConverterResult: The result of the conversion, which includes the title and markdown content.
        or
        - None: If the converter cannot handle the document.

        Raises:
        - FileConversionException: If the mimetype is recognized, but the conversion fails for some other reason.
        - MissingDependencyException: If the converter requires a dependency that is not installed.
        - NotImplementedError: Always, in this base class; subclasses must override this method.
        """
        raise NotImplementedError("Subclasses must implement this method")

    @property
    def priority(self) -> float:
        """Priority of the converter in markitdown's converter list. Lower priority values are tried first."""
        return self._priority

    @priority.setter
    def priority(self, value: float) -> None:
        """Replace the converter's priority; lower values are tried first."""
        self._priority = value

View file

@ -18,7 +18,6 @@ import requests
from .converters import (
DocumentConverter,
DocumentConverterResult,
PlainTextConverter,
HtmlConverter,
RssConverter,
@ -39,6 +38,8 @@ from .converters import (
DocumentIntelligenceConverter,
)
from ._base_converter import DocumentConverterResult
from ._exceptions import (
FileConversionException,
UnsupportedFormatException,

View file

@ -2,7 +2,7 @@
#
# SPDX-License-Identifier: MIT
from ._base import DocumentConverter, DocumentConverterResult
from ._base import DocumentConverter
from ._plain_text_converter import PlainTextConverter
from ._html_converter import HtmlConverter
from ._rss_converter import RssConverter
@ -23,7 +23,6 @@ from ._doc_intel_converter import DocumentIntelligenceConverter
__all__ = [
"DocumentConverter",
"DocumentConverterResult",
"PlainTextConverter",
"HtmlConverter",
"RssConverter",

View file

@ -1,12 +1,5 @@
from typing import Any, Union
class DocumentConverterResult:
"""The result of converting a document to text."""
def __init__(self, title: Union[str, None] = None, text_content: str = ""):
self.title: Union[str, None] = title
self.text_content: str = text_content
from .._base_converter import DocumentConverterResult
class DocumentConverter:

View file

@ -6,7 +6,8 @@ from typing import Union
from urllib.parse import parse_qs, urlparse
from bs4 import BeautifulSoup
from ._base import DocumentConverter, DocumentConverterResult
from ._base import DocumentConverter
from .._base_converter import DocumentConverterResult
from ._markdownify import _CustomMarkdownify
@ -81,6 +82,6 @@ class BingSerpConverter(DocumentConverter):
)
return DocumentConverterResult(
markdown=webpage_text,
title=None if soup.title is None else soup.title.string,
text_content=webpage_text,
)

View file

@ -2,7 +2,8 @@ from typing import Any, Union
import re
import sys
from ._base import DocumentConverter, DocumentConverterResult
from ._base import DocumentConverter
from .._base_converter import DocumentConverterResult
from .._exceptions import MissingDependencyException
# Try loading optional (but in this case, required) dependencies
@ -103,7 +104,4 @@ class DocumentIntelligenceConverter(DocumentConverter):
# remove comments from the markdown content generated by Doc Intelligence and append to markdown string
markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL)
return DocumentConverterResult(
title=None,
text_content=markdown_text,
)
return DocumentConverterResult(markdown=markdown_text)

View file

@ -2,11 +2,8 @@ import sys
from typing import Union
from ._base import (
DocumentConverterResult,
)
from ._base import DocumentConverter
from .._base_converter import DocumentConverterResult
from ._html_converter import HtmlConverter
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

View file

@ -1,7 +1,8 @@
from typing import Any, Union
from bs4 import BeautifulSoup
from ._base import DocumentConverter, DocumentConverterResult
from ._base import DocumentConverter
from .._base_converter import DocumentConverterResult
from ._markdownify import _CustomMarkdownify
@ -51,6 +52,6 @@ class HtmlConverter(DocumentConverter):
webpage_text = webpage_text.strip()
return DocumentConverterResult(
markdown=webpage_text,
title=None if soup.title is None else soup.title.string,
text_content=webpage_text,
)

View file

@ -1,5 +1,6 @@
from typing import Union
from ._base import DocumentConverter, DocumentConverterResult
from ._base import DocumentConverter
from .._base_converter import DocumentConverterResult
from ._media_converter import MediaConverter
import base64
import mimetypes
@ -59,8 +60,7 @@ class ImageConverter(MediaConverter):
)
return DocumentConverterResult(
title=None,
text_content=md_content,
markdown=md_content,
)
def _get_llm_description(self, local_path, extension, client, model, prompt=None):

View file

@ -1,10 +1,8 @@
import json
from typing import Any, Union
from ._base import (
DocumentConverter,
DocumentConverterResult,
)
from ._base import DocumentConverter
from .._base_converter import DocumentConverterResult
from .._exceptions import FileConversionException
@ -65,8 +63,8 @@ class IpynbConverter(DocumentConverter):
title = notebook_content.get("metadata", {}).get("title", title)
return DocumentConverterResult(
markdown=md_text,
title=title,
text_content=md_text,
)
except Exception as e:

View file

@ -1,6 +1,7 @@
import tempfile
from typing import Union
from ._base import DocumentConverter, DocumentConverterResult
from ._base import DocumentConverter
from .._base_converter import DocumentConverterResult
from ._wav_converter import WavConverter
from warnings import resetwarnings, catch_warnings
@ -83,7 +84,4 @@ class Mp3Converter(WavConverter):
os.unlink(temp_path)
# Return the result
return DocumentConverterResult(
title=None,
text_content=md_content.strip(),
)
return DocumentConverterResult(markdown=md_content.strip())

View file

@ -1,6 +1,7 @@
import sys
from typing import Any, Union
from ._base import DocumentConverter, DocumentConverterResult
from ._base import DocumentConverter
from .._base_converter import DocumentConverterResult
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies
@ -73,7 +74,8 @@ class OutlookMsgConverter(DocumentConverter):
msg.close()
return DocumentConverterResult(
title=headers.get("Subject"), text_content=md_content.strip()
markdown=md_content.strip(),
title=headers.get("Subject"),
)
except Exception as e:

View file

@ -1,6 +1,7 @@
import sys
from typing import Union
from ._base import DocumentConverter, DocumentConverterResult
from ._base import DocumentConverter
from .._base_converter import DocumentConverterResult
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies
@ -43,6 +44,5 @@ class PdfConverter(DocumentConverter):
) # Restore the original traceback
return DocumentConverterResult(
title=None,
text_content=pdfminer.high_level.extract_text(local_path),
markdown=pdfminer.high_level.extract_text(local_path)
)

View file

@ -3,7 +3,8 @@ import mimetypes
from charset_normalizer import from_path
from typing import Any, Union
from ._base import DocumentConverter, DocumentConverterResult
from ._base import DocumentConverter
from .._base_converter import DocumentConverterResult
# Mimetypes to ignore (commonly confused extensions)
@ -43,7 +44,4 @@ class PlainTextConverter(DocumentConverter):
return None
text_content = str(from_path(local_path).best())
return DocumentConverterResult(
title=None,
text_content=text_content,
)
return DocumentConverterResult(markdown=text_content)

View file

@ -5,7 +5,8 @@ import sys
from typing import Union
from ._base import DocumentConverterResult, DocumentConverter
from ._base import DocumentConverter
from .._base_converter import DocumentConverterResult
from ._html_converter import HtmlConverter
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
@ -174,10 +175,7 @@ class PptxConverter(HtmlConverter):
md_content += notes_frame.text
md_content = md_content.strip()
return DocumentConverterResult(
title=None,
text_content=md_content.strip(),
)
return DocumentConverterResult(markdown=md_content.strip())
def _is_picture(self, shape):
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:

View file

@ -3,7 +3,8 @@ from typing import Union
from bs4 import BeautifulSoup
from ._markdownify import _CustomMarkdownify
from ._base import DocumentConverter, DocumentConverterResult
from ._base import DocumentConverter
from .._base_converter import DocumentConverterResult
class RssConverter(DocumentConverter):
@ -73,8 +74,8 @@ class RssConverter(DocumentConverter):
md_text += self._parse_content(entry_content)
return DocumentConverterResult(
markdown=md_text,
title=title,
text_content=md_text,
)
except BaseException as _:
return None
@ -117,8 +118,8 @@ class RssConverter(DocumentConverter):
md_text += self._parse_content(content)
return DocumentConverterResult(
markdown=md_text,
title=channel_title,
text_content=md_text,
)
except BaseException as _:
print(traceback.format_exc())

View file

@ -1,5 +1,6 @@
from typing import Union
from ._base import DocumentConverter, DocumentConverterResult
from ._base import DocumentConverter
from .._base_converter import DocumentConverterResult
from ._media_converter import MediaConverter
# Optional Transcription support
@ -60,10 +61,7 @@ class WavConverter(MediaConverter):
"\n\n### Audio Transcript:\nError. Could not transcribe this audio."
)
return DocumentConverterResult(
title=None,
text_content=md_content.strip(),
)
return DocumentConverterResult(markdown=md_content.strip())
def _transcribe_audio(self, local_path) -> str:
recognizer = sr.Recognizer()

View file

@ -3,7 +3,8 @@ import re
from typing import Any, Union
from bs4 import BeautifulSoup
from ._base import DocumentConverter, DocumentConverterResult
from ._base import DocumentConverter
from .._base_converter import DocumentConverterResult
from ._markdownify import _CustomMarkdownify
@ -56,6 +57,6 @@ class WikipediaConverter(DocumentConverter):
webpage_text = _CustomMarkdownify().convert_soup(soup)
return DocumentConverterResult(
markdown=webpage_text,
title=main_title,
text_content=webpage_text,
)

View file

@ -2,7 +2,8 @@ import sys
from typing import Union
from ._base import DocumentConverter, DocumentConverterResult
from ._base import DocumentConverter
from .._base_converter import DocumentConverterResult
from ._html_converter import HtmlConverter
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
@ -58,10 +59,7 @@ class XlsxConverter(HtmlConverter):
html_content = sheets[s].to_html(index=False)
md_content += self._convert(html_content).text_content.strip() + "\n\n"
return DocumentConverterResult(
title=None,
text_content=md_content.strip(),
)
return DocumentConverterResult(markdown=md_content.strip())
class XlsConverter(HtmlConverter):
@ -94,7 +92,4 @@ class XlsConverter(HtmlConverter):
html_content = sheets[s].to_html(index=False)
md_content += self._convert(html_content).text_content.strip() + "\n\n"
return DocumentConverterResult(
title=None,
text_content=md_content.strip(),
)
return DocumentConverterResult(markdown=md_content.strip())

View file

@ -7,7 +7,8 @@ from typing import Any, Union, Dict, List
from urllib.parse import parse_qs, urlparse
from bs4 import BeautifulSoup
from ._base import DocumentConverter, DocumentConverterResult
from ._base import DocumentConverter
from .._base_converter import DocumentConverterResult
# Optional YouTube transcription support
@ -158,8 +159,8 @@ class YouTubeConverter(DocumentConverter):
assert isinstance(title, str)
return DocumentConverterResult(
markdown=webpage_text,
title=title,
text_content=webpage_text,
)
def _get(

View file

@ -3,7 +3,8 @@ import zipfile
import shutil
from typing import Any, Union
from ._base import DocumentConverter, DocumentConverterResult
from ._base import DocumentConverter
from .._base_converter import DocumentConverterResult
class ZipConverter(DocumentConverter):
@ -62,8 +63,7 @@ class ZipConverter(DocumentConverter):
parent_converters = kwargs.get("_parent_converters", [])
if not parent_converters:
return DocumentConverterResult(
title=None,
text_content=f"[ERROR] No converters available to process zip contents from: {local_path}",
markdown=f"[ERROR] No converters available to process zip contents from: {local_path}",
)
extracted_zip_folder_name = (
@ -118,27 +118,24 @@ class ZipConverter(DocumentConverter):
result = converter.convert(file_path, **file_kwargs)
if result is not None:
md_content += f"\n## File: {relative_path}\n\n"
md_content += result.text_content + "\n\n"
md_content += result.markdown + "\n\n"
break
# Clean up extracted files if specified
if kwargs.get("cleanup_extracted", True):
shutil.rmtree(extraction_dir)
return DocumentConverterResult(title=None, text_content=md_content.strip())
return DocumentConverterResult(markdown=md_content.strip())
except zipfile.BadZipFile:
return DocumentConverterResult(
title=None,
text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}",
markdown=f"[ERROR] Invalid or corrupted zip file: {local_path}",
)
except ValueError as ve:
return DocumentConverterResult(
title=None,
text_content=f"[ERROR] Security error in zip file {local_path}: {str(ve)}",
markdown=f"[ERROR] Security error in zip file {local_path}: {str(ve)}",
)
except Exception as e:
return DocumentConverterResult(
title=None,
text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
markdown=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
)