From 4129f30c23f4ea97b7c614ebe5e610667a2fb070 Mon Sep 17 00:00:00 2001 From: Adam Fourney Date: Tue, 4 Mar 2025 00:52:57 -0800 Subject: [PATCH] More progress. --- .../markitdown/src/markitdown/__init__.py | 4 +- .../src/markitdown/_base_converter.py | 67 ++++++- .../markitdown/src/markitdown/_markitdown.py | 168 +++++++----------- .../src/markitdown/converters/__init__.py | 2 - .../src/markitdown/converters/_base.py | 56 ------ .../converters/_bing_serp_converter.py | 3 +- .../converters/_doc_intel_converter.py | 3 +- .../markitdown/converters/_docx_converter.py | 3 +- .../markitdown/converters/_html_converter.py | 3 +- .../markitdown/converters/_image_converter.py | 3 +- .../markitdown/converters/_ipynb_converter.py | 3 +- .../markitdown/converters/_media_converter.py | 2 +- .../markitdown/converters/_mp3_converter.py | 3 +- .../converters/_outlook_msg_converter.py | 3 +- .../markitdown/converters/_pdf_converter.py | 3 +- .../converters/_plain_text_converter.py | 3 +- .../markitdown/converters/_pptx_converter.py | 3 +- .../markitdown/converters/_rss_converter.py | 3 +- .../markitdown/converters/_wav_converter.py | 3 +- .../converters/_wikipedia_converter.py | 3 +- .../markitdown/converters/_xlsx_converter.py | 3 +- .../converters/_youtube_converter.py | 3 +- .../markitdown/converters/_zip_converter.py | 3 +- 23 files changed, 149 insertions(+), 201 deletions(-) delete mode 100644 packages/markitdown/src/markitdown/converters/_base.py diff --git a/packages/markitdown/src/markitdown/__init__.py b/packages/markitdown/src/markitdown/__init__.py index fb14feb..bb6fcdb 100644 --- a/packages/markitdown/src/markitdown/__init__.py +++ b/packages/markitdown/src/markitdown/__init__.py @@ -4,7 +4,7 @@ from .__about__ import __version__ from ._markitdown import MarkItDown -from ._base_converter import DocumentConverterResult, BaseDocumentConverter +from ._base_converter import DocumentConverterResult, DocumentConverter from ._stream_info import StreamInfo from ._exceptions import ( MarkItDownException, @@ -13,13 +13,11 @@ from ._exceptions import ( FileConversionException, UnsupportedFormatException, ) -from .converters import DocumentConverter __all__ = [ "__version__", "MarkItDown", "DocumentConverter", - "BaseDocumentConverter", "DocumentConverterResult", "MarkItDownException", "MissingDependencyException", diff --git a/packages/markitdown/src/markitdown/_base_converter.py b/packages/markitdown/src/markitdown/_base_converter.py index 7cd945f..42e5da7 100644 --- a/packages/markitdown/src/markitdown/_base_converter.py +++ b/packages/markitdown/src/markitdown/_base_converter.py @@ -1,5 +1,11 @@ +import os +import tempfile +from warnings import warn +from typing import Any, Union, BinaryIO, Optional, List from ._stream_info import StreamInfo -from typing import Any, Union, BinaryIO, Optional + +# Avoid printing the same warning multiple times +_WARNED: List[str] = [] class DocumentConverterResult: @@ -39,7 +45,7 @@ class DocumentConverterResult: return self.markdown -class BaseDocumentConverter: +class DocumentConverter: """Abstract superclass of all DocumentConverters.""" # Lower priority values are tried first. @@ -74,7 +80,7 @@ class BaseDocumentConverter: """ self._priority = priority - def convert( + def convert_stream( self, file_stream: BinaryIO, stream_info: StreamInfo, @@ -106,6 +112,61 @@ class BaseDocumentConverter: - FileConversionException: If the mimetype is recognized, but the conversion fails for some other reason. - MissingDependencyException: If the converter requires a dependency that is not installed. """ + + # Default implementation ensures backward compatibility with the legacy convert() method, and + # should absolutely be overridden in subclasses. This behavior is deprecated and will be removed + # in the future. + result = None + used_legacy = False + + if stream_info.local_path is not None and os.path.exists( + stream_info.local_path + ): + # If the stream is backed by a local file, pass it to the legacy convert() method + try: + result = self.convert(stream_info.local_path, **kwargs) + used_legacy = True + except ( + NotImplementedError + ): # If it wasn't implemented, rethrow the error, but with this as the stack trace + raise NotImplementedError( + "Subclasses must implement the convert_stream method." + ) + else: + # Otherwise, we need to read the stream into a temporary file. There is potential for + # thrashing here if there are many converters or conversion attempts + cur_pos = file_stream.tell() + temp_fd, temp_path = tempfile.mkstemp() + try: + with os.fdopen(temp_fd, "wb") as temp_file: + temp_file.write(file_stream.read()) + try: + result = self.convert(temp_path, **kwargs) + used_legacy = True + except NotImplementedError: + raise NotImplementedError( + "Subclasses must implement the convert_stream method." + ) + finally: + os.remove(temp_path) + file_stream.seek(0) + + if used_legacy: + message = f"{type(self).__name__} uses the legacy convert() method, which is deprecated." + if message not in _WARNED: + warn(message, DeprecationWarning) + _WARNED.append(message) + + return result + + def convert( + self, local_path: str, **kwargs: Any + ) -> Union[None, DocumentConverterResult]: + """ + Legacy, and deprecated method to convert a document to Markdown text. + This method reads from the file at `local_path` and returns the converted Markdown text. + This method is deprecated in favor of `convert_stream`, which uses a file-like object. + """ raise NotImplementedError("Subclasses must implement this method") @property diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index 2738535..d5cd0aa 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -20,7 +20,6 @@ import requests from ._stream_info import StreamInfo from .converters import ( - DocumentConverter, PlainTextConverter, HtmlConverter, RssConverter, @@ -41,7 +40,7 @@ from .converters import ( DocumentIntelligenceConverter, ) -from ._base_converter import DocumentConverterResult +from ._base_converter import DocumentConverter, DocumentConverterResult from ._exceptions import ( FileConversionException, @@ -102,7 +101,7 @@ class MarkItDown: self._style_map = None # Register the converters - self._page_converters: List[DocumentConverter] = [] + self._converters: List[DocumentConverter] = [] if ( enable_builtins is None or enable_builtins @@ -405,108 +404,73 @@ class MarkItDown: def _convert( self, *, file_stream: BinaryIO, stream_info_guesses: List[StreamInfo], **kwargs ) -> DocumentConverterResult: - # Lazily create a temporary file, if needed, for backward compatibility - # This is to support a deprecated feature, and will be removed in the future - temp_file = None + res: Union[None, DocumentConverterResult] = None - def get_temp_file(): - nonlocal temp_file + # Keep track of which converters throw exceptions + failed_attempts: List[FailedConversionAttempt] = [] - if temp_file is not None: - return temp_file - else: + # Create a copy of the page_converters list, sorted by priority. + # We do this with each call to _convert because the priority of converters may change between calls. + # The sort is guaranteed to be stable, so converters with the same priority will remain in the same order. + sorted_converters = sorted(self._converters, key=lambda x: x.priority) + + for stream_info in stream_info_guesses + [StreamInfo()]: + for converter in sorted_converters: + _kwargs = copy.deepcopy(kwargs) + + # Copy any additional global options + if "llm_client" not in _kwargs and self._llm_client is not None: + _kwargs["llm_client"] = self._llm_client + + if "llm_model" not in _kwargs and self._llm_model is not None: + _kwargs["llm_model"] = self._llm_model + + if "style_map" not in _kwargs and self._style_map is not None: + _kwargs["style_map"] = self._style_map + + if "exiftool_path" not in _kwargs and self._exiftool_path is not None: + _kwargs["exiftool_path"] = self._exiftool_path + + # Add the list of converters for nested processing + _kwargs["_parent_converters"] = self._converters + + # Add legaxy kwargs + if stream_info is not None: + if stream_info.extension is not None: + _kwargs["file_extension"] = stream_info.extension + + if stream_info.url is not None: + _kwargs["url"] = stream_info.url + + # Attempt the conversion cur_pos = file_stream.tell() - handle, temp_file = tempfile.mkstemp() - fh = os.fdopen(handle, "wb") - file_stream.seek(0) - fh.write(file_stream.read()) - file_stream.seek(cur_pos) - fh.close() - return temp_file - - try: - res: Union[None, DocumentConverterResult] = None - - # Keep track of which converters throw exceptions - failed_attempts: List[FailedConversionAttempt] = [] - - # Create a copy of the page_converters list, sorted by priority. - # We do this with each call to _convert because the priority of converters may change between calls. - # The sort is guaranteed to be stable, so converters with the same priority will remain in the same order. - sorted_converters = sorted(self._page_converters, key=lambda x: x.priority) - - for file_info in stream_info_guesses + [None]: - for converter in sorted_converters: - _kwargs = copy.deepcopy(kwargs) - - # Copy any additional global options - if "llm_client" not in _kwargs and self._llm_client is not None: - _kwargs["llm_client"] = self._llm_client - - if "llm_model" not in _kwargs and self._llm_model is not None: - _kwargs["llm_model"] = self._llm_model - - if "style_map" not in _kwargs and self._style_map is not None: - _kwargs["style_map"] = self._style_map - - if ( - "exiftool_path" not in _kwargs - and self._exiftool_path is not None - ): - _kwargs["exiftool_path"] = self._exiftool_path - - # Add the list of converters for nested processing - _kwargs["_parent_converters"] = self._page_converters - - # Add backwards compatibility - if isinstance(converter, DocumentConverter): - if file_info is not None: - # Legacy converters need a file_extension - if file_info.extension is not None: - _kwargs["file_extension"] = file_info.extension - - # And benefit from urls, when available - if file_info.url is not None: - _kwargs["url"] = file_info.url - - try: - res = converter.convert(get_temp_file(), **_kwargs) - except Exception: - failed_attempts.append( - FailedConversionAttempt( - converter=converter, exc_info=sys.exc_info() - ) - ) - else: - raise NotImplementedError("TODO") - - if res is not None: - # Normalize the content - res.text_content = "\n".join( - [ - line.rstrip() - for line in re.split(r"\r?\n", res.text_content) - ] - ) - res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content) - return res - - # If we got this far without success, report any exceptions - if len(failed_attempts) > 0: - raise FileConversionException(attempts=failed_attempts) - - # Nothing can handle it! - raise UnsupportedFormatException( - f"Could not convert stream to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported." - ) - - finally: - # Clean up the temporary file - if temp_file is not None: try: - os.unlink(temp_file) + res = converter.convert_stream(file_stream, stream_info, **_kwargs) except Exception: - pass + failed_attempts.append( + FailedConversionAttempt( + converter=converter, exc_info=sys.exc_info() + ) + ) + finally: + file_stream.seek(cur_pos) + + if res is not None: + # Normalize the content + res.text_content = "\n".join( + [line.rstrip() for line in re.split(r"\r?\n", res.text_content)] + ) + res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content) + return res + + # If we got this far without success, report any exceptions + if len(failed_attempts) > 0: + raise FileConversionException(attempts=failed_attempts) + + # Nothing can handle it! + raise UnsupportedFormatException( + f"Could not convert stream to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported." + ) def register_page_converter(self, converter: DocumentConverter) -> None: """DEPRECATED: User register_converter instead.""" @@ -516,6 +480,6 @@ class MarkItDown: ) self.register_converter(converter) - def register_converter(self, converter: DocumentConverter) -> None: + def register_converter(self, converter: Union[DocumentConverter]) -> None: """Register a page text converter.""" - self._page_converters.insert(0, converter) + self._converters.insert(0, converter) diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py index 996b78b..038038d 100644 --- a/packages/markitdown/src/markitdown/converters/__init__.py +++ b/packages/markitdown/src/markitdown/converters/__init__.py @@ -2,7 +2,6 @@ # # SPDX-License-Identifier: MIT -from ._base import DocumentConverter from ._plain_text_converter import PlainTextConverter from ._html_converter import HtmlConverter from ._rss_converter import RssConverter @@ -22,7 +21,6 @@ from ._zip_converter import ZipConverter from ._doc_intel_converter import DocumentIntelligenceConverter __all__ = [ - "DocumentConverter", "PlainTextConverter", "HtmlConverter", "RssConverter", diff --git a/packages/markitdown/src/markitdown/converters/_base.py b/packages/markitdown/src/markitdown/converters/_base.py deleted file mode 100644 index e1a544a..0000000 --- a/packages/markitdown/src/markitdown/converters/_base.py +++ /dev/null @@ -1,56 +0,0 @@ -from typing import Any, Union -from .._base_converter import DocumentConverterResult - - -class DocumentConverter: - """Abstract superclass of all DocumentConverters.""" - - # Lower priority values are tried first. - PRIORITY_SPECIFIC_FILE_FORMAT = ( - 0.0 # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia - ) - PRIORITY_GENERIC_FILE_FORMAT = ( - 10.0 # Near catch-all converters for mimetypes like text/*, etc. - ) - - def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT): - """ - Initialize the DocumentConverter with a given priority. - - Priorities work as follows: By default, most converters get priority - DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception - is the PlainTextConverter, which gets priority PRIORITY_SPECIFIC_FILE_FORMAT (== 10), - with lower values being tried first (i.e., higher priority). - - Just prior to conversion, the converters are sorted by priority, using - a stable sort. This means that converters with the same priority will - remain in the same order, with the most recently registered converters - appearing first. - - We have tight control over the order of built-in converters, but - plugins can register converters in any order. A converter's priority - field reasserts some control over the order of converters. - - Plugins can register converters with any priority, to appear before or - after the built-ins. For example, a plugin with priority 9 will run - before the PlainTextConverter, but after the built-in converters. - """ - self._priority = priority - - def convert( - self, local_path: str, **kwargs: Any - ) -> Union[None, DocumentConverterResult]: - raise NotImplementedError("Subclasses must implement this method") - - @property - def priority(self) -> float: - """Priority of the converter in markitdown's converter list. Higher priority values are tried first.""" - return self._priority - - @priority.setter - def priority(self, value: float): - self._priority = value - - @priority.deleter - def priority(self): - raise AttributeError("Cannot delete the priority attribute") diff --git a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py index bdb15bf..2ac8e7e 100644 --- a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py +++ b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py @@ -6,8 +6,7 @@ from typing import Union from urllib.parse import parse_qs, urlparse from bs4 import BeautifulSoup -from ._base import DocumentConverter -from .._base_converter import DocumentConverterResult +from .._base_converter import DocumentConverter, DocumentConverterResult from ._markdownify import _CustomMarkdownify diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py index 1ad8981..3129409 100644 --- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py +++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py @@ -2,8 +2,7 @@ from typing import Any, Union import re import sys -from ._base import DocumentConverter -from .._base_converter import DocumentConverterResult +from .._base_converter import DocumentConverter, DocumentConverterResult from .._exceptions import MissingDependencyException # Try loading optional (but in this case, required) dependencies diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py index ea2550b..8f298ab 100644 --- a/packages/markitdown/src/markitdown/converters/_docx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py @@ -2,8 +2,7 @@ import sys from typing import Union -from ._base import DocumentConverter -from .._base_converter import DocumentConverterResult +from .._base_converter import DocumentConverter, DocumentConverterResult from ._html_converter import HtmlConverter from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE diff --git a/packages/markitdown/src/markitdown/converters/_html_converter.py b/packages/markitdown/src/markitdown/converters/_html_converter.py index 64efb9a..172875e 100644 --- a/packages/markitdown/src/markitdown/converters/_html_converter.py +++ b/packages/markitdown/src/markitdown/converters/_html_converter.py @@ -1,8 +1,7 @@ from typing import Any, Union from bs4 import BeautifulSoup -from ._base import DocumentConverter -from .._base_converter import DocumentConverterResult +from .._base_converter import DocumentConverter, DocumentConverterResult from ._markdownify import _CustomMarkdownify diff --git a/packages/markitdown/src/markitdown/converters/_image_converter.py b/packages/markitdown/src/markitdown/converters/_image_converter.py index 5923103..72f70e2 100644 --- a/packages/markitdown/src/markitdown/converters/_image_converter.py +++ b/packages/markitdown/src/markitdown/converters/_image_converter.py @@ -1,6 +1,5 @@ from typing import Union -from ._base import DocumentConverter -from .._base_converter import DocumentConverterResult +from .._base_converter import DocumentConverter, DocumentConverterResult from ._media_converter import MediaConverter import base64 import mimetypes diff --git a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py index cc40d4e..2c5cb3f 100644 --- a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py +++ b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py @@ -1,8 +1,7 @@ import json from typing import Any, Union -from ._base import DocumentConverter -from .._base_converter import DocumentConverterResult +from .._base_converter import DocumentConverter, DocumentConverterResult from .._exceptions import FileConversionException diff --git a/packages/markitdown/src/markitdown/converters/_media_converter.py b/packages/markitdown/src/markitdown/converters/_media_converter.py index 5c7d82b..0a5cebf 100644 --- a/packages/markitdown/src/markitdown/converters/_media_converter.py +++ b/packages/markitdown/src/markitdown/converters/_media_converter.py @@ -3,7 +3,7 @@ import shutil import json from warnings import warn -from ._base import DocumentConverter +from .._base_converter import DocumentConverter class MediaConverter(DocumentConverter): diff --git a/packages/markitdown/src/markitdown/converters/_mp3_converter.py b/packages/markitdown/src/markitdown/converters/_mp3_converter.py index a2276b6..6ba2202 100644 --- a/packages/markitdown/src/markitdown/converters/_mp3_converter.py +++ b/packages/markitdown/src/markitdown/converters/_mp3_converter.py @@ -1,7 +1,6 @@ import tempfile from typing import Union -from ._base import DocumentConverter -from .._base_converter import DocumentConverterResult +from .._base_converter import DocumentConverter, DocumentConverterResult from ._wav_converter import WavConverter from warnings import resetwarnings, catch_warnings diff --git a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py index 4abc860..84d8c47 100644 --- a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py +++ b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py @@ -1,7 +1,6 @@ import sys from typing import Any, Union -from ._base import DocumentConverter -from .._base_converter import DocumentConverterResult +from .._base_converter import DocumentConverter, DocumentConverterResult from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE # Try loading optional (but in this case, required) dependencies diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py index 2767954..00228b5 100644 --- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py @@ -1,7 +1,6 @@ import sys from typing import Union -from ._base import DocumentConverter -from .._base_converter import DocumentConverterResult +from .._base_converter import DocumentConverter, DocumentConverterResult from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE # Try loading optional (but in this case, required) dependencies diff --git a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py index 5905851..a9f1902 100644 --- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py +++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py @@ -3,8 +3,7 @@ import mimetypes from charset_normalizer import from_path from typing import Any, Union -from ._base import DocumentConverter -from .._base_converter import DocumentConverterResult +from .._base_converter import DocumentConverter, DocumentConverterResult # Mimetypes to ignore (commonly confused extensions) diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py index 99e4337..d77d3bc 100644 --- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py @@ -5,8 +5,7 @@ import sys from typing import Union -from ._base import DocumentConverter -from .._base_converter import DocumentConverterResult +from .._base_converter import DocumentConverter, DocumentConverterResult from ._html_converter import HtmlConverter from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE diff --git a/packages/markitdown/src/markitdown/converters/_rss_converter.py b/packages/markitdown/src/markitdown/converters/_rss_converter.py index 2471799..021d09d 100644 --- a/packages/markitdown/src/markitdown/converters/_rss_converter.py +++ b/packages/markitdown/src/markitdown/converters/_rss_converter.py @@ -3,8 +3,7 @@ from typing import Union from bs4 import BeautifulSoup from ._markdownify import _CustomMarkdownify -from ._base import DocumentConverter -from .._base_converter import DocumentConverterResult +from .._base_converter import DocumentConverter, DocumentConverterResult class RssConverter(DocumentConverter): diff --git a/packages/markitdown/src/markitdown/converters/_wav_converter.py b/packages/markitdown/src/markitdown/converters/_wav_converter.py index 4278f6f..c14a9a3 100644 --- a/packages/markitdown/src/markitdown/converters/_wav_converter.py +++ b/packages/markitdown/src/markitdown/converters/_wav_converter.py @@ -1,6 +1,5 @@ from typing import Union -from ._base import DocumentConverter -from .._base_converter import DocumentConverterResult +from .._base_converter import DocumentConverter, DocumentConverterResult from ._media_converter import MediaConverter # Optional Transcription support diff --git a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py index b4665c0..2be066d 100644 --- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py +++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py @@ -3,8 +3,7 @@ import re from typing import Any, Union from bs4 import BeautifulSoup -from ._base import DocumentConverter -from .._base_converter import DocumentConverterResult +from .._base_converter import DocumentConverter, DocumentConverterResult from ._markdownify import _CustomMarkdownify diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py index 7257768..37535ca 100644 --- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py @@ -2,8 +2,7 @@ import sys from typing import Union -from ._base import DocumentConverter -from .._base_converter import DocumentConverterResult +from .._base_converter import DocumentConverter, DocumentConverterResult from ._html_converter import HtmlConverter from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE diff --git a/packages/markitdown/src/markitdown/converters/_youtube_converter.py b/packages/markitdown/src/markitdown/converters/_youtube_converter.py index 485b095..975d668 100644 --- a/packages/markitdown/src/markitdown/converters/_youtube_converter.py +++ b/packages/markitdown/src/markitdown/converters/_youtube_converter.py @@ -7,8 +7,7 @@ from typing import Any, Union, Dict, List from urllib.parse import parse_qs, urlparse from bs4 import BeautifulSoup -from ._base import DocumentConverter -from .._base_converter import DocumentConverterResult +from .._base_converter import DocumentConverter, DocumentConverterResult # Optional YouTube transcription support diff --git a/packages/markitdown/src/markitdown/converters/_zip_converter.py b/packages/markitdown/src/markitdown/converters/_zip_converter.py index d8f2951..7c8c6db 100644 --- a/packages/markitdown/src/markitdown/converters/_zip_converter.py +++ b/packages/markitdown/src/markitdown/converters/_zip_converter.py @@ -3,8 +3,7 @@ import zipfile import shutil from typing import Any, Union -from ._base import DocumentConverter -from .._base_converter import DocumentConverterResult +from .._base_converter import DocumentConverter, DocumentConverterResult class ZipConverter(DocumentConverter):