file change

2025-04-21 12:18:36 +00:00 · 2025-04-21 12:18:36 +00:00 · 1e36bd8fc1
commit 1e36bd8fc1
parent 555a849a66
10 changed files with 121 additions and 903 deletions
--- a/packages/markitup/src/markitup/init.py
+++ b/packages/markitup/src/markitup/init.py
@ -5,8 +5,6 @@
 from .__about__ import __version__
 from ._markitup import (
    MarkItUp,
    PRIORITY_SPECIFIC_FILE_FORMAT,
    PRIORITY_GENERIC_FILE_FORMAT,
 )
 from ._base_converter import DocumentConverterResult, DocumentConverter
 from ._stream_info import StreamInfo
@ -29,6 +27,4 @@ __all__ = [
    "FileConversionException",
    "UnsupportedFormatException",
    "StreamInfo",
    "PRIORITY_SPECIFIC_FILE_FORMAT",
    "PRIORITY_GENERIC_FILE_FORMAT",
 ]
--- a/packages/markitup/src/markitup/_base_converter.py
+++ b/packages/markitup/src/markitup/_base_converter.py
@ -1,8 +1,9 @@
 import os
 import tempfile
 from warnings import warn
-from typing import Any, Union, BinaryIO, Optional, List
+from typing import Any, Union, BinaryIO, Optional, List, Dict
 from ._stream_info import StreamInfo
 import re
 class DocumentConverterResult:
@ -27,6 +28,61 @@ class DocumentConverterResult:
        self.markdown = markdown
        self.title = title
    def to_llm(self) -> List[Dict[str, Any]]:
        """
        Convert markdown with base64 images to a format compatible with OpenAI's API.
        This function parses the markdown content, extracting text and images in their
        original order, and returns a list of content elements in OpenAI's format.
        Returns:
            List[Dict[str, Any]]: A list of dictionaries representing the content elements
                                (text and images) in their original order.
        """
        # Pattern to match markdown image syntax with base64 data
        pattern = r'!\[(.*?)\]\(data:(.*?);base64,(.*?)\)'
        content = []
        last_end = 0
        # Process the document sequentially to maintain order
        for match in re.finditer(pattern, self.markdown):
            # Add the text before this image if any
            if match.start() > last_end:
                text_chunk = self.markdown[last_end:match.start()].strip()
                if text_chunk:
                    content.append({
                        "type": "text",
                        "text": text_chunk
                    })
            # Extract image data
            alt_text, content_type, b64_data = match.groups()
            # Add the image
            content.append({
                "type": "image",
                "image_url": {
                    "url": f"data:{content_type};base64,{b64_data}"
                },
                "alt_text": alt_text
            })
            last_end = match.end()
        # Add any remaining text after the last image
        if last_end < len(self.markdown):
            text_chunk = self.markdown[last_end:].strip()
            if text_chunk:
                content.append({
                    "type": "text",
                    "text": text_chunk
                })
        return content
    @property
    def text_content(self) -> str:
        """Soft-deprecated alias for `markdown`. New code should migrate to using `markdown` or __str__."""
@ -45,45 +101,6 @@ class DocumentConverterResult:
 class DocumentConverter:
    """Abstract superclass of all DocumentConverters."""
    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """
        Return a quick determination on if the converter should attempt converting the document.
        This is primarily based `stream_info` (typically, `stream_info.mimetype`, `stream_info.extension`).
        In cases where the data is retrieved via HTTP, the `steam_info.url` might also be referenced to
        make a determination (e.g., special converters for Wikipedia, YouTube etc).
        Finally, it is conceivable that the `stream_info.filename` might be used to in cases
        where the filename is well-known (e.g., `Dockerfile`, `Makefile`, etc)
        NOTE: The method signature is designed to match that of the convert() method. This provides some
        assurance that, if accepts() returns True, the convert() method will also be able to handle the document.
        IMPORTANT: In rare cases, (e.g., OutlookMsgConverter) we need to read more from the stream to make a final
        determination. Read operations inevitably advances the position in file_stream. In these case, the position
        MUST be reset it MUST be reset before returning. This is because the convert() method may be called immediately
        after accepts(), and will expect the file_stream to be at the original position.
        E.g.,
        cur_pos = file_stream.tell() # Save the current position
        data = file_stream.read(100) # ... peek at the first 100 bytes, etc.
        file_stream.seek(cur_pos)    # Reset the position to the original position
        Prameters:
        - file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.
        - stream_info: The StreamInfo object containing metadata about the file (mimetype, extension, charset, set)
        - kwargs: Additional keyword arguments for the converter.
        Returns:
        - bool: True if the converter can handle the document, False otherwise.
        """
        raise NotImplementedError(
            f"The subclass, {type(self).__name__}, must implement the accepts() method to determine if they can handle the document."
        )
    def convert(
        self,
        file_stream: BinaryIO,
--- a/packages/markitup/src/markitup/_markitup.py
+++ b/packages/markitup/src/markitup/_markitup.py
@ -1,26 +1,10 @@
 import copy
 import mimetypes
 import os
 import re
 import sys
 import shutil
 import tempfile
 import warnings
 import traceback
 import io
 from dataclasses import dataclass
 from importlib.metadata import entry_points
 from typing import Any, List, Dict, Optional, Union, BinaryIO
 from pathlib import Path
 from urllib.parse import urlparse
 from warnings import warn
-import requests
+import magic
 import magika
 import charset_normalizer
 import codecs
 from ._stream_info import StreamInfo
 from ._uri_utils import parse_data_uri, file_uri_to_path
 from .converters import (
    PlainTextConverter,
@ -43,718 +27,74 @@ from ._exceptions import (
 )
 # Lower priority values are tried first.
 PRIORITY_SPECIFIC_FILE_FORMAT = (
    0.0  # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia
 )
 PRIORITY_GENERIC_FILE_FORMAT = (
    10.0  # Near catch-all converters for mimetypes like text/*, etc.
 )
 _plugins: Union[None, List[Any]] = None  # If None, plugins have not been loaded yet.
 def _load_plugins() -> Union[None, List[Any]]:
    """Lazy load plugins, exiting early if already loaded."""
    global _plugins
    # Skip if we've already loaded plugins
    if _plugins is not None:
        return _plugins
    # Load plugins
    _plugins = []
    for entry_point in entry_points(group="markitup.plugin"):
        try:
            _plugins.append(entry_point.load())
        except Exception:
            tb = traceback.format_exc()
            warn(f"Plugin '{entry_point.name}' failed to load ... skipping:\n{tb}")
    return _plugins
@dataclass(kw_only=True, frozen=True)
 class ConverterRegistration:
    """A registration of a converter with its priority and other metadata."""
    converter: DocumentConverter
    priority: float
 class MarkItUp:
    """(In preview) An extremely simple text-based document reader, suitable for LLM use.
    This reader will convert common file-types or webpages to Markdown."""
    def __init__(
        self,
-        *,
+        config: Optional[Dict[str, Any]] = None,
        enable_builtins: Union[None, bool] = None,
        enable_plugins: Union[None, bool] = None,
        **kwargs,
    ):
-        self._builtins_enabled = False
+        self.config = config
        self._plugins_enabled = False
-        requests_session = kwargs.get("requests_session")
+    def convert(self, stream: BinaryIO) -> Dict[DocumentConverterResult, StreamInfo]:
-        if requests_session is None:
+        stream_info: StreamInfo = self._get_stream_info(stream)
-            self._requests_session = requests.Session()
+        # Deal with unsupported file types
-        else:
+        match stream_info.category:
-            self._requests_session = requests_session
+            case "ppt":
                raise UnsupportedFormatException(".ppt files are not supported, try .pptx instead")
            case "other":
                raise UnsupportedFormatException(f"{stream_info.magic_type} files are not supported")
        self._magika = magika.Magika()
        # TODO - remove these (see enable_builtins)
        self._llm_client: Any = None
        self._llm_model: Union[str | None] = None
        self._exiftool_path: Union[str | None] = None
        self._style_map: Union[str | None] = None
        # Register the converters
        self._converters: List[ConverterRegistration] = []
        if (
            enable_builtins is None or enable_builtins
        ):  # Default to True when not specified
            self.enable_builtins(**kwargs)
        if enable_plugins:
            self.enable_plugins(**kwargs)
    def enable_builtins(self, **kwargs) -> None:
        """
        Enable and register built-in converters.
        Built-in converters are enabled by default.
        This method should only be called once, if built-ins were initially disabled.
        """
        if not self._builtins_enabled:
            # TODO: Move these into converter constructors
            self._llm_client = kwargs.get("llm_client")
            self._llm_model = kwargs.get("llm_model")
            self._exiftool_path = kwargs.get("exiftool_path")
            self._style_map = kwargs.get("style_map")
            if self._exiftool_path is None:
                self._exiftool_path = os.getenv("EXIFTOOL_PATH")
            # Still none? Check well-known paths
            if self._exiftool_path is None:
                candidate = shutil.which("exiftool")
                if candidate:
                    candidate = os.path.abspath(candidate)
                    if any(
                        d == os.path.dirname(candidate)
                        for d in [
                            "/usr/bin",
                            "/usr/local/bin",
                            "/opt",
                            "/opt/bin",
                            "/opt/local/bin",
                            "/opt/homebrew/bin",
                            "C:\\Windows\\System32",
                            "C:\\Program Files",
                            "C:\\Program Files (x86)",
                        ]
                    ):
                        self._exiftool_path = candidate
            # Register converters for successful browsing operations
            # Later registrations are tried first / take higher priority than earlier registrations
            # To this end, the most specific converters should appear below the most generic converters
            self.register_converter(
                PlainTextConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT
            )
            self.register_converter(
                ZipConverter(markitup=self), priority=PRIORITY_GENERIC_FILE_FORMAT
            )
            self.register_converter(
                HtmlConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT
            )
            self.register_converter(RssConverter())
            self.register_converter(WikipediaConverter())
            self.register_converter(YouTubeConverter())
            self.register_converter(BingSerpConverter())
            self.register_converter(DocxConverter())
            self.register_converter(XlsxConverter())
            self.register_converter(XlsConverter())
            self.register_converter(PptxConverter())
            self.register_converter(AudioConverter())
            self.register_converter(ImageConverter())
            self.register_converter(IpynbConverter())
            self.register_converter(PdfConverter())
            self.register_converter(OutlookMsgConverter())
            self.register_converter(EpubConverter())
            self.register_converter(CsvConverter())
            # Register Document Intelligence converter at the top of the stack if endpoint is provided
            docintel_endpoint = kwargs.get("docintel_endpoint")
            if docintel_endpoint is not None:
                docintel_args: Dict[str, Any] = {}
                docintel_args["endpoint"] = docintel_endpoint
                docintel_credential = kwargs.get("docintel_credential")
                if docintel_credential is not None:
                    docintel_args["credential"] = docintel_credential
                docintel_types = kwargs.get("docintel_file_types")
                if docintel_types is not None:
                    docintel_args["file_types"] = docintel_types
                self.register_converter(
                    DocumentIntelligenceConverter(**docintel_args),
                )
            self._builtins_enabled = True
        else:
            warn("Built-in converters are already enabled.", RuntimeWarning)
    def enable_plugins(self, **kwargs) -> None:
        """
        Enable and register converters provided by plugins.
        Plugins are disabled by default.
        This method should only be called once, if plugins were initially disabled.
        """
        if not self._plugins_enabled:
            # Load plugins
            plugins = _load_plugins()
            assert plugins is not None
            for plugin in plugins:
        try:
-                    plugin.register_converters(self, **kwargs)
+            match stream_info.category:
-                except Exception:
+                case "text":
-                    tb = traceback.format_exc()
+                    return PlainTextConverter().convert(stream, stream_info), stream_info
-                    warn(f"Plugin '{plugin}' failed to register converters:\n{tb}")
+                case "pptx":
-            self._plugins_enabled = True
+                    return PptxConverter().convert(stream, stream_info), stream_info
                case "pdf":
                    return PdfConverter().convert(stream, stream_info), stream_info
        except FailedConversionAttempt:
            raise FileConversionException(f"Failed to convert file of type {stream_info.magic_type}")
        return stream_info
    def _get_stream_info(self, byte_stream: BinaryIO) -> StreamInfo:
        original_position = byte_stream.tell()
        # Reset stream position to beginning
        byte_stream.seek(0)
        # Get file content for analysis
        file_content = byte_stream.read()
        # Use python-magic to determine file type based on content
        magic_type = magic.from_buffer(file_content, mime=True)
        # Determine file category based on magic_type
        if magic_type.startswith("image/"):
            category = "image"
        elif magic_type.startswith("audio/"):
            category = "audio"
        elif magic_type.startswith("video/"):
            category = "video"
        elif magic_type.startswith("application/vnd.ms-excel"):
            category = 'xls'
        elif magic_type.startswith("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"):
            category = "xlsx"
        elif magic_type.startswith("application/vnd.ms-powerpoint"):
            category = 'ppt'
        elif magic_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
            category = "pptx"
        elif magic_type.startswith("application/msword"):
            category = 'doc'
        elif magic_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            category = "docx"
        elif magic_type == "application/pdf":
            category = "pdf"
        elif magic_type.startswith("text/"):
            category = "text"
        else:
-            warn("Plugins converters are already enabled.", RuntimeWarning)
+            category = "other"
-    def convert(
+        byte_stream.seek(original_position)
-        self,
+        return StreamInfo(magic_type=magic_type, category=category)
        source: Union[str, requests.Response, Path, BinaryIO],
        *,
        stream_info: Optional[StreamInfo] = None,
        **kwargs: Any,
    ) -> DocumentConverterResult:  # TODO: deal with kwargs
        """
        Args:
            - source: can be a path (str or Path), url, or a requests.response object
            - stream_info: optional stream info to use for the conversion. If None, infer from source
            - kwargs: additional arguments to pass to the converter
        """
        # Local path or url
        if isinstance(source, str):
            if (
                source.startswith("http:")
                or source.startswith("https:")
                or source.startswith("file:")
                or source.startswith("data:")
            ):
                # Rename the url argument to mock_url
                # (Deprecated -- use stream_info)
                _kwargs = {k: v for k, v in kwargs.items()}
                if "url" in _kwargs:
                    _kwargs["mock_url"] = _kwargs["url"]
                    del _kwargs["url"]
                return self.convert_uri(source, stream_info=stream_info, **_kwargs)
            else:
                return self.convert_local(source, stream_info=stream_info, **kwargs)
        # Path object
        elif isinstance(source, Path):
            return self.convert_local(source, stream_info=stream_info, **kwargs)
        # Request response
        elif isinstance(source, requests.Response):
            return self.convert_response(source, stream_info=stream_info, **kwargs)
        # Binary stream
        elif (
            hasattr(source, "read")
            and callable(source.read)
            and not isinstance(source, io.TextIOBase)
        ):
            return self.convert_stream(source, stream_info=stream_info, **kwargs)
        else:
            raise TypeError(
                f"Invalid source type: {type(source)}. Expected str, requests.Response, BinaryIO."
            )
    def convert_local(
        self,
        path: Union[str, Path],
        *,
        stream_info: Optional[StreamInfo] = None,
        file_extension: Optional[str] = None,  # Deprecated -- use stream_info
        url: Optional[str] = None,  # Deprecated -- use stream_info
        **kwargs: Any,
    ) -> DocumentConverterResult:
        if isinstance(path, Path):
            path = str(path)
        # Build a base StreamInfo object from which to start guesses
        base_guess = StreamInfo(
            local_path=path,
            extension=os.path.splitext(path)[1],
            filename=os.path.basename(path),
        )
        # Extend the base_guess with any additional info from the arguments
        if stream_info is not None:
            base_guess = base_guess.copy_and_update(stream_info)
        if file_extension is not None:
            # Deprecated -- use stream_info
            base_guess = base_guess.copy_and_update(extension=file_extension)
        if url is not None:
            # Deprecated -- use stream_info
            base_guess = base_guess.copy_and_update(url=url)
        with open(path, "rb") as fh:
            guesses = self._get_stream_info_guesses(
                file_stream=fh, base_guess=base_guess
            )
            return self._convert(file_stream=fh, stream_info_guesses=guesses, **kwargs)
    def convert_stream(
        self,
        stream: BinaryIO,
        *,
        stream_info: Optional[StreamInfo] = None,
        file_extension: Optional[str] = None,  # Deprecated -- use stream_info
        url: Optional[str] = None,  # Deprecated -- use stream_info
        **kwargs: Any,
    ) -> DocumentConverterResult:
        guesses: List[StreamInfo] = []
        # Do we have anything on which to base a guess?
        base_guess = None
        if stream_info is not None or file_extension is not None or url is not None:
            # Start with a non-Null base guess
            if stream_info is None:
                base_guess = StreamInfo()
            else:
                base_guess = stream_info
            if file_extension is not None:
                # Deprecated -- use stream_info
                assert base_guess is not None  # for mypy
                base_guess = base_guess.copy_and_update(extension=file_extension)
            if url is not None:
                # Deprecated -- use stream_info
                assert base_guess is not None  # for mypy
                base_guess = base_guess.copy_and_update(url=url)
        # Check if we have a seekable stream. If not, load the entire stream into memory.
        if not stream.seekable():
            buffer = io.BytesIO()
            while True:
                chunk = stream.read(4096)
                if not chunk:
                    break
                buffer.write(chunk)
            buffer.seek(0)
            stream = buffer
        # Add guesses based on stream content
        guesses = self._get_stream_info_guesses(
            file_stream=stream, base_guess=base_guess or StreamInfo()
        )
        return self._convert(file_stream=stream, stream_info_guesses=guesses, **kwargs)
    def convert_url(
        self,
        url: str,
        *,
        stream_info: Optional[StreamInfo] = None,
        file_extension: Optional[str] = None,
        mock_url: Optional[str] = None,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Alias for convert_uri()"""
        # convert_url will likely be deprecated in the future in favor of convert_uri
        return self.convert_uri(
            url,
            stream_info=stream_info,
            file_extension=file_extension,
            mock_url=mock_url,
            **kwargs,
        )
    def convert_uri(
        self,
        uri: str,
        *,
        stream_info: Optional[StreamInfo] = None,
        file_extension: Optional[str] = None,  # Deprecated -- use stream_info
        mock_url: Optional[
            str
        ] = None,  # Mock the request as if it came from a different URL
        **kwargs: Any,
    ) -> DocumentConverterResult:
        uri = uri.strip()
        # File URIs
        if uri.startswith("file:"):
            netloc, path = file_uri_to_path(uri)
            if netloc and netloc != "localhost":
                raise ValueError(
                    f"Unsupported file URI: {uri}. Netloc must be empty or localhost."
                )
            return self.convert_local(
                path,
                stream_info=stream_info,
                file_extension=file_extension,
                url=mock_url,
                **kwargs,
            )
        # Data URIs
        elif uri.startswith("data:"):
            mimetype, attributes, data = parse_data_uri(uri)
            base_guess = StreamInfo(
                mimetype=mimetype,
                charset=attributes.get("charset"),
            )
            if stream_info is not None:
                base_guess = base_guess.copy_and_update(stream_info)
            return self.convert_stream(
                io.BytesIO(data),
                stream_info=base_guess,
                file_extension=file_extension,
                url=mock_url,
                **kwargs,
            )
        # HTTP/HTTPS URIs
        elif uri.startswith("http:") or uri.startswith("https:"):
            response = self._requests_session.get(uri, stream=True)
            response.raise_for_status()
            return self.convert_response(
                response,
                stream_info=stream_info,
                file_extension=file_extension,
                url=mock_url,
                **kwargs,
            )
        else:
            raise ValueError(
                f"Unsupported URI scheme: {uri.split(':')[0]}. Supported schemes are: file:, data:, http:, https:"
            )
    def convert_response(
        self,
        response: requests.Response,
        *,
        stream_info: Optional[StreamInfo] = None,
        file_extension: Optional[str] = None,  # Deprecated -- use stream_info
        url: Optional[str] = None,  # Deprecated -- use stream_info
        **kwargs: Any,
    ) -> DocumentConverterResult:
        # If there is a content-type header, get the mimetype and charset (if present)
        mimetype: Optional[str] = None
        charset: Optional[str] = None
        if "content-type" in response.headers:
            parts = response.headers["content-type"].split(";")
            mimetype = parts.pop(0).strip()
            for part in parts:
                if part.strip().startswith("charset="):
                    _charset = part.split("=")[1].strip()
                    if len(_charset) > 0:
                        charset = _charset
        # If there is a content-disposition header, get the filename and possibly the extension
        filename: Optional[str] = None
        extension: Optional[str] = None
        if "content-disposition" in response.headers:
            m = re.search(r"filename=([^;]+)", response.headers["content-disposition"])
            if m:
                filename = m.group(1).strip("\"'")
                _, _extension = os.path.splitext(filename)
                if len(_extension) > 0:
                    extension = _extension
        # If there is still no filename, try to read it from the url
        if filename is None:
            parsed_url = urlparse(response.url)
            _, _extension = os.path.splitext(parsed_url.path)
            if len(_extension) > 0:  # Looks like this might be a file!
                filename = os.path.basename(parsed_url.path)
                extension = _extension
        # Create an initial guess from all this information
        base_guess = StreamInfo(
            mimetype=mimetype,
            charset=charset,
            filename=filename,
            extension=extension,
            url=response.url,
        )
        # Update with any additional info from the arguments
        if stream_info is not None:
            base_guess = base_guess.copy_and_update(stream_info)
        if file_extension is not None:
            # Deprecated -- use stream_info
            base_guess = base_guess.copy_and_update(extension=file_extension)
        if url is not None:
            # Deprecated -- use stream_info
            base_guess = base_guess.copy_and_update(url=url)
        # Read into BytesIO
        buffer = io.BytesIO()
        for chunk in response.iter_content(chunk_size=512):
            buffer.write(chunk)
        buffer.seek(0)
        # Convert
        guesses = self._get_stream_info_guesses(
            file_stream=buffer, base_guess=base_guess
        )
        return self._convert(file_stream=buffer, stream_info_guesses=guesses, **kwargs)
    def _convert(
        self, *, file_stream: BinaryIO, stream_info_guesses: List[StreamInfo], **kwargs
    ) -> DocumentConverterResult:
        res: Union[None, DocumentConverterResult] = None
        # Keep track of which converters throw exceptions
        failed_attempts: List[FailedConversionAttempt] = []
        # Create a copy of the page_converters list, sorted by priority.
        # We do this with each call to _convert because the priority of converters may change between calls.
        # The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
        sorted_registrations = sorted(self._converters, key=lambda x: x.priority)
        # Remember the initial stream position so that we can return to it
        cur_pos = file_stream.tell()
        for stream_info in stream_info_guesses + [StreamInfo()]:
            for converter_registration in sorted_registrations:
                converter = converter_registration.converter
                # Sanity check -- make sure the cur_pos is still the same
                assert (
                    cur_pos == file_stream.tell()
                ), f"File stream position should NOT change between guess iterations"
                _kwargs = {k: v for k, v in kwargs.items()}
                # Copy any additional global options
                if "llm_client" not in _kwargs and self._llm_client is not None:
                    _kwargs["llm_client"] = self._llm_client
                if "llm_model" not in _kwargs and self._llm_model is not None:
                    _kwargs["llm_model"] = self._llm_model
                if "style_map" not in _kwargs and self._style_map is not None:
                    _kwargs["style_map"] = self._style_map
                if "exiftool_path" not in _kwargs and self._exiftool_path is not None:
                    _kwargs["exiftool_path"] = self._exiftool_path
                # Add the list of converters for nested processing
                _kwargs["_parent_converters"] = self._converters
                # Add legaxy kwargs
                if stream_info is not None:
                    if stream_info.extension is not None:
                        _kwargs["file_extension"] = stream_info.extension
                    if stream_info.url is not None:
                        _kwargs["url"] = stream_info.url
                # Check if the converter will accept the file, and if so, try to convert it
                _accepts = False
                try:
                    _accepts = converter.accepts(file_stream, stream_info, **_kwargs)
                except NotImplementedError:
                    pass
                # accept() should not have changed the file stream position
                assert (
                    cur_pos == file_stream.tell()
                ), f"{type(converter).__name__}.accept() should NOT change the file_stream position"
                # Attempt the conversion
                if _accepts:
                    try:
                        res = converter.convert(file_stream, stream_info, **_kwargs)
                    except Exception:
                        failed_attempts.append(
                            FailedConversionAttempt(
                                converter=converter, exc_info=sys.exc_info()
                            )
                        )
                    finally:
                        file_stream.seek(cur_pos)
                if res is not None:
                    # Normalize the content
                    res.text_content = "\n".join(
                        [line.rstrip() for line in re.split(r"\r?\n", res.text_content)]
                    )
                    res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
                    return res
        # If we got this far without success, report any exceptions
        if len(failed_attempts) > 0:
            raise FileConversionException(attempts=failed_attempts)
        # Nothing can handle it!
        raise UnsupportedFormatException(
            f"Could not convert stream to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported."
        )
    def register_page_converter(self, converter: DocumentConverter) -> None:
        """DEPRECATED: User register_converter instead."""
        warn(
            "register_page_converter is deprecated. Use register_converter instead.",
            DeprecationWarning,
        )
        self.register_converter(converter)
    def register_converter(
        self,
        converter: DocumentConverter,
        *,
        priority: float = PRIORITY_SPECIFIC_FILE_FORMAT,
    ) -> None:
        """
        Register a DocumentConverter with a given priority.
        Priorities work as follows: By default, most converters get priority
        DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception
        is the PlainTextConverter, HtmlConverter, and ZipConverter, which get
        priority PRIORITY_SPECIFIC_FILE_FORMAT (== 10), with lower values
        being tried first (i.e., higher priority).
        Just prior to conversion, the converters are sorted by priority, using
        a stable sort. This means that converters with the same priority will
        remain in the same order, with the most recently registered converters
        appearing first.
        We have tight control over the order of built-in converters, but
        plugins can register converters in any order. The registration's priority
        field reasserts some control over the order of converters.
        Plugins can register converters with any priority, to appear before or
        after the built-ins. For example, a plugin with priority 9 will run
        before the PlainTextConverter, but after the built-in converters.
        """
        self._converters.insert(
            0, ConverterRegistration(converter=converter, priority=priority)
        )
    def _get_stream_info_guesses(
        self, file_stream: BinaryIO, base_guess: StreamInfo
    ) -> List[StreamInfo]:
        """
        Given a base guess, attempt to guess or expand on the stream info using the stream content (via magika).
        """
        guesses: List[StreamInfo] = []
        # Enhance the base guess with information based on the extension or mimetype
        enhanced_guess = base_guess.copy_and_update()
        # If there's an extension and no mimetype, try to guess the mimetype
        if base_guess.mimetype is None and base_guess.extension is not None:
            _m, _ = mimetypes.guess_type(
                "placeholder" + base_guess.extension, strict=False
            )
            if _m is not None:
                enhanced_guess = enhanced_guess.copy_and_update(mimetype=_m)
        # If there's a mimetype and no extension, try to guess the extension
        if base_guess.mimetype is not None and base_guess.extension is None:
            _e = mimetypes.guess_all_extensions(base_guess.mimetype, strict=False)
            if len(_e) > 0:
                enhanced_guess = enhanced_guess.copy_and_update(extension=_e[0])
        # Call magika to guess from the stream
        cur_pos = file_stream.tell()
        try:
            result = self._magika.identify_stream(file_stream)
            if result.status == "ok" and result.prediction.output.label != "unknown":
                # If it's text, also guess the charset
                charset = None
                if result.prediction.output.is_text:
                    # Read the first 4k to guess the charset
                    file_stream.seek(cur_pos)
                    stream_page = file_stream.read(4096)
                    charset_result = charset_normalizer.from_bytes(stream_page).best()
                    if charset_result is not None:
                        charset = self._normalize_charset(charset_result.encoding)
                # Normalize the first extension listed
                guessed_extension = None
                if len(result.prediction.output.extensions) > 0:
                    guessed_extension = "." + result.prediction.output.extensions[0]
                # Determine if the guess is compatible with the base guess
                compatible = True
                if (
                    base_guess.mimetype is not None
                    and base_guess.mimetype != result.prediction.output.mime_type
                ):
                    compatible = False
                if (
                    base_guess.extension is not None
                    and base_guess.extension.lstrip(".")
                    not in result.prediction.output.extensions
                ):
                    compatible = False
                if (
                    base_guess.charset is not None
                    and self._normalize_charset(base_guess.charset) != charset
                ):
                    compatible = False
                if compatible:
                    # Add the compatible base guess
                    guesses.append(
                        StreamInfo(
                            mimetype=base_guess.mimetype
                            or result.prediction.output.mime_type,
                            extension=base_guess.extension or guessed_extension,
                            charset=base_guess.charset or charset,
                            filename=base_guess.filename,
                            local_path=base_guess.local_path,
                            url=base_guess.url,
                        )
                    )
                else:
                    # The magika guess was incompatible with the base guess, so add both guesses
                    guesses.append(enhanced_guess)
                    guesses.append(
                        StreamInfo(
                            mimetype=result.prediction.output.mime_type,
                            extension=guessed_extension,
                            charset=charset,
                            filename=base_guess.filename,
                            local_path=base_guess.local_path,
                            url=base_guess.url,
                        )
                    )
            else:
                # There were no other guesses, so just add the base guess
                guesses.append(enhanced_guess)
        finally:
            file_stream.seek(cur_pos)
        return guesses
    def _normalize_charset(self, charset: str | None) -> str | None:
        """
        Normalize a charset string to a canonical form.
        """
        if charset is None:
            return None
        try:
            return codecs.lookup(charset).name
        except LookupError:
            return charset
--- a/packages/markitup/src/markitup/converters/_html_converter.py
+++ b/packages/markitup/src/markitup/converters/_html_converter.py
@ -19,25 +19,6 @@ ACCEPTED_FILE_CATEGORY = [
 class HtmlConverter(DocumentConverter):
    """Anything with content type text/html"""
    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        magic_type = (stream_info.magic_type or "").lower()
        category = (stream_info.category or "").lower()
        if category in ACCEPTED_FILE_CATEGORY:
            return True
        for prefix in ACCEPTED_MAGIC_TYPE_PREFIXES:
            if magic_type.startswith(prefix):
                return True
        return False
    def convert(
        self,
        file_stream: BinaryIO,
--- a/packages/markitup/src/markitup/converters/_pdf_converter.py
+++ b/packages/markitup/src/markitup/converters/_pdf_converter.py
@ -1,32 +1,9 @@
 import sys
 import io
 from typing import BinaryIO, Any
 from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
-
+import pdfminer.high_level
 # Try loading optional (but in this case, required) dependencies
 # Save reporting of any exceptions for later
 _dependency_exc_info = None
 try:
    import pdfminer
    import pdfminer.high_level
 except ImportError:
    # Preserve the error and stack trace for later
    _dependency_exc_info = sys.exc_info()
 ACCEPTED_MIME_TYPE_PREFIXES = [
    "application/pdf",
    "application/x-pdf",
 ]
 ACCEPTED_FILE_EXTENSIONS = [".pdf"]
 class PdfConverter(DocumentConverter):
@ -34,45 +11,12 @@ class PdfConverter(DocumentConverter):
    Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
    """
    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()
        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True
        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return True
        return False
    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        # Check the dependencies
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".pdf",
                    feature="pdf",
                )
            ) from _dependency_exc_info[
                1
            ].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
            )
        assert isinstance(file_stream, io.IOBase)  # for mypy
        return DocumentConverterResult(
            markdown=pdfminer.high_level.extract_text(file_stream),
        )
--- a/packages/markitup/src/markitup/converters/_plain_text_converter.py
+++ b/packages/markitup/src/markitup/converters/_plain_text_converter.py
@ -1,62 +1,16 @@
 import sys
 from typing import BinaryIO, Any
 from charset_normalizer import from_bytes
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
 ACCEPTED_MIME_TYPE_PREFIXES = [
    "text/",
    "application/json",
    "application/markdown",
 ]
 ACCEPTED_FILE_EXTENSIONS = [
    ".txt",
    ".text",
    ".md",
    ".markdown",
    ".json",
    ".jsonl",
 ]
 class PlainTextConverter(DocumentConverter):
    """Anything with content type text/plain"""
    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()
        # If we have a charset, we can safely assume it's text
        # With Magika in the earlier stages, this handles most cases
        if stream_info.charset is not None:
            return True
        # Otherwise, check the mimetype and extension
        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True
        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return True
        return False
    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        if stream_info.charset:
            text_content = file_stream.read().decode(stream_info.charset)
        else:
        text_content = str(from_bytes(file_stream.read()).best())
        return DocumentConverterResult(markdown=text_content)
--- a/packages/markitup/src/markitup/converters/_pptx_converter.py
+++ b/packages/markitup/src/markitup/converters/_pptx_converter.py
@ -30,24 +30,6 @@ class PptxConverter(DocumentConverter):
        super().__init__()
        self._html_converter = HtmlConverter()
    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        magic_type = (stream_info.magic_type or "").lower()
        category = (stream_info.category or "").lower()
        if category in ACCEPTED_FILE_CATEGORY:
            return True
        for prefix in ACCEPTED_MAGIC_TYPE_PREFIXES:
            if magic_type.startswith(prefix):
                return True
        return False
    def convert(
        self,
        file_stream: BinaryIO,
--- a/packages/markitup/tests/test_files/test.pdf
+++ b/packages/markitup/tests/test_files/test.pdf
--- a/packages/markitup/tests/test_files/test.ppt
+++ b/packages/markitup/tests/test_files/test.ppt
--- a/packages/markitup/tests/test_files/test.txt
+++ b/packages/markitup/tests/test_files/test.txt
@ -0,0 +1,4 @@
 Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
 Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
 This sample TXT file is provided by Sample-Files.com. Visit us for more sample files and resources.