supports pptx

This commit is contained in:
rong-xyz 2025-04-21 09:37:43 +00:00
parent 615975f918
commit 555a849a66
10 changed files with 277 additions and 169 deletions

View file

@ -28,10 +28,7 @@ dependencies = [
"markdownify",
"magika~=0.6.1",
"charset-normalizer",
]
[project.optional-dependencies]
all = [
"python-magic>=0.4.27",
"python-pptx",
"mammoth",
"pandas",
@ -46,12 +43,6 @@ all = [
"azure-ai-documentintelligence",
"azure-identity"
]
pptx = ["python-pptx"]
docx = ["mammoth", "lxml"]
xlsx = ["pandas", "openpyxl"]
xls = ["pandas", "xlrd"]
pdf = ["pdfminer.six"]
[tool.hatch.version]
path = "src/markitup/__about__.py"
@ -60,16 +51,14 @@ path = "src/markitup/__about__.py"
markitup = "markitup.__main__:main"
[tool.hatch.envs.default]
features = ["all"]
# No features needed since everything is installed by default
[tool.hatch.envs.hatch-test]
features = ["all"]
extra-dependencies = [
"openai",
]
[tool.hatch.envs.types]
features = ["all"]
extra-dependencies = [
"openai",
"mypy>=1.0.0",
@ -98,4 +87,4 @@ exclude_lines = [
]
[tool.hatch.build.targets.sdist]
only-include = ["src/markitup"]
only-include = ["src/markitup"]

View file

@ -25,22 +25,12 @@ from ._uri_utils import parse_data_uri, file_uri_to_path
from .converters import (
PlainTextConverter,
HtmlConverter,
RssConverter,
WikipediaConverter,
YouTubeConverter,
IpynbConverter,
BingSerpConverter,
PdfConverter,
DocxConverter,
XlsxConverter,
XlsConverter,
PptxConverter,
ImageConverter,
AudioConverter,
OutlookMsgConverter,
ZipConverter,
EpubConverter,
DocumentIntelligenceConverter,
# AudioConverter,
CsvConverter,
)

View file

@ -2,31 +2,7 @@ from dataclasses import dataclass, asdict
from typing import Optional
@dataclass(kw_only=True, frozen=True)
@dataclass
class StreamInfo:
"""The StreamInfo class is used to store information about a file stream.
All fields can be None, and will depend on how the stream was opened.
"""
mimetype: Optional[str] = None
extension: Optional[str] = None
charset: Optional[str] = None
filename: Optional[
str
] = None # From local path, url, or Content-Disposition header
local_path: Optional[str] = None # If read from disk
url: Optional[str] = None # If read from url
def copy_and_update(self, *args, **kwargs):
"""Copy the StreamInfo object and update it with the given StreamInfo
instance and/or other keyword arguments."""
new_info = asdict(self)
for si in args:
assert isinstance(si, StreamInfo)
new_info.update({k: v for k, v in asdict(si).items() if v is not None})
if len(kwargs) > 0:
new_info.update(kwargs)
return StreamInfo(**new_info)
magic_type: Optional[str] = None
category: Optional[str] = None

View file

@ -0,0 +1,102 @@
import os
from io import BytesIO
from markitup._stream_info import StreamInfo
import magic
def read_files_to_bytestreams(folder_path="packages/markitup/tests/test_files"):
    """Read every regular file in *folder_path* into an in-memory stream.

    Subdirectories are skipped; only top-level regular files are loaded.

    Args:
        folder_path (str): Path to the folder containing files.

    Returns:
        dict: Maps each filename to a ``BytesIO`` holding the file's bytes,
            positioned at offset 0.

    Raises:
        FileNotFoundError: If *folder_path* does not exist.
    """
    if not os.path.exists(folder_path):
        raise FileNotFoundError(f"Folder '{folder_path}' not found")

    byte_streams = {}
    # Sorted for a deterministic iteration order across platforms.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        # Only regular files are loaded; subdirectories are ignored.
        if os.path.isfile(file_path):
            with open(file_path, "rb") as f:
                # BytesIO(data) already starts at position 0, so no
                # explicit seek(0) is needed (the original had a redundant one).
                byte_streams[filename] = BytesIO(f.read())
    return byte_streams
def detect_file_types(file_dict):
    """Sniff the content of each named byte stream and classify it.

    Detection is purely content-based (python-magic); filenames are used
    only as dictionary keys, never for type inference. Each stream's
    position is restored to where the caller left it.

    Args:
        file_dict (dict): Dictionary with filenames as keys and BytesIO
            objects as values.

    Returns:
        dict: Dictionary with filenames as keys and ``StreamInfo`` values
            carrying ``magic_type`` (MIME string) and a coarse ``category``.
    """

    def classify(mime):
        # First match wins; office formats accept both the legacy
        # "vnd.ms-*" prefixes and the exact OOXML MIME types.
        if mime.startswith("image/"):
            return "image"
        if mime.startswith("audio/"):
            return "audio"
        if mime.startswith("video/"):
            return "video"
        if (
            mime.startswith("application/vnd.ms-excel")
            or mime == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        ):
            return "xls"
        if (
            mime.startswith("application/vnd.ms-powerpoint")
            or mime == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
        ):
            return "ppt"
        if (
            mime.startswith("application/msword")
            or mime == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        ):
            return "doc"
        if mime == "application/pdf":
            return "pdf"
        if mime.startswith("text/"):
            return "text"
        return "other"

    detected = {}
    for name, stream in file_dict.items():
        # Remember the caller's position, rewind, read, then restore.
        saved_position = stream.tell()
        stream.seek(0)
        payload = stream.read()
        stream.seek(saved_position)

        mime = magic.from_buffer(payload, mime=True)
        detected[name] = StreamInfo(magic_type=mime, category=classify(mime))
    return detected

View file

@ -8,13 +8,15 @@ from ._pdf_converter import PdfConverter
from ._docx_converter import DocxConverter
from ._xlsx_converter import XlsxConverter, XlsConverter
from ._pptx_converter import PptxConverter
from ._audio_converter import AudioConverter
# from ._audio_converter import AudioConverter
from ._csv_converter import CsvConverter
from ._markdownify import _CustomMarkdownify
__all__ = [
"PlainTextConverter",
"HtmlConverter",
"RssConverter",
"_CustomMarkdownify",
"WikipediaConverter",
"YouTubeConverter",
"IpynbConverter",
@ -25,7 +27,7 @@ __all__ = [
"XlsConverter",
"PptxConverter",
"ImageConverter",
"AudioConverter",
# "AudioConverter",
"OutlookMsgConverter",
"ZipConverter",
"DocumentIntelligenceConverter",

View file

@ -2,7 +2,6 @@ import io
from typing import Any, BinaryIO, Optional
from ._exiftool import exiftool_metadata
from ._transcribe_audio import transcribe_audio
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException

View file

@ -6,12 +6,12 @@ from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from ._markdownify import _CustomMarkdownify
ACCEPTED_MIME_TYPE_PREFIXES = [
ACCEPTED_MAGIC_TYPE_PREFIXES = [
"text/html",
"application/xhtml",
]
ACCEPTED_FILE_EXTENSIONS = [
ACCEPTED_FILE_CATEGORY = [
".html",
".htm",
]
@ -26,14 +26,14 @@ class HtmlConverter(DocumentConverter):
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
magic_type = (stream_info.magic_type or "").lower()
category = (stream_info.category or "").lower()
if extension in ACCEPTED_FILE_EXTENSIONS:
if category in ACCEPTED_FILE_CATEGORY:
return True
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
for prefix in ACCEPTED_MAGIC_TYPE_PREFIXES:
if magic_type.startswith(prefix):
return True
return False
@ -45,7 +45,7 @@ class HtmlConverter(DocumentConverter):
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
# Parse the stream
encoding = "utf-8" if stream_info.charset is None else stream_info.charset
encoding = "utf-8"
soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
# Remove javascript and style blocks
@ -81,10 +81,8 @@ class HtmlConverter(DocumentConverter):
return self.convert(
file_stream=io.BytesIO(html_content.encode("utf-8")),
stream_info=StreamInfo(
mimetype="text/html",
extension=".html",
charset="utf-8",
url=url,
magic_type="text/html",
category="text",
),
**kwargs,
)

View file

@ -0,0 +1,111 @@
import re
import markdownify
from typing import Any, Optional
from urllib.parse import quote, unquote, urlparse, urlunparse
class _CustomMarkdownify(markdownify.MarkdownConverter):
    """
    A custom version of markdownify's MarkdownConverter. Changes include:

    - Altering the default heading style to use '#', '##', etc.
    - Removing javascript hyperlinks.
    - Truncating images with large data:uri sources.
    - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
    """

    def __init__(self, **options: Any):
        # Default to ATX-style headings ('#', '##', ...) unless the caller
        # explicitly chose another style.
        options["heading_style"] = options.get("heading_style", markdownify.ATX)
        # Data URIs in <img> tags are truncated by default (see convert_img).
        options["keep_data_uris"] = options.get("keep_data_uris", False)
        # Explicitly cast options to the expected type if necessary
        super().__init__(**options)

    def convert_hn(
        self,
        n: int,
        el: Any,
        text: str,
        convert_as_inline: Optional[bool] = False,
        **kwargs,
    ) -> str:
        """Same as usual, but be sure to start with a new line"""
        if not convert_as_inline:
            # Only prepend a newline when the text does not already begin
            # with one, so headings never run into preceding content.
            if not re.search(r"^\n", text):
                return "\n" + super().convert_hn(n, el, text, convert_as_inline)  # type: ignore

        return super().convert_hn(n, el, text, convert_as_inline)  # type: ignore

    def convert_a(
        self,
        el: Any,
        text: str,
        convert_as_inline: Optional[bool] = False,
        **kwargs,
    ):
        """Same as usual converter, but removes Javascript links and escapes URIs."""
        prefix, suffix, text = markdownify.chomp(text)  # type: ignore
        if not text:
            return ""

        # Inside <pre> blocks, emit plain text: link syntax would corrupt
        # preformatted content.
        if el.find_parent("pre") is not None:
            return text
        href = el.get("href")
        title = el.get("title")

        # Escape URIs and skip non-http or file schemes
        if href:
            try:
                parsed_url = urlparse(href)  # type: ignore
                if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]:  # type: ignore
                    # Drop the hyperlink (e.g. javascript:) but keep its text.
                    return "%s%s%s" % (prefix, text, suffix)
                # Re-quote the path so Markdown-significant characters in the
                # URL cannot break the emitted link syntax.
                href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path))))  # type: ignore
            except ValueError:  # It's not clear if this ever gets thrown
                return "%s%s%s" % (prefix, text, suffix)

        # For the replacement see #29: text nodes underscores are escaped
        if (
            self.options["autolinks"]
            and text.replace(r"\_", "_") == href
            and not title
            and not self.options["default_title"]
        ):
            # Shortcut syntax
            return "<%s>" % href
        if self.options["default_title"] and not title:
            title = href
        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
        return (
            "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix)
            if href
            else text
        )

    def convert_img(
        self,
        el: Any,
        text: str,
        convert_as_inline: Optional[bool] = False,
        **kwargs,
    ) -> str:
        """Same as usual converter, but removes data URIs"""

        alt = el.attrs.get("alt", None) or ""
        src = el.attrs.get("src", None) or ""
        title = el.attrs.get("title", None) or ""
        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
        # In inline context, images not whitelisted by keep_inline_images_in
        # collapse to their alt text.
        if (
            convert_as_inline
            and el.parent.name not in self.options["keep_inline_images_in"]
        ):
            return alt

        # Remove dataURIs
        if src.startswith("data:") and not self.options["keep_data_uris"]:
            # Keep only the "data:<mediatype>" prefix so output stays compact.
            src = src.split(",")[0] + "..."

        return "![%s](%s%s)" % (alt, src, title_part)

    def convert_soup(self, soup: Any) -> str:
        # Thin pass-through to the base implementation.
        return super().convert_soup(soup)  # type: ignore

View file

@ -11,23 +11,14 @@ from operator import attrgetter
from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_dependency_exc_info = None
try:
import pptx
except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()
import pptx
ACCEPTED_MIME_TYPE_PREFIXES = [
ACCEPTED_MAGIC_TYPE_PREFIXES = [
"application/vnd.openxmlformats-officedocument.presentationml",
]
ACCEPTED_FILE_EXTENSIONS = [".pptx"]
ACCEPTED_FILE_CATEGORY = [".pptx"]
class PptxConverter(DocumentConverter):
@ -45,14 +36,14 @@ class PptxConverter(DocumentConverter):
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
magic_type = (stream_info.magic_type or "").lower()
category = (stream_info.category or "").lower()
if extension in ACCEPTED_FILE_EXTENSIONS:
if category in ACCEPTED_FILE_CATEGORY:
return True
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
for prefix in ACCEPTED_MAGIC_TYPE_PREFIXES:
if magic_type.startswith(prefix):
return True
return False
@ -63,19 +54,6 @@ class PptxConverter(DocumentConverter):
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
# Check the dependencies
if _dependency_exc_info is not None:
raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format(
converter=type(self).__name__,
extension=".pptx",
feature="pptx",
)
) from _dependency_exc_info[
1
].with_traceback( # type: ignore[union-attr]
_dependency_exc_info[2]
)
# Perform the conversion
presentation = pptx.Presentation(file_stream)
@ -109,15 +87,12 @@ class PptxConverter(DocumentConverter):
alt_text = re.sub(r"\s+", " ", alt_text).strip()
# If keep_data_uris is True, use base64 encoding for images
if kwargs.get("keep_data_uris", False):
blob = shape.image.blob
content_type = shape.image.content_type or "image/png"
b64_string = base64.b64encode(blob).decode("utf-8")
md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n"
else:
# A placeholder name
filename = re.sub(r"\W", "", shape.name) + ".jpg"
md_content += "\n![" + alt_text + "](" + filename + ")\n"
blob = shape.image.blob
content_type = shape.image.content_type or "image/png"
b64_string = base64.b64encode(blob).decode("utf-8")
md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n"
# Tables
if self._is_table(shape):

View file

@ -505,92 +505,49 @@ wheels = [
name = "markitup"
source = { editable = "." }
dependencies = [
{ name = "beautifulsoup4" },
{ name = "charset-normalizer" },
{ name = "magika" },
{ name = "markdownify" },
{ name = "requests" },
]
[package.optional-dependencies]
all = [
{ name = "azure-ai-documentintelligence" },
{ name = "azure-identity" },
{ name = "beautifulsoup4" },
{ name = "charset-normalizer" },
{ name = "lxml" },
{ name = "magika" },
{ name = "mammoth" },
{ name = "markdownify" },
{ name = "olefile" },
{ name = "openpyxl" },
{ name = "pandas" },
{ name = "pdfminer-six" },
{ name = "pydub" },
{ name = "python-magic" },
{ name = "python-pptx" },
{ name = "requests" },
{ name = "speechrecognition" },
{ name = "xlrd" },
{ name = "youtube-transcript-api" },
]
audio-transcription = [
{ name = "pydub" },
{ name = "speechrecognition" },
]
docx = [
{ name = "lxml" },
{ name = "mammoth" },
]
outlook = [
{ name = "olefile" },
]
pdf = [
{ name = "pdfminer-six" },
]
pptx = [
{ name = "python-pptx" },
]
xls = [
{ name = "pandas" },
{ name = "xlrd" },
]
xlsx = [
{ name = "openpyxl" },
{ name = "pandas" },
]
youtube-transcription = [
{ name = "youtube-transcript-api" },
]
[package.metadata]
requires-dist = [
{ name = "azure-ai-documentintelligence", marker = "extra == 'all'" },
{ name = "azure-identity", marker = "extra == 'all'" },
{ name = "azure-ai-documentintelligence" },
{ name = "azure-identity" },
{ name = "beautifulsoup4" },
{ name = "charset-normalizer" },
{ name = "lxml", marker = "extra == 'all'" },
{ name = "lxml", marker = "extra == 'docx'" },
{ name = "lxml" },
{ name = "magika", specifier = "~=0.6.1" },
{ name = "mammoth", marker = "extra == 'all'" },
{ name = "mammoth", marker = "extra == 'docx'" },
{ name = "mammoth" },
{ name = "markdownify" },
{ name = "olefile", marker = "extra == 'all'" },
{ name = "olefile", marker = "extra == 'outlook'" },
{ name = "openpyxl", marker = "extra == 'all'" },
{ name = "openpyxl", marker = "extra == 'xlsx'" },
{ name = "pandas", marker = "extra == 'all'" },
{ name = "pandas", marker = "extra == 'xls'" },
{ name = "pandas", marker = "extra == 'xlsx'" },
{ name = "pdfminer-six", marker = "extra == 'all'" },
{ name = "pdfminer-six", marker = "extra == 'pdf'" },
{ name = "pydub", marker = "extra == 'all'" },
{ name = "pydub", marker = "extra == 'audio-transcription'" },
{ name = "python-pptx", marker = "extra == 'all'" },
{ name = "python-pptx", marker = "extra == 'pptx'" },
{ name = "olefile" },
{ name = "openpyxl" },
{ name = "pandas" },
{ name = "pdfminer-six" },
{ name = "pydub" },
{ name = "python-magic", specifier = ">=0.4.27" },
{ name = "python-pptx" },
{ name = "requests" },
{ name = "speechrecognition", marker = "extra == 'all'" },
{ name = "speechrecognition", marker = "extra == 'audio-transcription'" },
{ name = "xlrd", marker = "extra == 'all'" },
{ name = "xlrd", marker = "extra == 'xls'" },
{ name = "youtube-transcript-api", marker = "extra == 'all'", specifier = "~=1.0.0" },
{ name = "youtube-transcript-api", marker = "extra == 'youtube-transcription'" },
{ name = "speechrecognition" },
{ name = "xlrd" },
{ name = "youtube-transcript-api", specifier = "~=1.0.0" },
]
provides-extras = ["all", "audio-transcription", "docx", "outlook", "pdf", "pptx", "xls", "xlsx", "youtube-transcription"]
[[package]]
name = "mpmath"
@ -966,6 +923,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/1e/18/98a99ad95133c6a6e2005fe89faedf294a748bd5dc803008059409ac9b1e/python_dotenv-1.1.0-py3-none-any.whl", hash = "sha256:d7c01d9e2293916c18baf562d95698754b0dbbb5e74d457c45d4f6561fb9d55d", size = 20256 },
]
[[package]]
name = "python-magic"
version = "0.4.27"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/da/db/0b3e28ac047452d079d375ec6798bf76a036a08182dbb39ed38116a49130/python-magic-0.4.27.tar.gz", hash = "sha256:c1ba14b08e4a5f5c31a302b7721239695b2f0f058d125bd5ce1ee36b9d9d3c3b", size = 14677 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/6c/73/9f872cb81fc5c3bb48f7227872c28975f998f3e7c2b1c16e95e6432bbb90/python_magic-0.4.27-py2.py3-none-any.whl", hash = "sha256:c212960ad306f700aa0d01e5d7a325d20548ff97eb9920dcd29513174f0294d3", size = 13840 },
]
[[package]]
name = "python-pptx"
version = "1.0.2"