modality

2025-04-22 07:00:30 +00:00 · 2025-04-22 07:00:30 +00:00 · 03f3fa9829
commit 03f3fa9829
parent e729da2b38
14 changed files with 66 additions and 47 deletions
--- a/packages/markitup/src/markitup/__init__.py
+++ b/packages/markitup/src/markitup/__init__.py
@@ -7,7 +7,7 @@ from ._markitup import (
    MarkItUp,
 )
 from ._base_converter import DocumentConverterResult, DocumentConverter
-from ._stream_info import StreamInfo
+from ._schemas import StreamInfo, Config
 from ._exceptions import (
    MarkItUpException,
    MissingDependencyException,
@@ -27,4 +27,5 @@ __all__ = [
    "FileConversionException",
    "UnsupportedFormatException",
    "StreamInfo",
    "Config"
 ]
--- a/packages/markitup/src/markitup/_base_converter.py
+++ b/packages/markitup/src/markitup/_base_converter.py
@@ -2,7 +2,7 @@ import os
 import tempfile
 from warnings import warn
 from typing import Any, Union, BinaryIO, Optional, List, Dict
-from ._stream_info import StreamInfo
+from ._schemas import StreamInfo
 import re
@@ -40,7 +40,6 @@ class DocumentConverterResult:
                                (text and images) in their original order.
        """
        # Pattern to match markdown image syntax with base64 data
        pattern = r'!\[(.*?)\]\(data:(.*?);base64,(.*?)\)'
--- a/packages/markitup/src/markitup/_markitup.py
+++ b/packages/markitup/src/markitup/_markitup.py
@@ -4,7 +4,7 @@ from urllib.parse import urlparse
 from warnings import warn
 import magic
-from ._stream_info import StreamInfo
+from ._schemas import StreamInfo, Config
 from .converters import (
    PlainTextConverter,
@@ -33,7 +33,7 @@ class MarkItUp:
    def __init__(
        self,
-        config: Optional[Dict[str, Any]] = None,
+        config: Config = Config(),
    ):
        self.config = config
@@ -42,9 +42,11 @@ class MarkItUp:
        # Deal with unsupported file types
        match stream_info.category:
            case "ppt":
-                raise UnsupportedFormatException(".ppt files are not supported, try .pptx instead")
+                raise UnsupportedFormatException(
                    ".ppt files are not supported, try .pptx instead")
            case "other":
-                raise UnsupportedFormatException(f"{stream_info.magic_type} files are not supported")
+                raise UnsupportedFormatException(
                    f"{stream_info.magic_type} files are not supported")
        try:
            match stream_info.category:
@@ -55,7 +57,8 @@ class MarkItUp:
                case "pdf":
                    return PdfConverter().convert(stream, stream_info), stream_info
        except FailedConversionAttempt:
-            raise FileConversionException(f"Failed to convert file of type {stream_info.magic_type}")
+            raise FileConversionException(
                f"Failed to convert file of type {stream_info.magic_type}")
        return stream_info
    def _get_stream_info(self, byte_stream: BinaryIO) -> StreamInfo:
--- a/packages/markitup/src/markitup/_schemas.py
+++ b/packages/markitup/src/markitup/_schemas.py
@@ -0,0 +1,15 @@
 from dataclasses import dataclass, asdict, field
 from typing import Optional, List, Literal
@dataclass
 class StreamInfo:
    magic_type: Optional[str] = None
    category: Optional[str] = None
@dataclass
 class Config:
    modality: List[Literal["image", "audio"]] = field(
        default_factory=lambda: ["image", "audio"]
    )
--- a/packages/markitup/src/markitup/_stream_info.py
+++ b/packages/markitup/src/markitup/_stream_info.py
@@ -1,8 +0,0 @@
 from dataclasses import dataclass, asdict
 from typing import Optional
@dataclass
 class StreamInfo:
    magic_type: Optional[str] = None
    category: Optional[str] = None
--- a/packages/markitup/src/markitup/converter_utils/utils.py
+++ b/packages/markitup/src/markitup/converter_utils/utils.py
@@ -1,6 +1,6 @@
 import os
 from io import BytesIO
-from markitup._stream_info import StreamInfo
+from markitup._schemas import StreamInfo
 import magic
--- a/packages/markitup/src/markitup/converters/_audio_converter.py
+++ b/packages/markitup/src/markitup/converters/_audio_converter.py
@@ -3,7 +3,7 @@ from typing import Any, BinaryIO, Optional
 from ._exiftool import exiftool_metadata
 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._stream_info import StreamInfo
+from .._schemas import StreamInfo
 from .._exceptions import MissingDependencyException
 ACCEPTED_MIME_TYPE_PREFIXES = [
@@ -91,7 +91,8 @@ class AudioConverter(DocumentConverter):
        # Transcribe
        if audio_format:
            try:
-                transcript = transcribe_audio(file_stream, audio_format=audio_format)
+                transcript = transcribe_audio(
                    file_stream, audio_format=audio_format)
                if transcript:
                    md_content += "\n\n### Audio Transcript:\n" + transcript
            except MissingDependencyException:
--- a/packages/markitup/src/markitup/converters/_csv_converter.py
+++ b/packages/markitup/src/markitup/converters/_csv_converter.py
@@ -5,7 +5,7 @@ from typing import BinaryIO, Any
 from charset_normalizer import from_bytes
 from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._stream_info import StreamInfo
+from .._schemas import StreamInfo
 ACCEPTED_MIME_TYPE_PREFIXES = [
    "text/csv",
--- a/packages/markitup/src/markitup/converters/_docx_converter.py
+++ b/packages/markitup/src/markitup/converters/_docx_converter.py
@@ -5,7 +5,7 @@ from typing import BinaryIO, Any
 from ._html_converter import HtmlConverter
 from ..converter_utils.docx.pre_process import pre_process_docx
 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._stream_info import StreamInfo
+from .._schemas import StreamInfo
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 # Try loading optional (but in this case, required) dependencies
@@ -75,6 +75,7 @@ class DocxConverter(HtmlConverter):
        style_map = kwargs.get("style_map", None)
        pre_process_stream = pre_process_docx(file_stream)
        return self._html_converter.convert_string(
-            mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
+            mammoth.convert_to_html(
                pre_process_stream, style_map=style_map).value,
            **kwargs,
        )
--- a/packages/markitup/src/markitup/converters/_html_converter.py
+++ b/packages/markitup/src/markitup/converters/_html_converter.py
@@ -3,7 +3,7 @@ from typing import Any, BinaryIO, Optional
 from bs4 import BeautifulSoup
 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._stream_info import StreamInfo
+from .._schemas import StreamInfo
 from ._markdownify import _CustomMarkdownify
 ACCEPTED_MAGIC_TYPE_PREFIXES = [
@@ -19,6 +19,7 @@ ACCEPTED_FILE_CATEGORY = [
 class HtmlConverter(DocumentConverter):
    """Anything with content type text/html"""
    def convert(
        self,
        file_stream: BinaryIO,
@@ -27,7 +28,8 @@ class HtmlConverter(DocumentConverter):
    ) -> DocumentConverterResult:
        # Parse the stream
        encoding = "utf-8"
-        soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
+        soup = BeautifulSoup(file_stream, "html.parser",
                             from_encoding=encoding)
        # Remove javascript and style blocks
        for script in soup(["script", "style"]):
--- a/packages/markitup/src/markitup/converters/_pdf_converter.py
+++ b/packages/markitup/src/markitup/converters/_pdf_converter.py
@@ -3,7 +3,7 @@ import io
 import base64
 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._stream_info import StreamInfo
+from .._schemas import StreamInfo
 import fitz
@@ -47,7 +47,8 @@ class PdfConverter(DocumentConverter):
                    try:
                        # Convert image to base64 for markdown embedding
-                        img_base64 = base64.b64encode(image_bytes).decode('utf-8')
+                        img_base64 = base64.b64encode(
                            image_bytes).decode('utf-8')
                        # Add image to markdown with a unique identifier
                        image_count += 1
                        markdown_content += f"![Image {image_count}](data:image/{image_ext};base64,{img_base64})\n\n"
--- a/packages/markitup/src/markitup/converters/_plain_text_converter.py
+++ b/packages/markitup/src/markitup/converters/_plain_text_converter.py
@@ -1,11 +1,12 @@
 from typing import BinaryIO, Any
 from charset_normalizer import from_bytes
 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._stream_info import StreamInfo
+from .._schemas import StreamInfo
 class PlainTextConverter(DocumentConverter):
    """Anything with content type text/plain"""
    def convert(
        self,
        file_stream: BinaryIO,
--- a/packages/markitup/src/markitup/converters/_pptx_converter.py
+++ b/packages/markitup/src/markitup/converters/_pptx_converter.py
@@ -10,7 +10,7 @@ from operator import attrgetter
 from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._stream_info import StreamInfo
+from .._schemas import StreamInfo
 import pptx
@@ -58,7 +58,8 @@ class PptxConverter(DocumentConverter):
                    # Also grab any description embedded in the deck
                    try:
-                        alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
+                        alt_text = shape._element._nvXxPr.cNvPr.attrib.get(
                            "descr", "")
                    except Exception:
                        # Unable to get alt text
                        pass
@@ -75,10 +76,10 @@ class PptxConverter(DocumentConverter):
                    b64_string = base64.b64encode(blob).decode("utf-8")
                    md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n"
                # Tables
                if self._is_table(shape):
-                    md_content += self._convert_table_to_markdown(shape.table, **kwargs)
+                    md_content += self._convert_table_to_markdown(
                        shape.table, **kwargs)
                # Charts
                if shape.has_chart:
@@ -93,7 +94,8 @@ class PptxConverter(DocumentConverter):
                # Group Shapes
                if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP:
-                    sorted_shapes = sorted(shape.shapes, key=attrgetter("top", "left"))
+                    sorted_shapes = sorted(
                        shape.shapes, key=attrgetter("top", "left"))
                    for subshape in sorted_shapes:
                        get_shape_content(subshape, **kwargs)
@@ -141,7 +143,8 @@ class PptxConverter(DocumentConverter):
        html_table += "</table></body></html>"
        return (
-            self._html_converter.convert_string(html_table, **kwargs).markdown.strip()
+            self._html_converter.convert_string(
                html_table, **kwargs).markdown.strip()
            + "\n"
        )
--- a/packages/markitup/src/markitup/converters/_xlsx_converter.py
+++ b/packages/markitup/src/markitup/converters/_xlsx_converter.py
@@ -3,7 +3,7 @@ from typing import BinaryIO, Any
 from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
-from .._stream_info import StreamInfo
+from .._schemas import StreamInfo
 # Try loading optional (but in this case, required) dependencies
 # Save reporting of any exceptions for later