added image in html converter

2025-04-22 11:26:07 +00:00 · 2025-04-22 11:26:07 +00:00 · e521dbcf2d
commit e521dbcf2d
parent 4519f9230c
10 changed files with 105 additions and 177 deletions
--- a/packages/markitup/src/markitup/_markitup.py
+++ b/packages/markitup/src/markitup/_markitup.py
@ -40,14 +40,6 @@ class MarkItUp:
    def convert(self, stream: BinaryIO) -> Dict[DocumentConverterResult, StreamInfo]:
        stream_info: StreamInfo = self._get_stream_info(stream)
        # Deal with unsupported file types
-        match stream_info.category:
-            case "ppt":
-                raise UnsupportedFormatException(
-                    ".ppt files are not supported, try .pptx instead")
-            case "other":
-                raise UnsupportedFormatException(
-                    f"{stream_info.magic_type} files are not supported")
-
        try:
            match stream_info.category:
                case "text":
@ -59,13 +51,24 @@ class MarkItUp:
                case "audio":
                    return AudioConverter(config=self.config).convert(stream, stream_info), stream_info
                case "xlsx":
-                    return XlsxConverter().convert(stream, stream_info), stream_info
+                    return XlsxConverter(config=self.config).convert(stream, stream_info), stream_info
                case "xls":
-                    return XlsConverter().convert(stream, stream_info), stream_info
+                    return XlsConverter(config=self.config).convert(stream, stream_info), stream_info
+                case "csv":
+                    return CsvConverter().convert(stream, stream_info), stream_info
+                case "docx":
+                    return DocxConverter(config=self.config).convert(stream, stream_info), stream_info
+                case _:
+                    match stream_info.category:
+                        case "ppt":
+                            raise UnsupportedFormatException(
+                                ".ppt files are not supported, try .pptx instead")
+                        case "other":
+                            raise UnsupportedFormatException(
+                                f"{stream_info.magic_type} files are not supported")
        except FailedConversionAttempt:
            raise FileConversionException(
                f"Failed to convert file of type {stream_info.magic_type}")
-        return stream_info

    def _get_stream_info(self, byte_stream: BinaryIO) -> StreamInfo:
        original_position = byte_stream.tell()
@ -100,8 +103,13 @@ class MarkItUp:
            category = "docx"
        elif magic_type == "application/pdf":
            category = "pdf"
+        elif magic_type == "application/csv":
+            category = "csv"
        elif magic_type.startswith("text/"):
-            category = "text"
+            if magic_type == "text/csv":
+                category = "csv"
+            else:
+                category = "text"
        else:
            category = "other"

--- a/packages/markitup/src/markitup/converter_utils/utils.py
+++ b/packages/markitup/src/markitup/converter_utils/utils.py
@ -42,64 +42,6 @@ def read_files_to_bytestreams(folder_path="packages/markitup/tests/test_files"):
    return byte_streams


-def detect_file_types(file_dict):
-    """
-    Detects file types for a dictionary of {filename: BytesIO} pairs
-    using only magic type (content-based detection)
-
-    Args:
-        file_dict (dict): Dictionary with filenames as keys and BytesIO objects as values
-
-    Returns:
-        dict: Dictionary with filenames as keys and file type information as values
-    """
-    result = {}
-
-    for filename, byte_stream in file_dict.items():
-        # Get the original position to reset later
-        original_position = byte_stream.tell()
-
-        # Reset stream position to beginning
-        byte_stream.seek(0)
-
-        # Get file content for analysis
-        file_content = byte_stream.read()
-
-        # Use python-magic to determine file type based on content
-        magic_type = magic.from_buffer(file_content, mime=True)
-
-        # Determine file category based on magic_type
-        if magic_type.startswith("image/"):
-            category = "image"
-        elif magic_type.startswith("audio/"):
-            category = "audio"
-        elif magic_type.startswith("video/"):
-            category = "video"
-        elif magic_type.startswith("application/vnd.ms-excel"):
-            category = 'xls'
-        elif magic_type.startswith("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"):
-            category = "xlsx"
-        elif magic_type.startswith("application/vnd.ms-powerpoint"):
-            category = 'ppt'
-        elif magic_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
-            category = "pptx"
-        elif magic_type.startswith("application/msword"):
-            category = 'doc'
-        elif magic_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
-            category = "docx"
-        elif magic_type == "application/pdf":
-            category = "pdf"
-        elif magic_type.startswith("text/"):
-            category = "text"
-        else:
-            category = "other"
-
-        byte_stream.seek(original_position)
-        result[filename] = StreamInfo(magic_type=magic_type, category=category)
-
-    return result
-
-
 def transcribe_audio(file_stream: BinaryIO, *, magic_type: str = "audio/mpeg") -> str:
    audio_format = 'mp3' if magic_type == 'audio/mpeg' else 'wav' if magic_type == 'audio/x-wav' else None

--- a/packages/markitup/src/markitup/converters/_csv_converter.py
+++ b/packages/markitup/src/markitup/converters/_csv_converter.py
@ -3,40 +3,15 @@ import csv
 import io
 from typing import BinaryIO, Any
 from charset_normalizer import from_bytes
-from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._schemas import StreamInfo

-ACCEPTED_MIME_TYPE_PREFIXES = [
-    "text/csv",
-    "application/csv",
-]
-ACCEPTED_FILE_EXTENSIONS = [".csv"]
-

 class CsvConverter(DocumentConverter):
    """
    Converts CSV files to Markdown tables.
    """

-    def __init__(self):
-        super().__init__()
-
-    def accepts(
-        self,
-        file_stream: BinaryIO,
-        stream_info: StreamInfo,
-        **kwargs: Any,  # Options to pass to the converter
-    ) -> bool:
-        mimetype = (stream_info.mimetype or "").lower()
-        extension = (stream_info.extension or "").lower()
-        if extension in ACCEPTED_FILE_EXTENSIONS:
-            return True
-        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
-            if mimetype.startswith(prefix):
-                return True
-        return False
-
    def convert(
        self,
        file_stream: BinaryIO,
@ -44,10 +19,7 @@ class CsvConverter(DocumentConverter):
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        # Read the file content
-        if stream_info.charset:
-            content = file_stream.read().decode(stream_info.charset)
-        else:
-            content = str(from_bytes(file_stream.read()).best())
+        content = str(from_bytes(file_stream.read()).best())

        # Parse CSV content
        reader = csv.reader(io.StringIO(content))
--- a/packages/markitup/src/markitup/converters/_docx_converter.py
+++ b/packages/markitup/src/markitup/converters/_docx_converter.py
@ -5,24 +5,8 @@ from typing import BinaryIO, Any
 from ._html_converter import HtmlConverter
 from ..converter_utils.docx.pre_process import pre_process_docx
 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._schemas import StreamInfo
-from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
-
-# Try loading optional (but in this case, required) dependencies
-# Save reporting of any exceptions for later
-_dependency_exc_info = None
-try:
-    import mammoth
-except ImportError:
-    # Preserve the error and stack trace for later
-    _dependency_exc_info = sys.exc_info()
-
-
-ACCEPTED_MIME_TYPE_PREFIXES = [
-    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-]
-
-ACCEPTED_FILE_EXTENSIONS = [".docx"]
+from .._schemas import StreamInfo, Config
+import mammoth


 class DocxConverter(HtmlConverter):
@ -30,27 +14,8 @@ class DocxConverter(HtmlConverter):
    Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
    """

-    def __init__(self):
-        super().__init__()
-        self._html_converter = HtmlConverter()
-
-    def accepts(
-        self,
-        file_stream: BinaryIO,
-        stream_info: StreamInfo,
-        **kwargs: Any,  # Options to pass to the converter
-    ) -> bool:
-        mimetype = (stream_info.mimetype or "").lower()
-        extension = (stream_info.extension or "").lower()
-
-        if extension in ACCEPTED_FILE_EXTENSIONS:
-            return True
-
-        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
-            if mimetype.startswith(prefix):
-                return True
-
-        return False
+    def __init__(self, config: Config):
+        self._html_converter = HtmlConverter(config=config)

    def convert(
        self,
@ -58,19 +23,6 @@ class DocxConverter(HtmlConverter):
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
-        # Check: the dependencies
-        if _dependency_exc_info is not None:
-            raise MissingDependencyException(
-                MISSING_DEPENDENCY_MESSAGE.format(
-                    converter=type(self).__name__,
-                    extension=".docx",
-                    feature="docx",
-                )
-            ) from _dependency_exc_info[
-                1
-            ].with_traceback(  # type: ignore[union-attr]
-                _dependency_exc_info[2]
-            )

        style_map = kwargs.get("style_map", None)
        pre_process_stream = pre_process_docx(file_stream)
--- a/packages/markitup/src/markitup/converters/_html_converter.py
+++ b/packages/markitup/src/markitup/converters/_html_converter.py
@ -3,7 +3,7 @@ from typing import Any, BinaryIO, Optional
 from bs4 import BeautifulSoup

 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._schemas import StreamInfo
+from .._schemas import StreamInfo, Config
 from ._markdownify import _CustomMarkdownify

 ACCEPTED_MAGIC_TYPE_PREFIXES = [
@ -20,6 +20,9 @@ ACCEPTED_FILE_CATEGORY = [
 class HtmlConverter(DocumentConverter):
    """Anything with content type text/html"""

+    def __init__(self, config: Config):
+        self.config = config
+
    def convert(
        self,
        file_stream: BinaryIO,
@ -39,15 +42,17 @@ class HtmlConverter(DocumentConverter):
        body_elm = soup.find("body")
        webpage_text = ""
        if body_elm:
-            webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
+            webpage_text = _CustomMarkdownify(
+                config=self.config, **kwargs).convert_soup(body_elm)
        else:
-            webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
+            webpage_text = _CustomMarkdownify(
+                config=self.config, **kwargs).convert_soup(soup)

        assert isinstance(webpage_text, str)

        # remove leading and trailing \n
        webpage_text = webpage_text.strip()
-
+        print(webpage_text)
        return DocumentConverterResult(
            markdown=webpage_text,
            title=None if soup.title is None else soup.title.string,
--- a/packages/markitup/src/markitup/converters/_markdownify.py
+++ b/packages/markitup/src/markitup/converters/_markdownify.py
@ -3,6 +3,7 @@ import markdownify

 from typing import Any, Optional
 from urllib.parse import quote, unquote, urlparse, urlunparse
+from .._schemas import Config


 class _CustomMarkdownify(markdownify.MarkdownConverter):
@ -15,11 +16,13 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
    - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
    """

-    def __init__(self, **options: Any):
-        options["heading_style"] = options.get("heading_style", markdownify.ATX)
+    def __init__(self, config: Config, **options: Any):
+        options["heading_style"] = options.get(
+            "heading_style", markdownify.ATX)
        options["keep_data_uris"] = options.get("keep_data_uris", False)
        # Explicitly cast options to the expected type if necessary
        super().__init__(**options)
+        self.config = config

    def convert_hn(
        self,
@ -58,9 +61,11 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
        if href:
            try:
                parsed_url = urlparse(href)  # type: ignore
-                if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]:  # type: ignore
+                # type: ignore
+                if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]:
                    return "%s%s%s" % (prefix, text, suffix)
-                href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path))))  # type: ignore
+                href = urlunparse(parsed_url._replace(
+                    path=quote(unquote(parsed_url.path))))  # type: ignore
            except ValueError:  # It's not clear if this ever gets thrown
                return "%s%s%s" % (prefix, text, suffix)

@ -95,17 +100,11 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
        src = el.attrs.get("src", None) or ""
        title = el.attrs.get("title", None) or ""
        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
-        if (
-            convert_as_inline
-            and el.parent.name not in self.options["keep_inline_images_in"]
-        ):
+
+        if "image" in self.config.modalities:
+            return "![%s](%s%s)" % (alt, src, title_part)
+        else:
            return alt

-        # Remove dataURIs
-        if src.startswith("data:") and not self.options["keep_data_uris"]:
-            src = src.split(",")[0] + "..."
-
-        return "![%s](%s%s)" % (alt, src, title_part)
-
    def convert_soup(self, soup: Any) -> str:
-        return super().convert_soup(soup)  # type: ignore
+        return super().convert_soup(soup)  # type: ignore
--- a/packages/markitup/src/markitup/converters/_pptx_converter.py
+++ b/packages/markitup/src/markitup/converters/_pptx_converter.py
@ -27,7 +27,7 @@ class PptxConverter(DocumentConverter):
    """

    def __init__(self, config: Config):
-        self._html_converter = HtmlConverter()
+        self._html_converter = HtmlConverter(config=config)
        self.config = config

    def convert(
--- a/packages/markitup/src/markitup/converters/_xlsx_converter.py
+++ b/packages/markitup/src/markitup/converters/_xlsx_converter.py
@ -1,7 +1,7 @@
 from typing import BinaryIO, Any
 from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._schemas import StreamInfo
+from .._schemas import StreamInfo, Config
 import pandas as pd


@ -10,8 +10,8 @@ class XlsxConverter(DocumentConverter):
    Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
    """

-    def __init__(self):
-        self._html_converter = HtmlConverter()
+    def __init__(self, config: Config):
+        self._html_converter = HtmlConverter(config=config)

    def convert(
        self,
@ -39,9 +39,8 @@ class XlsConverter(DocumentConverter):
    Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
    """

-    def __init__(self):
-        super().__init__()
-        self._html_converter = HtmlConverter()
+    def __init__(self, config: Config):
+        self._html_converter = HtmlConverter(config=config)

    def convert(
        self,
--- a/packages/markitup/tests/test_files/test.csv
+++ b/packages/markitup/tests/test_files/test.csv
@ -0,0 +1,51 @@
+ID,Name,Age,Country,Email
+1,Name_1,62,Country_1,email_1@example.com
+2,Name_2,48,Country_2,email_2@example.com
+3,Name_3,61,Country_3,email_3@example.com
+4,Name_4,32,Country_4,email_4@example.com
+5,Name_5,69,Country_5,email_5@example.com
+6,Name_6,32,Country_6,email_6@example.com
+7,Name_7,62,Country_7,email_7@example.com
+8,Name_8,39,Country_8,email_8@example.com
+9,Name_9,40,Country_9,email_9@example.com
+10,Name_10,32,Country_0,email_10@example.com
+11,Name_11,24,Country_1,email_11@example.com
+12,Name_12,45,Country_2,email_12@example.com
+13,Name_13,39,Country_3,email_13@example.com
+14,Name_14,18,Country_4,email_14@example.com
+15,Name_15,66,Country_5,email_15@example.com
+16,Name_16,48,Country_6,email_16@example.com
+17,Name_17,60,Country_7,email_17@example.com
+18,Name_18,31,Country_8,email_18@example.com
+19,Name_19,43,Country_9,email_19@example.com
+20,Name_20,33,Country_0,email_20@example.com
+21,Name_21,32,Country_1,email_21@example.com
+22,Name_22,68,Country_2,email_22@example.com
+23,Name_23,44,Country_3,email_23@example.com
+24,Name_24,32,Country_4,email_24@example.com
+25,Name_25,33,Country_5,email_25@example.com
+26,Name_26,46,Country_6,email_26@example.com
+27,Name_27,38,Country_7,email_27@example.com
+28,Name_28,50,Country_8,email_28@example.com
+29,Name_29,68,Country_9,email_29@example.com
+30,Name_30,66,Country_0,email_30@example.com
+31,Name_31,60,Country_1,email_31@example.com
+32,Name_32,53,Country_2,email_32@example.com
+33,Name_33,30,Country_3,email_33@example.com
+34,Name_34,30,Country_4,email_34@example.com
+35,Name_35,43,Country_5,email_35@example.com
+36,Name_36,44,Country_6,email_36@example.com
+37,Name_37,31,Country_7,email_37@example.com
+38,Name_38,35,Country_8,email_38@example.com
+39,Name_39,56,Country_9,email_39@example.com
+40,Name_40,35,Country_0,email_40@example.com
+41,Name_41,62,Country_1,email_41@example.com
+42,Name_42,63,Country_2,email_42@example.com
+43,Name_43,51,Country_3,email_43@example.com
+44,Name_44,52,Country_4,email_44@example.com
+45,Name_45,66,Country_5,email_45@example.com
+46,Name_46,69,Country_6,email_46@example.com
+47,Name_47,68,Country_7,email_47@example.com
+48,Name_48,68,Country_8,email_48@example.com
+49,Name_49,69,Country_9,email_49@example.com
+50,Name_50,46,Country_0,email_50@example.com
--- a/packages/markitup/tests/test_files/test.docx
+++ b/packages/markitup/tests/test_files/test.docx