diff --git a/packages/markitup/pyproject.toml b/packages/markitup/pyproject.toml index cc2034f..5bcd9af 100644 --- a/packages/markitup/pyproject.toml +++ b/packages/markitup/pyproject.toml @@ -32,8 +32,6 @@ dependencies = [ "python-pptx", "mammoth", "pandas", - "openpyxl", - "xlrd", "lxml", "olefile", "pydub", diff --git a/packages/markitup/src/markitup/__init__.py b/packages/markitup/src/markitup/__init__.py index aef329a..b06156d 100644 --- a/packages/markitup/src/markitup/__init__.py +++ b/packages/markitup/src/markitup/__init__.py @@ -7,9 +7,8 @@ from ._markitup import ( MarkItUp, ) from ._base_converter import DocumentConverterResult, DocumentConverter -from ._stream_info import StreamInfo +from ._schemas import StreamInfo, Config from ._exceptions import ( - MarkItUpException, MissingDependencyException, FailedConversionAttempt, FileConversionException, @@ -21,10 +20,10 @@ __all__ = [ "MarkItUp", "DocumentConverter", "DocumentConverterResult", - "MarkItUpException", "MissingDependencyException", "FailedConversionAttempt", "FileConversionException", "UnsupportedFormatException", "StreamInfo", + "Config" ] diff --git a/packages/markitup/src/markitup/_base_converter.py b/packages/markitup/src/markitup/_base_converter.py index cdabef9..0d5300d 100644 --- a/packages/markitup/src/markitup/_base_converter.py +++ b/packages/markitup/src/markitup/_base_converter.py @@ -2,8 +2,9 @@ import os import tempfile from warnings import warn from typing import Any, Union, BinaryIO, Optional, List, Dict -from ._stream_info import StreamInfo +from ._schemas import StreamInfo import re +import base64 class DocumentConverterResult: @@ -11,9 +12,11 @@ class DocumentConverterResult: def __init__( self, - markdown: str, + markdown: str = "", *, title: Optional[str] = None, + audio_stream: Optional[BinaryIO] = None, + stream_info: Optional[StreamInfo] = None, ): """ Initialize the DocumentConverterResult. @@ -26,20 +29,21 @@ class DocumentConverterResult: - title: Optional title of the document. """ self.markdown = markdown + self.audio_stream = audio_stream self.title = title - + self.stream_info = stream_info + def to_llm(self) -> List[Dict[str, Any]]: """ Convert markdown with base64 images to a format compatible with OpenAI's API. - + This function parses the markdown content, extracting text and images in their original order, and returns a list of content elements in OpenAI's format. - + Returns: List[Dict[str, Any]]: A list of dictionaries representing the content elements (text and images) in their original order. """ - # Pattern to match markdown image syntax with base64 data pattern = r'!\[(.*?)\]\(data:(.*?);base64,(.*?)\)' @@ -80,7 +84,14 @@ class DocumentConverterResult: "type": "text", "text": text_chunk }) - + if self.audio_stream: + audio_b64 = base64.b64encode( + self.audio_stream.read()).decode('utf-8') + content.append({ + "type": "media", + "mime_type": self.stream_info.magic_type, + "data": audio_b64 + }) return content @property @@ -105,7 +116,7 @@ class DocumentConverter: self, file_stream: BinaryIO, stream_info: StreamInfo, - **kwargs: Any, # Options to pass to the converter + ** kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: """ Convert a document to Markdown text. diff --git a/packages/markitup/src/markitup/_exceptions.py b/packages/markitup/src/markitup/_exceptions.py index fca098f..b3a478e 100644 --- a/packages/markitup/src/markitup/_exceptions.py +++ b/packages/markitup/src/markitup/_exceptions.py @@ -8,15 +8,7 @@ MISSING_DEPENDENCY_MESSAGE = """{converter} recognized the input as a potential * etc.""" -class MarkItUpException(Exception): - """ - Base exception class for MarkItUp. - """ - - pass - - -class MissingDependencyException(MarkItUpException): +class MissingDependencyException(Exception): """ Converters shipped with MarkItUp may depend on optional dependencies. This exception is thrown when a converter's @@ -31,7 +23,7 @@ class MissingDependencyException(MarkItUpException): pass -class UnsupportedFormatException(MarkItUpException): +class UnsupportedFormatException(Exception): """ Thrown when no suitable converter was found for the given file. """ @@ -39,17 +31,16 @@ class UnsupportedFormatException(MarkItUpException): pass -class FailedConversionAttempt(object): +class FailedConversionAttempt(Exception): """ - Represents an a single attempt to convert a file. + Represents a single attempt to convert a file. """ - def __init__(self, converter: Any, exc_info: Optional[tuple] = None): - self.converter = converter - self.exc_info = exc_info + def __init__(self): + super().__init__(f"Conversion attempt failed!") -class FileConversionException(MarkItUpException): +class FileConversionException(Exception): """ Thrown when a suitable converter was found, but the conversion process fails for any reason. diff --git a/packages/markitup/src/markitup/_markitup.py b/packages/markitup/src/markitup/_markitup.py index c2fb0a2..a0b2186 100644 --- a/packages/markitup/src/markitup/_markitup.py +++ b/packages/markitup/src/markitup/_markitup.py @@ -4,7 +4,7 @@ from urllib.parse import urlparse from warnings import warn import magic -from ._stream_info import StreamInfo +from ._schemas import StreamInfo, Config from .converters import ( PlainTextConverter, @@ -14,7 +14,7 @@ from .converters import ( XlsxConverter, XlsConverter, PptxConverter, - # AudioConverter, + AudioConverter, CsvConverter, ) @@ -33,30 +33,42 @@ class MarkItUp: def __init__( self, - config: Optional[Dict[str, Any]] = None, + config: Config = Config(), ): self.config = config def convert(self, stream: BinaryIO) -> Dict[DocumentConverterResult, StreamInfo]: stream_info: StreamInfo = self._get_stream_info(stream) # Deal with unsupported file types - match stream_info.category: - case "ppt": - raise UnsupportedFormatException(".ppt files are not supported, try .pptx instead") - case "other": - raise UnsupportedFormatException(f"{stream_info.magic_type} files are not supported") - try: match stream_info.category: case "text": return PlainTextConverter().convert(stream, stream_info), stream_info case "pptx": - return PptxConverter().convert(stream, stream_info), stream_info + return PptxConverter(config=self.config).convert(stream, stream_info), stream_info case "pdf": - return PdfConverter().convert(stream, stream_info), stream_info + return PdfConverter(config=self.config).convert(stream, stream_info), stream_info + case "audio": + return AudioConverter(config=self.config).convert(stream, stream_info), stream_info + case "xlsx": + return XlsxConverter(config=self.config).convert(stream, stream_info), stream_info + case "xls": + return XlsConverter(config=self.config).convert(stream, stream_info), stream_info + case "csv": + return CsvConverter().convert(stream, stream_info), stream_info + case "docx": + return DocxConverter(config=self.config).convert(stream, stream_info), stream_info + case _: + match stream_info.category: + case "ppt": + raise UnsupportedFormatException( + ".ppt files are not supported, try .pptx instead") + case "other": + raise UnsupportedFormatException( + f"{stream_info.magic_type} files are not supported") except FailedConversionAttempt: - raise FileConversionException(f"Failed to convert file of type {stream_info.magic_type}") - return stream_info + raise FileConversionException( + f"Failed to convert file of type {stream_info.magic_type}") def _get_stream_info(self, byte_stream: BinaryIO) -> StreamInfo: original_position = byte_stream.tell() @@ -91,10 +103,15 @@ class MarkItUp: category = "docx" elif magic_type == "application/pdf": category = "pdf" + elif magic_type == "application/csv": + category = "csv" elif magic_type.startswith("text/"): - category = "text" + if magic_type == "text/csv": + category = "csv" + else: + category = "text" else: category = "other" byte_stream.seek(original_position) - return StreamInfo(magic_type=magic_type, category=category) \ No newline at end of file + return StreamInfo(magic_type=magic_type, category=category) diff --git a/packages/markitup/src/markitup/_schemas.py b/packages/markitup/src/markitup/_schemas.py new file mode 100644 index 0000000..9cbe1c9 --- /dev/null +++ b/packages/markitup/src/markitup/_schemas.py @@ -0,0 +1,15 @@ +from dataclasses import dataclass, asdict, field +from typing import Optional, List, Literal + + +@dataclass +class StreamInfo: + magic_type: Optional[str] = None + category: Optional[str] = None + + +@dataclass +class Config: + modalities: List[Literal["image", "audio"]] = field( + default_factory=lambda: ["image", "audio"] + ) diff --git a/packages/markitup/src/markitup/_stream_info.py b/packages/markitup/src/markitup/_stream_info.py deleted file mode 100644 index 66e8c72..0000000 --- a/packages/markitup/src/markitup/_stream_info.py +++ /dev/null @@ -1,8 +0,0 @@ -from dataclasses import dataclass, asdict -from typing import Optional - - -@dataclass -class StreamInfo: - magic_type: Optional[str] = None - category: Optional[str] = None \ No newline at end of file diff --git a/packages/markitup/src/markitup/converter_utils/utils.py b/packages/markitup/src/markitup/converter_utils/utils.py index 8d5df3d..a62c3be 100644 --- a/packages/markitup/src/markitup/converter_utils/utils.py +++ b/packages/markitup/src/markitup/converter_utils/utils.py @@ -1,7 +1,11 @@ import os from io import BytesIO -from markitup._stream_info import StreamInfo +from markitup._schemas import StreamInfo import magic +import speech_recognition as sr +import pydub +import io +from typing import BinaryIO def read_files_to_bytestreams(folder_path="packages/markitup/tests/test_files"): @@ -38,65 +42,23 @@ def read_files_to_bytestreams(folder_path="packages/markitup/tests/test_files"): return byte_streams -def detect_file_types(file_dict): - """ - Detects file types for a dictionary of {filename: BytesIO} pairs - using only magic type (content-based detection) +def transcribe_audio(file_stream: BinaryIO, *, magic_type: str = "audio/mpeg") -> str: + audio_format = 'mp3' if magic_type == 'audio/mpeg' else 'wav' if magic_type == 'audio/x-wav' else None - Args: - file_dict (dict): Dictionary with filenames as keys and BytesIO objects as values + match audio_format: + case 'mp3': + audio_segment = pydub.AudioSegment.from_file( + file_stream, format=audio_format) + audio_source = io.BytesIO() + audio_segment.export(audio_source, format="wav") + audio_source.seek(0) + case 'wav': + audio_source = file_stream + case _: + raise ValueError(f"Unsupported audio format: {magic_type}") - Returns: - dict: Dictionary with filenames as keys and file type information as values - """ - result = {} - - for filename, byte_stream in file_dict.items(): - # Get the original position to reset later - original_position = byte_stream.tell() - - # Reset stream position to beginning - byte_stream.seek(0) - - # Get file content for analysis - file_content = byte_stream.read() - - # Use python-magic to determine file type based on content - magic_type = magic.from_buffer(file_content, mime=True) - - # Determine file category based on magic_type - if magic_type.startswith("image/"): - category = "image" - elif magic_type.startswith("audio/"): - category = "audio" - elif magic_type.startswith("video/"): - category = "video" - elif ( - magic_type.startswith("application/vnd.ms-excel") - or magic_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" - ): - category = "xls" - elif ( - magic_type.startswith("application/vnd.ms-powerpoint") - or magic_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation" - ): - category = "ppt" - elif ( - magic_type.startswith("application/msword") - or magic_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" - ): - category = "doc" - elif magic_type == "application/pdf": - category = "pdf" - elif magic_type.startswith("text/"): - category = "text" - else: - category = "other" - - # Store the results - result[filename] = StreamInfo(magic_type=magic_type, category=category) - - # Reset stream position - byte_stream.seek(original_position) - - return result + recognizer = sr.Recognizer() + with sr.AudioFile(audio_source) as source: + audio = recognizer.record(source) + transcript = recognizer.recognize_google(audio).strip() + return "[No speech detected]" if transcript == "" else transcript diff --git a/packages/markitup/src/markitup/converters/__init__.py b/packages/markitup/src/markitup/converters/__init__.py index a82b80b..5aea2af 100644 --- a/packages/markitup/src/markitup/converters/__init__.py +++ b/packages/markitup/src/markitup/converters/__init__.py @@ -8,7 +8,7 @@ from ._pdf_converter import PdfConverter from ._docx_converter import DocxConverter from ._xlsx_converter import XlsxConverter, XlsConverter from ._pptx_converter import PptxConverter -# from ._audio_converter import AudioConverter +from ._audio_converter import AudioConverter from ._csv_converter import CsvConverter from ._markdownify import _CustomMarkdownify @@ -27,7 +27,7 @@ __all__ = [ "XlsConverter", "PptxConverter", "ImageConverter", - # "AudioConverter", + "AudioConverter", "OutlookMsgConverter", "ZipConverter", "DocumentIntelligenceConverter", diff --git a/packages/markitup/src/markitup/converters/_audio_converter.py b/packages/markitup/src/markitup/converters/_audio_converter.py index 57a828d..eec84cd 100644 --- a/packages/markitup/src/markitup/converters/_audio_converter.py +++ b/packages/markitup/src/markitup/converters/_audio_converter.py @@ -1,23 +1,10 @@ import io -from typing import Any, BinaryIO, Optional +from typing import Any, BinaryIO, Optional, Tuple -from ._exiftool import exiftool_metadata from .._base_converter import DocumentConverter, DocumentConverterResult -from .._stream_info import StreamInfo +from .._schemas import StreamInfo, Config from .._exceptions import MissingDependencyException - -ACCEPTED_MIME_TYPE_PREFIXES = [ - "audio/x-wav", - "audio/mpeg", - "video/mp4", -] - -ACCEPTED_FILE_EXTENSIONS = [ - ".wav", - ".mp3", - ".m4a", - ".mp4", -] +from ..converter_utils.utils import transcribe_audio class AudioConverter(DocumentConverter): @@ -25,77 +12,25 @@ class AudioConverter(DocumentConverter): Converts audio files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed). """ - def accepts( - self, - file_stream: BinaryIO, - stream_info: StreamInfo, - **kwargs: Any, # Options to pass to the converter - ) -> bool: - mimetype = (stream_info.mimetype or "").lower() - extension = (stream_info.extension or "").lower() - - if extension in ACCEPTED_FILE_EXTENSIONS: - return True - - for prefix in ACCEPTED_MIME_TYPE_PREFIXES: - if mimetype.startswith(prefix): - return True - - return False + def __init__(self, config: Config): + self.config = config def convert( self, file_stream: BinaryIO, stream_info: StreamInfo, - **kwargs: Any, # Options to pass to the converter - ) -> DocumentConverterResult: + ** kwargs: Any, # Options to pass to the converter + ) -> Tuple[DocumentConverterResult, StreamInfo]: md_content = "" - # Add metadata - metadata = exiftool_metadata( - file_stream, exiftool_path=kwargs.get("exiftool_path") - ) - if metadata: - for f in [ - "Title", - "Artist", - "Author", - "Band", - "Album", - "Genre", - "Track", - "DateTimeOriginal", - "CreateDate", - # "Duration", -- Wrong values when read from memory - "NumChannels", - "SampleRate", - "AvgBytesPerSec", - "BitsPerSample", - ]: - if f in metadata: - md_content += f"{f}: {metadata[f]}\n" - - # Figure out the audio format for transcription - if stream_info.extension == ".wav" or stream_info.mimetype == "audio/x-wav": - audio_format = "wav" - elif stream_info.extension == ".mp3" or stream_info.mimetype == "audio/mpeg": - audio_format = "mp3" - elif ( - stream_info.extension in [".mp4", ".m4a"] - or stream_info.mimetype == "video/mp4" - ): - audio_format = "mp4" - else: - audio_format = None - # Transcribe - if audio_format: - try: - transcript = transcribe_audio(file_stream, audio_format=audio_format) - if transcript: - md_content += "\n\n### Audio Transcript:\n" + transcript - except MissingDependencyException: - pass + if 'audio' not in self.config.modalities: + transcript = transcribe_audio( + file_stream, magic_type=stream_info.magic_type) + if transcript: + md_content += "\n\n### Audio Transcript:\n" + transcript + return DocumentConverterResult(markdown=md_content.strip()) + else: + return DocumentConverterResult(audio_stream=file_stream, stream_info=stream_info) # Return the result - return DocumentConverterResult(markdown=md_content.strip()) diff --git a/packages/markitup/src/markitup/converters/_csv_converter.py b/packages/markitup/src/markitup/converters/_csv_converter.py index 7162889..c68afe2 100644 --- a/packages/markitup/src/markitup/converters/_csv_converter.py +++ b/packages/markitup/src/markitup/converters/_csv_converter.py @@ -3,15 +3,8 @@ import csv import io from typing import BinaryIO, Any from charset_normalizer import from_bytes -from ._html_converter import HtmlConverter from .._base_converter import DocumentConverter, DocumentConverterResult -from .._stream_info import StreamInfo - -ACCEPTED_MIME_TYPE_PREFIXES = [ - "text/csv", - "application/csv", -] -ACCEPTED_FILE_EXTENSIONS = [".csv"] +from .._schemas import StreamInfo class CsvConverter(DocumentConverter): @@ -19,24 +12,6 @@ class CsvConverter(DocumentConverter): Converts CSV files to Markdown tables. """ - def __init__(self): - super().__init__() - - def accepts( - self, - file_stream: BinaryIO, - stream_info: StreamInfo, - **kwargs: Any, # Options to pass to the converter - ) -> bool: - mimetype = (stream_info.mimetype or "").lower() - extension = (stream_info.extension or "").lower() - if extension in ACCEPTED_FILE_EXTENSIONS: - return True - for prefix in ACCEPTED_MIME_TYPE_PREFIXES: - if mimetype.startswith(prefix): - return True - return False - def convert( self, file_stream: BinaryIO, @@ -44,10 +19,7 @@ class CsvConverter(DocumentConverter): **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: # Read the file content - if stream_info.charset: - content = file_stream.read().decode(stream_info.charset) - else: - content = str(from_bytes(file_stream.read()).best()) + content = str(from_bytes(file_stream.read()).best()) # Parse CSV content reader = csv.reader(io.StringIO(content)) diff --git a/packages/markitup/src/markitup/converters/_docx_converter.py b/packages/markitup/src/markitup/converters/_docx_converter.py index b320695..450bca1 100644 --- a/packages/markitup/src/markitup/converters/_docx_converter.py +++ b/packages/markitup/src/markitup/converters/_docx_converter.py @@ -5,24 +5,8 @@ from typing import BinaryIO, Any from ._html_converter import HtmlConverter from ..converter_utils.docx.pre_process import pre_process_docx from .._base_converter import DocumentConverter, DocumentConverterResult -from .._stream_info import StreamInfo -from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE - -# Try loading optional (but in this case, required) dependencies -# Save reporting of any exceptions for later -_dependency_exc_info = None -try: - import mammoth -except ImportError: - # Preserve the error and stack trace for later - _dependency_exc_info = sys.exc_info() - - -ACCEPTED_MIME_TYPE_PREFIXES = [ - "application/vnd.openxmlformats-officedocument.wordprocessingml.document", -] - -ACCEPTED_FILE_EXTENSIONS = [".docx"] +from .._schemas import StreamInfo, Config +import mammoth class DocxConverter(HtmlConverter): @@ -30,27 +14,8 @@ class DocxConverter(HtmlConverter): Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible. """ - def __init__(self): - super().__init__() - self._html_converter = HtmlConverter() - - def accepts( - self, - file_stream: BinaryIO, - stream_info: StreamInfo, - **kwargs: Any, # Options to pass to the converter - ) -> bool: - mimetype = (stream_info.mimetype or "").lower() - extension = (stream_info.extension or "").lower() - - if extension in ACCEPTED_FILE_EXTENSIONS: - return True - - for prefix in ACCEPTED_MIME_TYPE_PREFIXES: - if mimetype.startswith(prefix): - return True - - return False + def __init__(self, config: Config): + self._html_converter = HtmlConverter(config=config) def convert( self, @@ -58,23 +23,11 @@ class DocxConverter(HtmlConverter): stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: - # Check: the dependencies - if _dependency_exc_info is not None: - raise MissingDependencyException( - MISSING_DEPENDENCY_MESSAGE.format( - converter=type(self).__name__, - extension=".docx", - feature="docx", - ) - ) from _dependency_exc_info[ - 1 - ].with_traceback( # type: ignore[union-attr] - _dependency_exc_info[2] - ) style_map = kwargs.get("style_map", None) pre_process_stream = pre_process_docx(file_stream) return self._html_converter.convert_string( - mammoth.convert_to_html(pre_process_stream, style_map=style_map).value, + mammoth.convert_to_html( + pre_process_stream, style_map=style_map).value, **kwargs, ) diff --git a/packages/markitup/src/markitup/converters/_html_converter.py b/packages/markitup/src/markitup/converters/_html_converter.py index b85a68d..e41b34d 100644 --- a/packages/markitup/src/markitup/converters/_html_converter.py +++ b/packages/markitup/src/markitup/converters/_html_converter.py @@ -3,7 +3,7 @@ from typing import Any, BinaryIO, Optional from bs4 import BeautifulSoup from .._base_converter import DocumentConverter, DocumentConverterResult -from .._stream_info import StreamInfo +from .._schemas import StreamInfo, Config from ._markdownify import _CustomMarkdownify ACCEPTED_MAGIC_TYPE_PREFIXES = [ @@ -19,6 +19,10 @@ ACCEPTED_FILE_CATEGORY = [ class HtmlConverter(DocumentConverter): """Anything with content type text/html""" + + def __init__(self, config: Config): + self.config = config + def convert( self, file_stream: BinaryIO, @@ -27,7 +31,8 @@ class HtmlConverter(DocumentConverter): ) -> DocumentConverterResult: # Parse the stream encoding = "utf-8" - soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding) + soup = BeautifulSoup(file_stream, "html.parser", + from_encoding=encoding) # Remove javascript and style blocks for script in soup(["script", "style"]): @@ -37,15 +42,17 @@ class HtmlConverter(DocumentConverter): body_elm = soup.find("body") webpage_text = "" if body_elm: - webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm) + webpage_text = _CustomMarkdownify( + config=self.config, **kwargs).convert_soup(body_elm) else: - webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup) + webpage_text = _CustomMarkdownify( + config=self.config, **kwargs).convert_soup(soup) assert isinstance(webpage_text, str) # remove leading and trailing \n webpage_text = webpage_text.strip() - + print(webpage_text) return DocumentConverterResult( markdown=webpage_text, title=None if soup.title is None else soup.title.string, diff --git a/packages/markitup/src/markitup/converters/_markdownify.py b/packages/markitup/src/markitup/converters/_markdownify.py index 1c386c7..679d9fd 100644 --- a/packages/markitup/src/markitup/converters/_markdownify.py +++ b/packages/markitup/src/markitup/converters/_markdownify.py @@ -3,6 +3,7 @@ import markdownify from typing import Any, Optional from urllib.parse import quote, unquote, urlparse, urlunparse +from .._schemas import Config class _CustomMarkdownify(markdownify.MarkdownConverter): @@ -15,11 +16,13 @@ class _CustomMarkdownify(markdownify.MarkdownConverter): - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax """ - def __init__(self, **options: Any): - options["heading_style"] = options.get("heading_style", markdownify.ATX) + def __init__(self, config: Config, **options: Any): + options["heading_style"] = options.get( + "heading_style", markdownify.ATX) options["keep_data_uris"] = options.get("keep_data_uris", False) # Explicitly cast options to the expected type if necessary super().__init__(**options) + self.config = config def convert_hn( self, @@ -58,9 +61,11 @@ class _CustomMarkdownify(markdownify.MarkdownConverter): if href: try: parsed_url = urlparse(href) # type: ignore - if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]: # type: ignore + # type: ignore + if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]: return "%s%s%s" % (prefix, text, suffix) - href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path)))) # type: ignore + href = urlunparse(parsed_url._replace( + path=quote(unquote(parsed_url.path)))) # type: ignore except ValueError: # It's not clear if this ever gets thrown return "%s%s%s" % (prefix, text, suffix) @@ -95,17 +100,11 @@ class _CustomMarkdownify(markdownify.MarkdownConverter): src = el.attrs.get("src", None) or "" title = el.attrs.get("title", None) or "" title_part = ' "%s"' % title.replace('"', r"\"") if title else "" - if ( - convert_as_inline - and el.parent.name not in self.options["keep_inline_images_in"] - ): + + if "image" in self.config.modalities: + return "![%s](%s%s)" % (alt, src, title_part) + else: return alt - # Remove dataURIs - if src.startswith("data:") and not self.options["keep_data_uris"]: - src = src.split(",")[0] + "..." - - return "![%s](%s%s)" % (alt, src, title_part) - def convert_soup(self, soup: Any) -> str: - return super().convert_soup(soup) # type: ignore \ No newline at end of file + return super().convert_soup(soup) # type: ignore diff --git a/packages/markitup/src/markitup/converters/_pdf_converter.py b/packages/markitup/src/markitup/converters/_pdf_converter.py index 3f64f42..8839eff 100644 --- a/packages/markitup/src/markitup/converters/_pdf_converter.py +++ b/packages/markitup/src/markitup/converters/_pdf_converter.py @@ -3,7 +3,7 @@ import io import base64 from .._base_converter import DocumentConverter, DocumentConverterResult -from .._stream_info import StreamInfo +from .._schemas import StreamInfo, Config import fitz @@ -13,6 +13,9 @@ class PdfConverter(DocumentConverter): Converts PDFs to Markdown with embedded images. """ + def __init__(self, config: Config): + self.config = config + def convert( self, file_stream: BinaryIO, @@ -21,42 +24,43 @@ class PdfConverter(DocumentConverter): ) -> DocumentConverterResult: # Create a document object from the stream doc = fitz.open(stream=file_stream, filetype="pdf") - + # Extract text and images from all pages markdown_content = "" image_count = 0 for page_num in range(len(doc)): page = doc.load_page(page_num) - + # Get text with the default "text" mode which gives plain text page_text = page.get_text("text") # Add page marker markdown_content += f"\n\n## Page {page_num + 1}\n\n" markdown_content += page_text + "\n\n" - + # Extract images from the page image_list = page.get_images(full=True) - - for img_index, img_info in enumerate(image_list): - xref = img_info[0] # Get the image reference - base_image = doc.extract_image(xref) - - if base_image: - image_bytes = base_image["image"] - image_ext = base_image["ext"] - - try: - # Convert image to base64 for markdown embedding - img_base64 = base64.b64encode(image_bytes).decode('utf-8') - # Add image to markdown with a unique identifier - image_count += 1 - markdown_content += f"![Image {image_count}](data:image/{image_ext};base64,{img_base64})\n\n" - except Exception as e: - markdown_content += f"*[Error processing image {image_count}: {str(e)}]*\n\n" - + if 'image' in self.config.modalities: + for img_index, img_info in enumerate(image_list): + xref = img_info[0] # Get the image reference + base_image = doc.extract_image(xref) + + if base_image: + image_bytes = base_image["image"] + image_ext = base_image["ext"] + + try: + # Convert image to base64 for markdown embedding + img_base64 = base64.b64encode( + image_bytes).decode('utf-8') + # Add image to markdown with a unique identifier + image_count += 1 + markdown_content += f"![Image {image_count}](data:image/{image_ext};base64,{img_base64})\n\n" + except Exception as e: + markdown_content += f"*[Error processing image {image_count}: {str(e)}]*\n\n" + else: + markdown_content += f"{len(image_list)} images not shown here due to model not supporting image input\n\n" # Close the document to free resources doc.close() - print(markdown_content) return DocumentConverterResult( markdown=markdown_content, - ) \ No newline at end of file + ) diff --git a/packages/markitup/src/markitup/converters/_plain_text_converter.py b/packages/markitup/src/markitup/converters/_plain_text_converter.py index b7f776e..740a4f7 100644 --- a/packages/markitup/src/markitup/converters/_plain_text_converter.py +++ b/packages/markitup/src/markitup/converters/_plain_text_converter.py @@ -1,11 +1,12 @@ from typing import BinaryIO, Any from charset_normalizer import from_bytes from .._base_converter import DocumentConverter, DocumentConverterResult -from .._stream_info import StreamInfo +from .._schemas import StreamInfo class PlainTextConverter(DocumentConverter): """Anything with content type text/plain""" + def convert( self, file_stream: BinaryIO, diff --git a/packages/markitup/src/markitup/converters/_pptx_converter.py b/packages/markitup/src/markitup/converters/_pptx_converter.py index f1c112b..31af3cb 100644 --- a/packages/markitup/src/markitup/converters/_pptx_converter.py +++ b/packages/markitup/src/markitup/converters/_pptx_converter.py @@ -10,7 +10,7 @@ from operator import attrgetter from ._html_converter import HtmlConverter from .._base_converter import DocumentConverter, DocumentConverterResult -from .._stream_info import StreamInfo +from .._schemas import StreamInfo, Config import pptx @@ -26,9 +26,9 @@ class PptxConverter(DocumentConverter): Converts PPTX files to Markdown. Supports heading, tables and images with alt text. """ - def __init__(self): - super().__init__() - self._html_converter = HtmlConverter() + def __init__(self, config: Config): + self._html_converter = HtmlConverter(config=config) + self.config = config def convert( self, @@ -58,7 +58,8 @@ class PptxConverter(DocumentConverter): # Also grab any description embedded in the deck try: - alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "") + alt_text = shape._element._nvXxPr.cNvPr.attrib.get( + "descr", "") except Exception: # Unable to get alt text pass @@ -69,16 +70,20 @@ class PptxConverter(DocumentConverter): alt_text = re.sub(r"\s+", " ", alt_text).strip() # If keep_data_uris is True, use base64 encoding for images - - blob = shape.image.blob - content_type = shape.image.content_type or "image/png" - b64_string = base64.b64encode(blob).decode("utf-8") - md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n" - + if 'image' in self.config.modalities: + blob = shape.image.blob + content_type = shape.image.content_type or "image/png" + b64_string = base64.b64encode(blob).decode("utf-8") + md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n" + else: + filename = re.sub(r"\W", "", shape.name) + ".jpg" + md_content += "\n![" + alt_text + \ + "](" + filename + ")\n" # Tables if self._is_table(shape): - md_content += self._convert_table_to_markdown(shape.table, **kwargs) + md_content += self._convert_table_to_markdown( + shape.table, **kwargs) # Charts if shape.has_chart: @@ -93,7 +98,8 @@ class PptxConverter(DocumentConverter): # Group Shapes if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP: - sorted_shapes = sorted(shape.shapes, key=attrgetter("top", "left")) + sorted_shapes = sorted( + shape.shapes, key=attrgetter("top", "left")) for subshape in sorted_shapes: get_shape_content(subshape, **kwargs) @@ -141,7 +147,8 @@ class PptxConverter(DocumentConverter): html_table += "" return ( - self._html_converter.convert_string(html_table, **kwargs).markdown.strip() + self._html_converter.convert_string( + html_table, **kwargs).markdown.strip() + "\n" ) diff --git a/packages/markitup/src/markitup/converters/_xlsx_converter.py b/packages/markitup/src/markitup/converters/_xlsx_converter.py index 28f73a0..dc7d4a0 100644 --- a/packages/markitup/src/markitup/converters/_xlsx_converter.py +++ b/packages/markitup/src/markitup/converters/_xlsx_converter.py @@ -1,36 +1,8 @@ -import sys from typing import BinaryIO, Any from ._html_converter import HtmlConverter from .._base_converter import DocumentConverter, DocumentConverterResult -from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE -from .._stream_info import StreamInfo - -# Try loading optional (but in this case, required) dependencies -# Save reporting of any exceptions for later -_xlsx_dependency_exc_info = None -try: - import pandas as pd - import openpyxl -except ImportError: - _xlsx_dependency_exc_info = sys.exc_info() - -_xls_dependency_exc_info = None -try: - import pandas as pd - import xlrd -except ImportError: - _xls_dependency_exc_info = sys.exc_info() - -ACCEPTED_XLSX_MIME_TYPE_PREFIXES = [ - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" -] -ACCEPTED_XLSX_FILE_EXTENSIONS = [".xlsx"] - -ACCEPTED_XLS_MIME_TYPE_PREFIXES = [ - "application/vnd.ms-excel", - "application/excel", -] -ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"] +from .._schemas import StreamInfo, Config +import pandas as pd class XlsxConverter(DocumentConverter): @@ -38,27 +10,8 @@ class XlsxConverter(DocumentConverter): Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table. """ - def __init__(self): - super().__init__() - self._html_converter = HtmlConverter() - - def accepts( - self, - file_stream: BinaryIO, - stream_info: StreamInfo, - **kwargs: Any, # Options to pass to the converter - ) -> bool: - mimetype = (stream_info.mimetype or "").lower() - extension = (stream_info.extension or "").lower() - - if extension in ACCEPTED_XLSX_FILE_EXTENSIONS: - return True - - for prefix in ACCEPTED_XLSX_MIME_TYPE_PREFIXES: - if mimetype.startswith(prefix): - return True - - return False + def __init__(self, config: Config): + self._html_converter = HtmlConverter(config=config) def convert( self, @@ -66,20 +19,6 @@ class XlsxConverter(DocumentConverter): stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: - # Check the dependencies - if _xlsx_dependency_exc_info is not None: - raise MissingDependencyException( - MISSING_DEPENDENCY_MESSAGE.format( - converter=type(self).__name__, - extension=".xlsx", - feature="xlsx", - ) - ) from _xlsx_dependency_exc_info[ - 1 - ].with_traceback( # type: ignore[union-attr] - _xlsx_dependency_exc_info[2] - ) - sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl") md_content = "" for s in sheets: @@ -100,27 +39,8 @@ class XlsConverter(DocumentConverter): Converts XLS files to Markdown, with each sheet presented as a separate Markdown table. """ - def __init__(self): - super().__init__() - self._html_converter = HtmlConverter() - - def accepts( - self, - file_stream: BinaryIO, - stream_info: StreamInfo, - **kwargs: Any, # Options to pass to the converter - ) -> bool: - mimetype = (stream_info.mimetype or "").lower() - extension = (stream_info.extension or "").lower() - - if extension in ACCEPTED_XLS_FILE_EXTENSIONS: - return True - - for prefix in ACCEPTED_XLS_MIME_TYPE_PREFIXES: - if mimetype.startswith(prefix): - return True - - return False + def __init__(self, config: Config): + self._html_converter = HtmlConverter(config=config) def convert( self, @@ -128,19 +48,6 @@ class XlsConverter(DocumentConverter): stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: - # Load the dependencies - if _xls_dependency_exc_info is not None: - raise MissingDependencyException( - MISSING_DEPENDENCY_MESSAGE.format( - converter=type(self).__name__, - extension=".xls", - feature="xls", - ) - ) from _xls_dependency_exc_info[ - 1 - ].with_traceback( # type: ignore[union-attr] - _xls_dependency_exc_info[2] - ) sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd") md_content = "" diff --git a/packages/markitup/tests/test_files/test.csv b/packages/markitup/tests/test_files/test.csv new file mode 100644 index 0000000..bb11189 --- /dev/null +++ b/packages/markitup/tests/test_files/test.csv @@ -0,0 +1,51 @@ +ID,Name,Age,Country,Email +1,Name_1,62,Country_1,email_1@example.com +2,Name_2,48,Country_2,email_2@example.com +3,Name_3,61,Country_3,email_3@example.com +4,Name_4,32,Country_4,email_4@example.com +5,Name_5,69,Country_5,email_5@example.com +6,Name_6,32,Country_6,email_6@example.com +7,Name_7,62,Country_7,email_7@example.com +8,Name_8,39,Country_8,email_8@example.com +9,Name_9,40,Country_9,email_9@example.com +10,Name_10,32,Country_0,email_10@example.com +11,Name_11,24,Country_1,email_11@example.com +12,Name_12,45,Country_2,email_12@example.com +13,Name_13,39,Country_3,email_13@example.com +14,Name_14,18,Country_4,email_14@example.com +15,Name_15,66,Country_5,email_15@example.com +16,Name_16,48,Country_6,email_16@example.com +17,Name_17,60,Country_7,email_17@example.com +18,Name_18,31,Country_8,email_18@example.com +19,Name_19,43,Country_9,email_19@example.com +20,Name_20,33,Country_0,email_20@example.com +21,Name_21,32,Country_1,email_21@example.com +22,Name_22,68,Country_2,email_22@example.com +23,Name_23,44,Country_3,email_23@example.com +24,Name_24,32,Country_4,email_24@example.com +25,Name_25,33,Country_5,email_25@example.com +26,Name_26,46,Country_6,email_26@example.com +27,Name_27,38,Country_7,email_27@example.com +28,Name_28,50,Country_8,email_28@example.com +29,Name_29,68,Country_9,email_29@example.com +30,Name_30,66,Country_0,email_30@example.com +31,Name_31,60,Country_1,email_31@example.com +32,Name_32,53,Country_2,email_32@example.com +33,Name_33,30,Country_3,email_33@example.com +34,Name_34,30,Country_4,email_34@example.com +35,Name_35,43,Country_5,email_35@example.com +36,Name_36,44,Country_6,email_36@example.com +37,Name_37,31,Country_7,email_37@example.com +38,Name_38,35,Country_8,email_38@example.com +39,Name_39,56,Country_9,email_39@example.com +40,Name_40,35,Country_0,email_40@example.com +41,Name_41,62,Country_1,email_41@example.com +42,Name_42,63,Country_2,email_42@example.com +43,Name_43,51,Country_3,email_43@example.com +44,Name_44,52,Country_4,email_44@example.com +45,Name_45,66,Country_5,email_45@example.com +46,Name_46,69,Country_6,email_46@example.com +47,Name_47,68,Country_7,email_47@example.com +48,Name_48,68,Country_8,email_48@example.com +49,Name_49,69,Country_9,email_49@example.com +50,Name_50,46,Country_0,email_50@example.com diff --git a/packages/markitup/tests/test_files/test.docx b/packages/markitup/tests/test_files/test.docx old mode 100755 new mode 100644 index 79e281d..b36cfed Binary files a/packages/markitup/tests/test_files/test.docx and b/packages/markitup/tests/test_files/test.docx differ diff --git a/packages/markitup/uv.lock b/packages/markitup/uv.lock index 0ed9063..5d9ff8d 100644 --- a/packages/markitup/uv.lock +++ b/packages/markitup/uv.lock @@ -173,15 +173,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934", size = 46018 }, ] -[[package]] -name = "et-xmlfile" -version = "2.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059 }, -] - [[package]] name = "flatbuffers" version = "25.2.10" @@ -348,7 +339,6 @@ dependencies = [ { name = "mammoth" }, { name = "markdownify" }, { name = "olefile" }, - { name = "openpyxl" }, { name = "pandas" }, { name = "pydub" }, { name = "pymupdf" }, @@ -356,7 +346,6 @@ dependencies = [ { name = "python-pptx" }, { name = "requests" }, { name = "speechrecognition" }, - { name = "xlrd" }, ] [package.metadata] @@ -368,7 +357,6 @@ requires-dist = [ { name = "mammoth" }, { name = "markdownify" }, { name = "olefile" }, - { name = "openpyxl" }, { name = "pandas" }, { name = "pydub" }, { name = "pymupdf", specifier = ">=1.25.5" }, @@ -376,7 +364,6 @@ requires-dist = [ { name = "python-pptx" }, { name = "requests" }, { name = "speechrecognition" }, - { name = "xlrd" }, ] [[package]] @@ -492,18 +479,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3a/72/5ff85c540fd6a465610ce47e4cee8fccb472952fc1d589112f51ae2520a5/onnxruntime-1.21.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5c9e4571ff5b2a5d377d414bc85cd9450ba233a9a92f766493874f1093976453", size = 15990556 }, ] -[[package]] -name = "openpyxl" -version = "3.1.5" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "et-xmlfile" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910 }, -] - [[package]] name = "packaging" version = "25.0" @@ -847,15 +822,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6b/11/cc635220681e93a0183390e26485430ca2c7b5f9d33b15c74c2861cb8091/urllib3-2.4.0-py3-none-any.whl", hash = "sha256:4e16665048960a0900c702d4a66415956a584919c03361cac9f1df5c5dd7e813", size = 128680 }, ] -[[package]] -name = "xlrd" -version = "2.0.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a6/b3/19a2540d21dea5f908304375bd43f5ed7a4c28a370dc9122c565423e6b44/xlrd-2.0.1.tar.gz", hash = "sha256:f72f148f54442c6b056bf931dbc34f986fd0c3b0b6b5a58d013c9aef274d0c88", size = 100259 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a6/0c/c2a72d51fe56e08a08acc85d13013558a2d793028ae7385448a6ccdfae64/xlrd-2.0.1-py2.py3-none-any.whl", hash = "sha256:6a33ee89877bd9abc1158129f6e94be74e2679636b8a205b43b85206c3f0bbdd", size = 96531 }, -] - [[package]] name = "xlsxwriter" version = "3.2.3"