diff --git a/packages/markitup/src/markitup/_markitup.py b/packages/markitup/src/markitup/_markitup.py index ee1abf1..a0b2186 100644 --- a/packages/markitup/src/markitup/_markitup.py +++ b/packages/markitup/src/markitup/_markitup.py @@ -40,14 +40,6 @@ class MarkItUp: def convert(self, stream: BinaryIO) -> Dict[DocumentConverterResult, StreamInfo]: stream_info: StreamInfo = self._get_stream_info(stream) # Deal with unsupported file types - match stream_info.category: - case "ppt": - raise UnsupportedFormatException( - ".ppt files are not supported, try .pptx instead") - case "other": - raise UnsupportedFormatException( - f"{stream_info.magic_type} files are not supported") - try: match stream_info.category: case "text": @@ -59,13 +51,24 @@ class MarkItUp: case "audio": return AudioConverter(config=self.config).convert(stream, stream_info), stream_info case "xlsx": - return XlsxConverter().convert(stream, stream_info), stream_info + return XlsxConverter(config=self.config).convert(stream, stream_info), stream_info case "xls": - return XlsConverter().convert(stream, stream_info), stream_info + return XlsConverter(config=self.config).convert(stream, stream_info), stream_info + case "csv": + return CsvConverter().convert(stream, stream_info), stream_info + case "docx": + return DocxConverter(config=self.config).convert(stream, stream_info), stream_info + case _: + match stream_info.category: + case "ppt": + raise UnsupportedFormatException( + ".ppt files are not supported, try .pptx instead") + case "other": + raise UnsupportedFormatException( + f"{stream_info.magic_type} files are not supported") except FailedConversionAttempt: raise FileConversionException( f"Failed to convert file of type {stream_info.magic_type}") - return stream_info def _get_stream_info(self, byte_stream: BinaryIO) -> StreamInfo: original_position = byte_stream.tell() @@ -100,8 +103,13 @@ class MarkItUp: category = "docx" elif magic_type == "application/pdf": category = "pdf" + elif magic_type == "application/csv": + category = "csv" elif magic_type.startswith("text/"): - category = "text" + if magic_type == "text/csv": + category = "csv" + else: + category = "text" else: category = "other" diff --git a/packages/markitup/src/markitup/converter_utils/utils.py b/packages/markitup/src/markitup/converter_utils/utils.py index e67c9b5..a62c3be 100644 --- a/packages/markitup/src/markitup/converter_utils/utils.py +++ b/packages/markitup/src/markitup/converter_utils/utils.py @@ -42,64 +42,6 @@ def read_files_to_bytestreams(folder_path="packages/markitup/tests/test_files"): return byte_streams -def detect_file_types(file_dict): - """ - Detects file types for a dictionary of {filename: BytesIO} pairs - using only magic type (content-based detection) - - Args: - file_dict (dict): Dictionary with filenames as keys and BytesIO objects as values - - Returns: - dict: Dictionary with filenames as keys and file type information as values - """ - result = {} - - for filename, byte_stream in file_dict.items(): - # Get the original position to reset later - original_position = byte_stream.tell() - - # Reset stream position to beginning - byte_stream.seek(0) - - # Get file content for analysis - file_content = byte_stream.read() - - # Use python-magic to determine file type based on content - magic_type = magic.from_buffer(file_content, mime=True) - - # Determine file category based on magic_type - if magic_type.startswith("image/"): - category = "image" - elif magic_type.startswith("audio/"): - category = "audio" - elif magic_type.startswith("video/"): - category = "video" - elif magic_type.startswith("application/vnd.ms-excel"): - category = 'xls' - elif magic_type.startswith("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"): - category = "xlsx" - elif magic_type.startswith("application/vnd.ms-powerpoint"): - category = 'ppt' - elif magic_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation": - category = "pptx" - elif magic_type.startswith("application/msword"): - category = 'doc' - elif magic_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document": - category = "docx" - elif magic_type == "application/pdf": - category = "pdf" - elif magic_type.startswith("text/"): - category = "text" - else: - category = "other" - - byte_stream.seek(original_position) - result[filename] = StreamInfo(magic_type=magic_type, category=category) - - return result - - def transcribe_audio(file_stream: BinaryIO, *, magic_type: str = "audio/mpeg") -> str: audio_format = 'mp3' if magic_type == 'audio/mpeg' else 'wav' if magic_type == 'audio/x-wav' else None diff --git a/packages/markitup/src/markitup/converters/_csv_converter.py b/packages/markitup/src/markitup/converters/_csv_converter.py index 78963ed..c68afe2 100644 --- a/packages/markitup/src/markitup/converters/_csv_converter.py +++ b/packages/markitup/src/markitup/converters/_csv_converter.py @@ -3,40 +3,15 @@ import csv import io from typing import BinaryIO, Any from charset_normalizer import from_bytes -from ._html_converter import HtmlConverter from .._base_converter import DocumentConverter, DocumentConverterResult from .._schemas import StreamInfo -ACCEPTED_MIME_TYPE_PREFIXES = [ - "text/csv", - "application/csv", -] -ACCEPTED_FILE_EXTENSIONS = [".csv"] - class CsvConverter(DocumentConverter): """ Converts CSV files to Markdown tables. """ - def __init__(self): - super().__init__() - - def accepts( - self, - file_stream: BinaryIO, - stream_info: StreamInfo, - **kwargs: Any, # Options to pass to the converter - ) -> bool: - mimetype = (stream_info.mimetype or "").lower() - extension = (stream_info.extension or "").lower() - if extension in ACCEPTED_FILE_EXTENSIONS: - return True - for prefix in ACCEPTED_MIME_TYPE_PREFIXES: - if mimetype.startswith(prefix): - return True - return False - def convert( self, file_stream: BinaryIO, @@ -44,10 +19,7 @@ class CsvConverter(DocumentConverter): **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: # Read the file content - if stream_info.charset: - content = file_stream.read().decode(stream_info.charset) - else: - content = str(from_bytes(file_stream.read()).best()) + content = str(from_bytes(file_stream.read()).best()) # Parse CSV content reader = csv.reader(io.StringIO(content)) diff --git a/packages/markitup/src/markitup/converters/_docx_converter.py b/packages/markitup/src/markitup/converters/_docx_converter.py index 0db97a9..450bca1 100644 --- a/packages/markitup/src/markitup/converters/_docx_converter.py +++ b/packages/markitup/src/markitup/converters/_docx_converter.py @@ -5,24 +5,8 @@ from typing import BinaryIO, Any from ._html_converter import HtmlConverter from ..converter_utils.docx.pre_process import pre_process_docx from .._base_converter import DocumentConverter, DocumentConverterResult -from .._schemas import StreamInfo -from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE - -# Try loading optional (but in this case, required) dependencies -# Save reporting of any exceptions for later -_dependency_exc_info = None -try: - import mammoth -except ImportError: - # Preserve the error and stack trace for later - _dependency_exc_info = sys.exc_info() - - -ACCEPTED_MIME_TYPE_PREFIXES = [ - "application/vnd.openxmlformats-officedocument.wordprocessingml.document", -] - -ACCEPTED_FILE_EXTENSIONS = [".docx"] +from .._schemas import StreamInfo, Config +import mammoth class DocxConverter(HtmlConverter): @@ -30,27 +14,8 @@ class DocxConverter(HtmlConverter): Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible. """ - def __init__(self): - super().__init__() - self._html_converter = HtmlConverter() - - def accepts( - self, - file_stream: BinaryIO, - stream_info: StreamInfo, - **kwargs: Any, # Options to pass to the converter - ) -> bool: - mimetype = (stream_info.mimetype or "").lower() - extension = (stream_info.extension or "").lower() - - if extension in ACCEPTED_FILE_EXTENSIONS: - return True - - for prefix in ACCEPTED_MIME_TYPE_PREFIXES: - if mimetype.startswith(prefix): - return True - - return False + def __init__(self, config: Config): + self._html_converter = HtmlConverter(config=config) def convert( self, @@ -58,19 +23,6 @@ class DocxConverter(HtmlConverter): stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: - # Check: the dependencies - if _dependency_exc_info is not None: - raise MissingDependencyException( - MISSING_DEPENDENCY_MESSAGE.format( - converter=type(self).__name__, - extension=".docx", - feature="docx", - ) - ) from _dependency_exc_info[ - 1 - ].with_traceback( # type: ignore[union-attr] - _dependency_exc_info[2] - ) style_map = kwargs.get("style_map", None) pre_process_stream = pre_process_docx(file_stream) diff --git a/packages/markitup/src/markitup/converters/_html_converter.py b/packages/markitup/src/markitup/converters/_html_converter.py index 91db39a..e41b34d 100644 --- a/packages/markitup/src/markitup/converters/_html_converter.py +++ b/packages/markitup/src/markitup/converters/_html_converter.py @@ -3,7 +3,7 @@ from typing import Any, BinaryIO, Optional from bs4 import BeautifulSoup from .._base_converter import DocumentConverter, DocumentConverterResult -from .._schemas import StreamInfo +from .._schemas import StreamInfo, Config from ._markdownify import _CustomMarkdownify ACCEPTED_MAGIC_TYPE_PREFIXES = [ @@ -20,6 +20,9 @@ ACCEPTED_FILE_CATEGORY = [ class HtmlConverter(DocumentConverter): """Anything with content type text/html""" + def __init__(self, config: Config): + self.config = config + def convert( self, file_stream: BinaryIO, @@ -39,15 +42,17 @@ class HtmlConverter(DocumentConverter): body_elm = soup.find("body") webpage_text = "" if body_elm: - webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm) + webpage_text = _CustomMarkdownify( + config=self.config, **kwargs).convert_soup(body_elm) else: - webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup) + webpage_text = _CustomMarkdownify( + config=self.config, **kwargs).convert_soup(soup) assert isinstance(webpage_text, str) # remove leading and trailing \n webpage_text = webpage_text.strip() - + print(webpage_text) return DocumentConverterResult( markdown=webpage_text, title=None if soup.title is None else soup.title.string, diff --git a/packages/markitup/src/markitup/converters/_markdownify.py b/packages/markitup/src/markitup/converters/_markdownify.py index 1c386c7..679d9fd 100644 --- a/packages/markitup/src/markitup/converters/_markdownify.py +++ b/packages/markitup/src/markitup/converters/_markdownify.py @@ -3,6 +3,7 @@ import markdownify from typing import Any, Optional from urllib.parse import quote, unquote, urlparse, urlunparse +from .._schemas import Config class _CustomMarkdownify(markdownify.MarkdownConverter): @@ -15,11 +16,13 @@ class _CustomMarkdownify(markdownify.MarkdownConverter): - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax """ - def __init__(self, **options: Any): - options["heading_style"] = options.get("heading_style", markdownify.ATX) + def __init__(self, config: Config, **options: Any): + options["heading_style"] = options.get( + "heading_style", markdownify.ATX) options["keep_data_uris"] = options.get("keep_data_uris", False) # Explicitly cast options to the expected type if necessary super().__init__(**options) + self.config = config def convert_hn( self, @@ -58,9 +61,11 @@ class _CustomMarkdownify(markdownify.MarkdownConverter): if href: try: parsed_url = urlparse(href) # type: ignore - if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]: # type: ignore + # type: ignore + if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]: return "%s%s%s" % (prefix, text, suffix) - href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path)))) # type: ignore + href = urlunparse(parsed_url._replace( + path=quote(unquote(parsed_url.path)))) # type: ignore except ValueError: # It's not clear if this ever gets thrown return "%s%s%s" % (prefix, text, suffix) @@ -95,17 +100,11 @@ class _CustomMarkdownify(markdownify.MarkdownConverter): src = el.attrs.get("src", None) or "" title = el.attrs.get("title", None) or "" title_part = ' "%s"' % title.replace('"', r"\"") if title else "" - if ( - convert_as_inline - and el.parent.name not in self.options["keep_inline_images_in"] - ): + + if "image" in self.config.modalities: + return "![%s](%s%s)" % (alt, src, title_part) + else: return alt - # Remove dataURIs - if src.startswith("data:") and not self.options["keep_data_uris"]: - src = src.split(",")[0] + "..." - - return "![%s](%s%s)" % (alt, src, title_part) - def convert_soup(self, soup: Any) -> str: - return super().convert_soup(soup) # type: ignore \ No newline at end of file + return super().convert_soup(soup) # type: ignore diff --git a/packages/markitup/src/markitup/converters/_pptx_converter.py b/packages/markitup/src/markitup/converters/_pptx_converter.py index 5f11d37..31af3cb 100644 --- a/packages/markitup/src/markitup/converters/_pptx_converter.py +++ b/packages/markitup/src/markitup/converters/_pptx_converter.py @@ -27,7 +27,7 @@ class PptxConverter(DocumentConverter): """ def __init__(self, config: Config): - self._html_converter = HtmlConverter() + self._html_converter = HtmlConverter(config=config) self.config = config def convert( diff --git a/packages/markitup/src/markitup/converters/_xlsx_converter.py b/packages/markitup/src/markitup/converters/_xlsx_converter.py index f7e9879..dc7d4a0 100644 --- a/packages/markitup/src/markitup/converters/_xlsx_converter.py +++ b/packages/markitup/src/markitup/converters/_xlsx_converter.py @@ -1,7 +1,7 @@ from typing import BinaryIO, Any from ._html_converter import HtmlConverter from .._base_converter import DocumentConverter, DocumentConverterResult -from .._schemas import StreamInfo +from .._schemas import StreamInfo, Config import pandas as pd @@ -10,8 +10,8 @@ class XlsxConverter(DocumentConverter): Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table. """ - def __init__(self): - self._html_converter = HtmlConverter() + def __init__(self, config: Config): + self._html_converter = HtmlConverter(config=config) def convert( self, @@ -39,9 +39,8 @@ class XlsConverter(DocumentConverter): Converts XLS files to Markdown, with each sheet presented as a separate Markdown table. """ - def __init__(self): - super().__init__() - self._html_converter = HtmlConverter() + def __init__(self, config: Config): + self._html_converter = HtmlConverter(config=config) def convert( self, diff --git a/packages/markitup/tests/test_files/test.csv b/packages/markitup/tests/test_files/test.csv new file mode 100644 index 0000000..bb11189 --- /dev/null +++ b/packages/markitup/tests/test_files/test.csv @@ -0,0 +1,51 @@ +ID,Name,Age,Country,Email +1,Name_1,62,Country_1,email_1@example.com +2,Name_2,48,Country_2,email_2@example.com +3,Name_3,61,Country_3,email_3@example.com +4,Name_4,32,Country_4,email_4@example.com +5,Name_5,69,Country_5,email_5@example.com +6,Name_6,32,Country_6,email_6@example.com +7,Name_7,62,Country_7,email_7@example.com +8,Name_8,39,Country_8,email_8@example.com +9,Name_9,40,Country_9,email_9@example.com +10,Name_10,32,Country_0,email_10@example.com +11,Name_11,24,Country_1,email_11@example.com +12,Name_12,45,Country_2,email_12@example.com +13,Name_13,39,Country_3,email_13@example.com +14,Name_14,18,Country_4,email_14@example.com +15,Name_15,66,Country_5,email_15@example.com +16,Name_16,48,Country_6,email_16@example.com +17,Name_17,60,Country_7,email_17@example.com +18,Name_18,31,Country_8,email_18@example.com +19,Name_19,43,Country_9,email_19@example.com +20,Name_20,33,Country_0,email_20@example.com +21,Name_21,32,Country_1,email_21@example.com +22,Name_22,68,Country_2,email_22@example.com +23,Name_23,44,Country_3,email_23@example.com +24,Name_24,32,Country_4,email_24@example.com +25,Name_25,33,Country_5,email_25@example.com +26,Name_26,46,Country_6,email_26@example.com +27,Name_27,38,Country_7,email_27@example.com +28,Name_28,50,Country_8,email_28@example.com +29,Name_29,68,Country_9,email_29@example.com +30,Name_30,66,Country_0,email_30@example.com +31,Name_31,60,Country_1,email_31@example.com +32,Name_32,53,Country_2,email_32@example.com +33,Name_33,30,Country_3,email_33@example.com +34,Name_34,30,Country_4,email_34@example.com +35,Name_35,43,Country_5,email_35@example.com +36,Name_36,44,Country_6,email_36@example.com +37,Name_37,31,Country_7,email_37@example.com +38,Name_38,35,Country_8,email_38@example.com +39,Name_39,56,Country_9,email_39@example.com +40,Name_40,35,Country_0,email_40@example.com +41,Name_41,62,Country_1,email_41@example.com +42,Name_42,63,Country_2,email_42@example.com +43,Name_43,51,Country_3,email_43@example.com +44,Name_44,52,Country_4,email_44@example.com +45,Name_45,66,Country_5,email_45@example.com +46,Name_46,69,Country_6,email_46@example.com +47,Name_47,68,Country_7,email_47@example.com +48,Name_48,68,Country_8,email_48@example.com +49,Name_49,69,Country_9,email_49@example.com +50,Name_50,46,Country_0,email_50@example.com diff --git a/packages/markitup/tests/test_files/test.docx b/packages/markitup/tests/test_files/test.docx old mode 100755 new mode 100644 index 79e281d..b36cfed Binary files a/packages/markitup/tests/test_files/test.docx and b/packages/markitup/tests/test_files/test.docx differ