From 555a849a6603a73b1534321e45146bf1a13817ab Mon Sep 17 00:00:00 2001 From: rong-xyz Date: Mon, 21 Apr 2025 09:37:43 +0000 Subject: [PATCH] supports pptx --- packages/markitup/pyproject.toml | 17 +-- packages/markitup/src/markitup/_markitup.py | 12 +- .../markitup/src/markitup/_stream_info.py | 30 +---- .../src/markitup/converter_utils/utils.py | 102 ++++++++++++++++ .../src/markitup/converters/__init__.py | 6 +- .../markitup/converters/_audio_converter.py | 1 - .../markitup/converters/_html_converter.py | 22 ++-- .../src/markitup/converters/_markdownify.py | 111 ++++++++++++++++++ .../markitup/converters/_pptx_converter.py | 53 +++------ packages/markitup/uv.lock | 92 +++++---------- 10 files changed, 277 insertions(+), 169 deletions(-) create mode 100644 packages/markitup/src/markitup/converter_utils/utils.py create mode 100644 packages/markitup/src/markitup/converters/_markdownify.py diff --git a/packages/markitup/pyproject.toml b/packages/markitup/pyproject.toml index 17c2488..819ce19 100644 --- a/packages/markitup/pyproject.toml +++ b/packages/markitup/pyproject.toml @@ -28,10 +28,7 @@ dependencies = [ "markdownify", "magika~=0.6.1", "charset-normalizer", -] - -[project.optional-dependencies] -all = [ + "python-magic>=0.4.27", "python-pptx", "mammoth", "pandas", @@ -46,12 +43,6 @@ all = [ "azure-ai-documentintelligence", "azure-identity" ] -pptx = ["python-pptx"] -docx = ["mammoth", "lxml"] -xlsx = ["pandas", "openpyxl"] -xls = ["pandas", "xlrd"] -pdf = ["pdfminer.six"] - [tool.hatch.version] path = "src/markitup/__about__.py" @@ -60,16 +51,14 @@ path = "src/markitup/__about__.py" markitup = "markitup.__main__:main" [tool.hatch.envs.default] -features = ["all"] +# No features needed since everything is installed by default [tool.hatch.envs.hatch-test] -features = ["all"] extra-dependencies = [ "openai", ] [tool.hatch.envs.types] -features = ["all"] extra-dependencies = [ "openai", "mypy>=1.0.0", @@ -98,4 +87,4 @@ exclude_lines = [ ] [tool.hatch.build.targets.sdist] -only-include = ["src/markitup"] +only-include = ["src/markitup"] \ No newline at end of file diff --git a/packages/markitup/src/markitup/_markitup.py b/packages/markitup/src/markitup/_markitup.py index 9777286..a17f3a6 100644 --- a/packages/markitup/src/markitup/_markitup.py +++ b/packages/markitup/src/markitup/_markitup.py @@ -25,22 +25,12 @@ from ._uri_utils import parse_data_uri, file_uri_to_path from .converters import ( PlainTextConverter, HtmlConverter, - RssConverter, - WikipediaConverter, - YouTubeConverter, - IpynbConverter, - BingSerpConverter, PdfConverter, DocxConverter, XlsxConverter, XlsConverter, PptxConverter, - ImageConverter, - AudioConverter, - OutlookMsgConverter, - ZipConverter, - EpubConverter, - DocumentIntelligenceConverter, + # AudioConverter, CsvConverter, ) diff --git a/packages/markitup/src/markitup/_stream_info.py b/packages/markitup/src/markitup/_stream_info.py index 84a1f64..66e8c72 100644 --- a/packages/markitup/src/markitup/_stream_info.py +++ b/packages/markitup/src/markitup/_stream_info.py @@ -2,31 +2,7 @@ from dataclasses import dataclass, asdict from typing import Optional -@dataclass(kw_only=True, frozen=True) +@dataclass class StreamInfo: - """The StreamInfo class is used to store information about a file stream. - All fields can be None, and will depend on how the stream was opened. - """ - - mimetype: Optional[str] = None - extension: Optional[str] = None - charset: Optional[str] = None - filename: Optional[ - str - ] = None # From local path, url, or Content-Disposition header - local_path: Optional[str] = None # If read from disk - url: Optional[str] = None # If read from url - - def copy_and_update(self, *args, **kwargs): - """Copy the StreamInfo object and update it with the given StreamInfo - instance and/or other keyword arguments.""" - new_info = asdict(self) - - for si in args: - assert isinstance(si, StreamInfo) - new_info.update({k: v for k, v in asdict(si).items() if v is not None}) - - if len(kwargs) > 0: - new_info.update(kwargs) - - return StreamInfo(**new_info) + magic_type: Optional[str] = None + category: Optional[str] = None \ No newline at end of file diff --git a/packages/markitup/src/markitup/converter_utils/utils.py b/packages/markitup/src/markitup/converter_utils/utils.py new file mode 100644 index 0000000..8d5df3d --- /dev/null +++ b/packages/markitup/src/markitup/converter_utils/utils.py @@ -0,0 +1,102 @@ +import os +from io import BytesIO +from markitup._stream_info import StreamInfo +import magic + + +def read_files_to_bytestreams(folder_path="packages/markitup/tests/test_files"): + """ + Reads all files from the specified folder into BytesIO objects. + + Args: + folder_path (str): Path to the folder containing files + + Returns: + dict: Dictionary with filenames as keys and BytesIO objects as values + """ + byte_streams = {} + + # Check if folder exists + if not os.path.exists(folder_path): + raise FileNotFoundError(f"Folder '{folder_path}' not found") + + # Iterate through all files in the folder + for filename in sorted(os.listdir(folder_path)): + file_path = os.path.join(folder_path, filename) + + # Check if it's a file (not a subdirectory) + if os.path.isfile(file_path): + # Read file in binary mode + with open(file_path, "rb") as f: + # Create BytesIO object with file content + file_bytes = BytesIO(f.read()) + # Add to dictionary with filename as key + byte_streams[filename] = file_bytes + # Reset BytesIO position to beginning + file_bytes.seek(0) + + return byte_streams + + +def detect_file_types(file_dict): + """ + Detects file types for a dictionary of {filename: BytesIO} pairs + using only magic type (content-based detection) + + Args: + file_dict (dict): Dictionary with filenames as keys and BytesIO objects as values + + Returns: + dict: Dictionary with filenames as keys and file type information as values + """ + result = {} + + for filename, byte_stream in file_dict.items(): + # Get the original position to reset later + original_position = byte_stream.tell() + + # Reset stream position to beginning + byte_stream.seek(0) + + # Get file content for analysis + file_content = byte_stream.read() + + # Use python-magic to determine file type based on content + magic_type = magic.from_buffer(file_content, mime=True) + + # Determine file category based on magic_type + if magic_type.startswith("image/"): + category = "image" + elif magic_type.startswith("audio/"): + category = "audio" + elif magic_type.startswith("video/"): + category = "video" + elif ( + magic_type.startswith("application/vnd.ms-excel") + or magic_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + ): + category = "xls" + elif ( + magic_type.startswith("application/vnd.ms-powerpoint") + or magic_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation" + ): + category = "ppt" + elif ( + magic_type.startswith("application/msword") + or magic_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + ): + category = "doc" + elif magic_type == "application/pdf": + category = "pdf" + elif magic_type.startswith("text/"): + category = "text" + else: + category = "other" + + # Store the results + result[filename] = StreamInfo(magic_type=magic_type, category=category) + + # Reset stream position + byte_stream.seek(original_position) + + return result diff --git a/packages/markitup/src/markitup/converters/__init__.py b/packages/markitup/src/markitup/converters/__init__.py index 775e4c0..a82b80b 100644 --- a/packages/markitup/src/markitup/converters/__init__.py +++ b/packages/markitup/src/markitup/converters/__init__.py @@ -8,13 +8,15 @@ from ._pdf_converter import PdfConverter from ._docx_converter import DocxConverter from ._xlsx_converter import XlsxConverter, XlsConverter from ._pptx_converter import PptxConverter -from ._audio_converter import AudioConverter +# from ._audio_converter import AudioConverter from ._csv_converter import CsvConverter +from ._markdownify import _CustomMarkdownify __all__ = [ "PlainTextConverter", "HtmlConverter", "RssConverter", + "_CustomMarkdownify", "WikipediaConverter", "YouTubeConverter", "IpynbConverter", @@ -25,7 +27,7 @@ __all__ = [ "XlsConverter", "PptxConverter", "ImageConverter", - "AudioConverter", + # "AudioConverter", "OutlookMsgConverter", "ZipConverter", "DocumentIntelligenceConverter", diff --git a/packages/markitup/src/markitup/converters/_audio_converter.py b/packages/markitup/src/markitup/converters/_audio_converter.py index 845ad5d..57a828d 100644 --- a/packages/markitup/src/markitup/converters/_audio_converter.py +++ b/packages/markitup/src/markitup/converters/_audio_converter.py @@ -2,7 +2,6 @@ import io from typing import Any, BinaryIO, Optional from ._exiftool import exiftool_metadata -from ._transcribe_audio import transcribe_audio from .._base_converter import DocumentConverter, DocumentConverterResult from .._stream_info import StreamInfo from .._exceptions import MissingDependencyException diff --git a/packages/markitup/src/markitup/converters/_html_converter.py b/packages/markitup/src/markitup/converters/_html_converter.py index dabb0d7..d4bb3aa 100644 --- a/packages/markitup/src/markitup/converters/_html_converter.py +++ b/packages/markitup/src/markitup/converters/_html_converter.py @@ -6,12 +6,12 @@ from .._base_converter import DocumentConverter, DocumentConverterResult from .._stream_info import StreamInfo from ._markdownify import _CustomMarkdownify -ACCEPTED_MIME_TYPE_PREFIXES = [ +ACCEPTED_MAGIC_TYPE_PREFIXES = [ "text/html", "application/xhtml", ] -ACCEPTED_FILE_EXTENSIONS = [ +ACCEPTED_FILE_CATEGORY = [ ".html", ".htm", ] @@ -26,14 +26,14 @@ class HtmlConverter(DocumentConverter): stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> bool: - mimetype = (stream_info.mimetype or "").lower() - extension = (stream_info.extension or "").lower() + magic_type = (stream_info.magic_type or "").lower() + category = (stream_info.category or "").lower() - if extension in ACCEPTED_FILE_EXTENSIONS: + if category in ACCEPTED_FILE_CATEGORY: return True - for prefix in ACCEPTED_MIME_TYPE_PREFIXES: - if mimetype.startswith(prefix): + for prefix in ACCEPTED_MAGIC_TYPE_PREFIXES: + if magic_type.startswith(prefix): return True return False @@ -45,7 +45,7 @@ class HtmlConverter(DocumentConverter): **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: # Parse the stream - encoding = "utf-8" if stream_info.charset is None else stream_info.charset + encoding = "utf-8" soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding) # Remove javascript and style blocks @@ -81,10 +81,8 @@ class HtmlConverter(DocumentConverter): return self.convert( file_stream=io.BytesIO(html_content.encode("utf-8")), stream_info=StreamInfo( - mimetype="text/html", - extension=".html", - charset="utf-8", - url=url, + magic_type="text/html", + category="text", ), **kwargs, ) diff --git a/packages/markitup/src/markitup/converters/_markdownify.py b/packages/markitup/src/markitup/converters/_markdownify.py new file mode 100644 index 0000000..1c386c7 --- /dev/null +++ b/packages/markitup/src/markitup/converters/_markdownify.py @@ -0,0 +1,111 @@ +import re +import markdownify + +from typing import Any, Optional +from urllib.parse import quote, unquote, urlparse, urlunparse + + +class _CustomMarkdownify(markdownify.MarkdownConverter): + """ + A custom version of markdownify's MarkdownConverter. Changes include: + + - Altering the default heading style to use '#', '##', etc. + - Removing javascript hyperlinks. + - Truncating images with large data:uri sources. + - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax + """ + + def __init__(self, **options: Any): + options["heading_style"] = options.get("heading_style", markdownify.ATX) + options["keep_data_uris"] = options.get("keep_data_uris", False) + # Explicitly cast options to the expected type if necessary + super().__init__(**options) + + def convert_hn( + self, + n: int, + el: Any, + text: str, + convert_as_inline: Optional[bool] = False, + **kwargs, + ) -> str: + """Same as usual, but be sure to start with a new line""" + if not convert_as_inline: + if not re.search(r"^\n", text): + return "\n" + super().convert_hn(n, el, text, convert_as_inline) # type: ignore + + return super().convert_hn(n, el, text, convert_as_inline) # type: ignore + + def convert_a( + self, + el: Any, + text: str, + convert_as_inline: Optional[bool] = False, + **kwargs, + ): + """Same as usual converter, but removes Javascript links and escapes URIs.""" + prefix, suffix, text = markdownify.chomp(text) # type: ignore + if not text: + return "" + + if el.find_parent("pre") is not None: + return text + + href = el.get("href") + title = el.get("title") + + # Escape URIs and skip non-http or file schemes + if href: + try: + parsed_url = urlparse(href) # type: ignore + if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]: # type: ignore + return "%s%s%s" % (prefix, text, suffix) + href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path)))) # type: ignore + except ValueError: # It's not clear if this ever gets thrown + return "%s%s%s" % (prefix, text, suffix) + + # For the replacement see #29: text nodes underscores are escaped + if ( + self.options["autolinks"] + and text.replace(r"\_", "_") == href + and not title + and not self.options["default_title"] + ): + # Shortcut syntax + return "<%s>" % href + if self.options["default_title"] and not title: + title = href + title_part = ' "%s"' % title.replace('"', r"\"") if title else "" + return ( + "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix) + if href + else text + ) + + def convert_img( + self, + el: Any, + text: str, + convert_as_inline: Optional[bool] = False, + **kwargs, + ) -> str: + """Same as usual converter, but removes data URIs""" + + alt = el.attrs.get("alt", None) or "" + src = el.attrs.get("src", None) or "" + title = el.attrs.get("title", None) or "" + title_part = ' "%s"' % title.replace('"', r"\"") if title else "" + if ( + convert_as_inline + and el.parent.name not in self.options["keep_inline_images_in"] + ): + return alt + + # Remove dataURIs + if src.startswith("data:") and not self.options["keep_data_uris"]: + src = src.split(",")[0] + "..." + + return "![%s](%s%s)" % (alt, src, title_part) + + def convert_soup(self, soup: Any) -> str: + return super().convert_soup(soup) # type: ignore \ No newline at end of file diff --git a/packages/markitup/src/markitup/converters/_pptx_converter.py b/packages/markitup/src/markitup/converters/_pptx_converter.py index 23bb7f9..d6a0b66 100644 --- a/packages/markitup/src/markitup/converters/_pptx_converter.py +++ b/packages/markitup/src/markitup/converters/_pptx_converter.py @@ -11,23 +11,14 @@ from operator import attrgetter from ._html_converter import HtmlConverter from .._base_converter import DocumentConverter, DocumentConverterResult from .._stream_info import StreamInfo -from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE - -# Try loading optional (but in this case, required) dependencies -# Save reporting of any exceptions for later -_dependency_exc_info = None -try: - import pptx -except ImportError: - # Preserve the error and stack trace for later - _dependency_exc_info = sys.exc_info() +import pptx -ACCEPTED_MIME_TYPE_PREFIXES = [ +ACCEPTED_MAGIC_TYPE_PREFIXES = [ "application/vnd.openxmlformats-officedocument.presentationml", ] -ACCEPTED_FILE_EXTENSIONS = [".pptx"] +ACCEPTED_FILE_CATEGORY = [".pptx"] class PptxConverter(DocumentConverter): @@ -45,14 +36,14 @@ class PptxConverter(DocumentConverter): stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> bool: - mimetype = (stream_info.mimetype or "").lower() - extension = (stream_info.extension or "").lower() + magic_type = (stream_info.magic_type or "").lower() + category = (stream_info.category or "").lower() - if extension in ACCEPTED_FILE_EXTENSIONS: + if category in ACCEPTED_FILE_CATEGORY: return True - for prefix in ACCEPTED_MIME_TYPE_PREFIXES: - if mimetype.startswith(prefix): + for prefix in ACCEPTED_MAGIC_TYPE_PREFIXES: + if magic_type.startswith(prefix): return True return False @@ -63,19 +54,6 @@ class PptxConverter(DocumentConverter): stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: - # Check the dependencies - if _dependency_exc_info is not None: - raise MissingDependencyException( - MISSING_DEPENDENCY_MESSAGE.format( - converter=type(self).__name__, - extension=".pptx", - feature="pptx", - ) - ) from _dependency_exc_info[ - 1 - ].with_traceback( # type: ignore[union-attr] - _dependency_exc_info[2] - ) # Perform the conversion presentation = pptx.Presentation(file_stream) @@ -109,15 +87,12 @@ class PptxConverter(DocumentConverter): alt_text = re.sub(r"\s+", " ", alt_text).strip() # If keep_data_uris is True, use base64 encoding for images - if kwargs.get("keep_data_uris", False): - blob = shape.image.blob - content_type = shape.image.content_type or "image/png" - b64_string = base64.b64encode(blob).decode("utf-8") - md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n" - else: - # A placeholder name - filename = re.sub(r"\W", "", shape.name) + ".jpg" - md_content += "\n![" + alt_text + "](" + filename + ")\n" + + blob = shape.image.blob + content_type = shape.image.content_type or "image/png" + b64_string = base64.b64encode(blob).decode("utf-8") + md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n" + # Tables if self._is_table(shape): diff --git a/packages/markitup/uv.lock b/packages/markitup/uv.lock index def8c93..67288d7 100644 --- a/packages/markitup/uv.lock +++ b/packages/markitup/uv.lock @@ -505,92 +505,49 @@ wheels = [ name = "markitup" source = { editable = "." } dependencies = [ - { name = "beautifulsoup4" }, - { name = "charset-normalizer" }, - { name = "magika" }, - { name = "markdownify" }, - { name = "requests" }, -] - -[package.optional-dependencies] -all = [ { name = "azure-ai-documentintelligence" }, { name = "azure-identity" }, + { name = "beautifulsoup4" }, + { name = "charset-normalizer" }, { name = "lxml" }, + { name = "magika" }, { name = "mammoth" }, + { name = "markdownify" }, { name = "olefile" }, { name = "openpyxl" }, { name = "pandas" }, { name = "pdfminer-six" }, { name = "pydub" }, + { name = "python-magic" }, { name = "python-pptx" }, + { name = "requests" }, { name = "speechrecognition" }, { name = "xlrd" }, { name = "youtube-transcript-api" }, ] -audio-transcription = [ - { name = "pydub" }, - { name = "speechrecognition" }, -] -docx = [ - { name = "lxml" }, - { name = "mammoth" }, -] -outlook = [ - { name = "olefile" }, -] -pdf = [ - { name = "pdfminer-six" }, -] -pptx = [ - { name = "python-pptx" }, -] -xls = [ - { name = "pandas" }, - { name = "xlrd" }, -] -xlsx = [ - { name = "openpyxl" }, - { name = "pandas" }, -] -youtube-transcription = [ - { name = "youtube-transcript-api" }, -] [package.metadata] requires-dist = [ - { name = "azure-ai-documentintelligence", marker = "extra == 'all'" }, - { name = "azure-identity", marker = "extra == 'all'" }, + { name = "azure-ai-documentintelligence" }, + { name = "azure-identity" }, { name = "beautifulsoup4" }, { name = "charset-normalizer" }, - { name = "lxml", marker = "extra == 'all'" }, - { name = "lxml", marker = "extra == 'docx'" }, + { name = "lxml" }, { name = "magika", specifier = "~=0.6.1" }, - { name = "mammoth", marker = "extra == 'all'" }, - { name = "mammoth", marker = "extra == 'docx'" }, + { name = "mammoth" }, { name = "markdownify" }, - { name = "olefile", marker = "extra == 'all'" }, - { name = "olefile", marker = "extra == 'outlook'" }, - { name = "openpyxl", marker = "extra == 'all'" }, - { name = "openpyxl", marker = "extra == 'xlsx'" }, - { name = "pandas", marker = "extra == 'all'" }, - { name = "pandas", marker = "extra == 'xls'" }, - { name = "pandas", marker = "extra == 'xlsx'" }, - { name = "pdfminer-six", marker = "extra == 'all'" }, - { name = "pdfminer-six", marker = "extra == 'pdf'" }, - { name = "pydub", marker = "extra == 'all'" }, - { name = "pydub", marker = "extra == 'audio-transcription'" }, - { name = "python-pptx", marker = "extra == 'all'" }, - { name = "python-pptx", marker = "extra == 'pptx'" }, + { name = "olefile" }, + { name = "openpyxl" }, + { name = "pandas" }, + { name = "pdfminer-six" }, + { name = "pydub" }, + { name = "python-magic", specifier = ">=0.4.27" }, + { name = "python-pptx" }, { name = "requests" }, - { name = "speechrecognition", marker = "extra == 'all'" }, - { name = "speechrecognition", marker = "extra == 'audio-transcription'" }, - { name = "xlrd", marker = "extra == 'all'" }, - { name = "xlrd", marker = "extra == 'xls'" }, - { name = "youtube-transcript-api", marker = "extra == 'all'", specifier = "~=1.0.0" }, - { name = "youtube-transcript-api", marker = "extra == 'youtube-transcription'" }, + { name = "speechrecognition" }, + { name = "xlrd" }, + { name = "youtube-transcript-api", specifier = "~=1.0.0" }, ] -provides-extras = ["all", "audio-transcription", "docx", "outlook", "pdf", "pptx", "xls", "xlsx", "youtube-transcription"] [[package]] name = "mpmath" @@ -966,6 +923,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1e/18/98a99ad95133c6a6e2005fe89faedf294a748bd5dc803008059409ac9b1e/python_dotenv-1.1.0-py3-none-any.whl", hash = "sha256:d7c01d9e2293916c18baf562d95698754b0dbbb5e74d457c45d4f6561fb9d55d", size = 20256 }, ] +[[package]] +name = "python-magic" +version = "0.4.27" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/da/db/0b3e28ac047452d079d375ec6798bf76a036a08182dbb39ed38116a49130/python-magic-0.4.27.tar.gz", hash = "sha256:c1ba14b08e4a5f5c31a302b7721239695b2f0f058d125bd5ce1ee36b9d9d3c3b", size = 14677 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6c/73/9f872cb81fc5c3bb48f7227872c28975f998f3e7c2b1c16e95e6432bbb90/python_magic-0.4.27-py2.py3-none-any.whl", hash = "sha256:c212960ad306f700aa0d01e5d7a325d20548ff97eb9920dcd29513174f0294d3", size = 13840 }, +] + [[package]] name = "python-pptx" version = "1.0.2"