supports pptx

This commit is contained in:
rong-xyz 2025-04-21 09:37:43 +00:00
parent 615975f918
commit 555a849a66
10 changed files with 277 additions and 169 deletions

View file

@ -28,10 +28,7 @@ dependencies = [
"markdownify",
"magika~=0.6.1",
"charset-normalizer",
]
[project.optional-dependencies]
all = [
"python-magic>=0.4.27",
"python-pptx",
"mammoth",
"pandas",
@ -46,12 +43,6 @@ all = [
"azure-ai-documentintelligence",
"azure-identity"
]
pptx = ["python-pptx"]
docx = ["mammoth", "lxml"]
xlsx = ["pandas", "openpyxl"]
xls = ["pandas", "xlrd"]
pdf = ["pdfminer.six"]
[tool.hatch.version]
path = "src/markitup/__about__.py"
@ -60,16 +51,14 @@ path = "src/markitup/__about__.py"
markitup = "markitup.__main__:main"
[tool.hatch.envs.default]
features = ["all"]
# No features needed since everything is installed by default
[tool.hatch.envs.hatch-test]
features = ["all"]
extra-dependencies = [
"openai",
]
[tool.hatch.envs.types]
features = ["all"]
extra-dependencies = [
"openai",
"mypy>=1.0.0",
@ -98,4 +87,4 @@ exclude_lines = [
]
[tool.hatch.build.targets.sdist]
only-include = ["src/markitup"]
only-include = ["src/markitup"]

View file

@ -25,22 +25,12 @@ from ._uri_utils import parse_data_uri, file_uri_to_path
from .converters import (
PlainTextConverter,
HtmlConverter,
RssConverter,
WikipediaConverter,
YouTubeConverter,
IpynbConverter,
BingSerpConverter,
PdfConverter,
DocxConverter,
XlsxConverter,
XlsConverter,
PptxConverter,
ImageConverter,
AudioConverter,
OutlookMsgConverter,
ZipConverter,
EpubConverter,
DocumentIntelligenceConverter,
# AudioConverter,
CsvConverter,
)

View file

@ -2,31 +2,7 @@ from dataclasses import dataclass, asdict
from typing import Optional
@dataclass(kw_only=True, frozen=True)
@dataclass
class StreamInfo:
"""The StreamInfo class is used to store information about a file stream.
All fields can be None, and will depend on how the stream was opened.
"""
mimetype: Optional[str] = None
extension: Optional[str] = None
charset: Optional[str] = None
filename: Optional[
str
] = None # From local path, url, or Content-Disposition header
local_path: Optional[str] = None # If read from disk
url: Optional[str] = None # If read from url
def copy_and_update(self, *args, **kwargs):
"""Copy the StreamInfo object and update it with the given StreamInfo
instance and/or other keyword arguments."""
new_info = asdict(self)
for si in args:
assert isinstance(si, StreamInfo)
new_info.update({k: v for k, v in asdict(si).items() if v is not None})
if len(kwargs) > 0:
new_info.update(kwargs)
return StreamInfo(**new_info)
magic_type: Optional[str] = None
category: Optional[str] = None

View file

@ -0,0 +1,102 @@
import os
from io import BytesIO
from markitup._stream_info import StreamInfo
import magic
def read_files_to_bytestreams(folder_path="packages/markitup/tests/test_files"):
    """Read every regular file in *folder_path* into an in-memory stream.

    Subdirectories are skipped; only top-level regular files are loaded.

    Args:
        folder_path (str): Path to the folder containing files.

    Returns:
        dict: Maps each filename to a ``BytesIO`` holding the file's bytes,
            positioned at offset 0.

    Raises:
        FileNotFoundError: If *folder_path* does not exist.
    """
    if not os.path.exists(folder_path):
        raise FileNotFoundError(f"Folder '{folder_path}' not found")

    byte_streams = {}
    # Sorted for a deterministic iteration order across platforms.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        # Only regular files are loaded; subdirectories are ignored.
        if os.path.isfile(file_path):
            with open(file_path, "rb") as f:
                # BytesIO(data) already starts at position 0, so no
                # explicit seek(0) is needed (the original had a redundant one).
                byte_streams[filename] = BytesIO(f.read())
    return byte_streams
def detect_file_types(file_dict):
    """Sniff the content of each named byte stream and classify it.

    Detection is purely content-based (python-magic); filenames are used
    only as dictionary keys, never for type inference. Each stream's
    position is restored to where the caller left it.

    Args:
        file_dict (dict): Dictionary with filenames as keys and BytesIO
            objects as values.

    Returns:
        dict: Dictionary with filenames as keys and ``StreamInfo`` values
            carrying ``magic_type`` (MIME string) and a coarse ``category``.
    """

    def classify(mime):
        # First match wins; office formats accept both the legacy
        # "vnd.ms-*" prefixes and the exact OOXML MIME types.
        if mime.startswith("image/"):
            return "image"
        if mime.startswith("audio/"):
            return "audio"
        if mime.startswith("video/"):
            return "video"
        if (
            mime.startswith("application/vnd.ms-excel")
            or mime == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        ):
            return "xls"
        if (
            mime.startswith("application/vnd.ms-powerpoint")
            or mime == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
        ):
            return "ppt"
        if (
            mime.startswith("application/msword")
            or mime == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        ):
            return "doc"
        if mime == "application/pdf":
            return "pdf"
        if mime.startswith("text/"):
            return "text"
        return "other"

    detected = {}
    for name, stream in file_dict.items():
        # Remember the caller's position, rewind, read, then restore.
        saved_position = stream.tell()
        stream.seek(0)
        payload = stream.read()
        stream.seek(saved_position)

        mime = magic.from_buffer(payload, mime=True)
        detected[name] = StreamInfo(magic_type=mime, category=classify(mime))
    return detected

View file

@ -8,13 +8,15 @@ from ._pdf_converter import PdfConverter
from ._docx_converter import DocxConverter
from ._xlsx_converter import XlsxConverter, XlsConverter
from ._pptx_converter import PptxConverter
from ._audio_converter import AudioConverter
# from ._audio_converter import AudioConverter
from ._csv_converter import CsvConverter
from ._markdownify import _CustomMarkdownify
__all__ = [
"PlainTextConverter",
"HtmlConverter",
"RssConverter",
"_CustomMarkdownify",
"WikipediaConverter",
"YouTubeConverter",
"IpynbConverter",
@ -25,7 +27,7 @@ __all__ = [
"XlsConverter",
"PptxConverter",
"ImageConverter",
"AudioConverter",
# "AudioConverter",
"OutlookMsgConverter",
"ZipConverter",
"DocumentIntelligenceConverter",

View file

@ -2,7 +2,6 @@ import io
from typing import Any, BinaryIO, Optional
from ._exiftool import exiftool_metadata
from ._transcribe_audio import transcribe_audio
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException

View file

@ -6,12 +6,12 @@ from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from ._markdownify import _CustomMarkdownify
ACCEPTED_MIME_TYPE_PREFIXES = [
ACCEPTED_MAGIC_TYPE_PREFIXES = [
"text/html",
"application/xhtml",
]
ACCEPTED_FILE_EXTENSIONS = [
ACCEPTED_FILE_CATEGORY = [
".html",
".htm",
]
@ -26,14 +26,14 @@ class HtmlConverter(DocumentConverter):
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
magic_type = (stream_info.magic_type or "").lower()
category = (stream_info.category or "").lower()
if extension in ACCEPTED_FILE_EXTENSIONS:
if category in ACCEPTED_FILE_CATEGORY:
return True
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
for prefix in ACCEPTED_MAGIC_TYPE_PREFIXES:
if magic_type.startswith(prefix):
return True
return False
@ -45,7 +45,7 @@ class HtmlConverter(DocumentConverter):
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
# Parse the stream
encoding = "utf-8" if stream_info.charset is None else stream_info.charset
encoding = "utf-8"
soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
# Remove javascript and style blocks
@ -81,10 +81,8 @@ class HtmlConverter(DocumentConverter):
return self.convert(
file_stream=io.BytesIO(html_content.encode("utf-8")),
stream_info=StreamInfo(
mimetype="text/html",
extension=".html",
charset="utf-8",
url=url,
magic_type="text/html",
category="text",
),
**kwargs,
)

View file

@ -0,0 +1,111 @@
import re
import markdownify
from typing import Any, Optional
from urllib.parse import quote, unquote, urlparse, urlunparse
class _CustomMarkdownify(markdownify.MarkdownConverter):
    """
    A custom version of markdownify's MarkdownConverter. Changes include:

    - Altering the default heading style to use '#', '##', etc.
    - Removing javascript hyperlinks.
    - Truncating images with large data:uri sources.
    - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
    """

    def __init__(self, **options: Any):
        # Default to ATX-style headings ('#', '##', ...) unless the caller
        # explicitly chose another style.
        options["heading_style"] = options.get("heading_style", markdownify.ATX)
        # Data URIs in <img> tags are truncated by default (see convert_img).
        options["keep_data_uris"] = options.get("keep_data_uris", False)
        # Explicitly cast options to the expected type if necessary
        super().__init__(**options)

    def convert_hn(
        self,
        n: int,
        el: Any,
        text: str,
        convert_as_inline: Optional[bool] = False,
        **kwargs,
    ) -> str:
        """Same as usual, but be sure to start with a new line"""
        if not convert_as_inline:
            # Only prepend a newline when the text does not already begin
            # with one, so headings never run into preceding content.
            if not re.search(r"^\n", text):
                return "\n" + super().convert_hn(n, el, text, convert_as_inline)  # type: ignore

        return super().convert_hn(n, el, text, convert_as_inline)  # type: ignore

    def convert_a(
        self,
        el: Any,
        text: str,
        convert_as_inline: Optional[bool] = False,
        **kwargs,
    ):
        """Same as usual converter, but removes Javascript links and escapes URIs."""
        prefix, suffix, text = markdownify.chomp(text)  # type: ignore
        if not text:
            return ""

        # Inside <pre> blocks, emit plain text: link syntax would corrupt
        # preformatted content.
        if el.find_parent("pre") is not None:
            return text
        href = el.get("href")
        title = el.get("title")

        # Escape URIs and skip non-http or file schemes
        if href:
            try:
                parsed_url = urlparse(href)  # type: ignore
                if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]:  # type: ignore
                    # Drop the hyperlink (e.g. javascript:) but keep its text.
                    return "%s%s%s" % (prefix, text, suffix)
                # Re-quote the path so Markdown-significant characters in the
                # URL cannot break the emitted link syntax.
                href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path))))  # type: ignore
            except ValueError:  # It's not clear if this ever gets thrown
                return "%s%s%s" % (prefix, text, suffix)

        # For the replacement see #29: text nodes underscores are escaped
        if (
            self.options["autolinks"]
            and text.replace(r"\_", "_") == href
            and not title
            and not self.options["default_title"]
        ):
            # Shortcut syntax
            return "<%s>" % href
        if self.options["default_title"] and not title:
            title = href
        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
        return (
            "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix)
            if href
            else text
        )

    def convert_img(
        self,
        el: Any,
        text: str,
        convert_as_inline: Optional[bool] = False,
        **kwargs,
    ) -> str:
        """Same as usual converter, but removes data URIs"""

        alt = el.attrs.get("alt", None) or ""
        src = el.attrs.get("src", None) or ""
        title = el.attrs.get("title", None) or ""
        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
        # In inline context, images not whitelisted by keep_inline_images_in
        # collapse to their alt text.
        if (
            convert_as_inline
            and el.parent.name not in self.options["keep_inline_images_in"]
        ):
            return alt

        # Remove dataURIs
        if src.startswith("data:") and not self.options["keep_data_uris"]:
            # Keep only the "data:<mediatype>" prefix so output stays compact.
            src = src.split(",")[0] + "..."

        return "![%s](%s%s)" % (alt, src, title_part)

    def convert_soup(self, soup: Any) -> str:
        # Thin pass-through to the base implementation.
        return super().convert_soup(soup)  # type: ignore

View file

@ -11,23 +11,14 @@ from operator import attrgetter
from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_dependency_exc_info = None
try:
import pptx
except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()
import pptx
ACCEPTED_MIME_TYPE_PREFIXES = [
ACCEPTED_MAGIC_TYPE_PREFIXES = [
"application/vnd.openxmlformats-officedocument.presentationml",
]
ACCEPTED_FILE_EXTENSIONS = [".pptx"]
ACCEPTED_FILE_CATEGORY = [".pptx"]
class PptxConverter(DocumentConverter):
@ -45,14 +36,14 @@ class PptxConverter(DocumentConverter):
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
magic_type = (stream_info.magic_type or "").lower()
category = (stream_info.category or "").lower()
if extension in ACCEPTED_FILE_EXTENSIONS:
if category in ACCEPTED_FILE_CATEGORY:
return True
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
for prefix in ACCEPTED_MAGIC_TYPE_PREFIXES:
if magic_type.startswith(prefix):
return True
return False
@ -63,19 +54,6 @@ class PptxConverter(DocumentConverter):
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
# Check the dependencies
if _dependency_exc_info is not None:
raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format(
converter=type(self).__name__,
extension=".pptx",
feature="pptx",
)
) from _dependency_exc_info[
1
].with_traceback( # type: ignore[union-attr]
_dependency_exc_info[2]
)
# Perform the conversion
presentation = pptx.Presentation(file_stream)
@ -109,15 +87,12 @@ class PptxConverter(DocumentConverter):
alt_text = re.sub(r"\s+", " ", alt_text).strip()
# If keep_data_uris is True, use base64 encoding for images
if kwargs.get("keep_data_uris", False):
blob = shape.image.blob
content_type = shape.image.content_type or "image/png"
b64_string = base64.b64encode(blob).decode("utf-8")
md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n"
else:
# A placeholder name
filename = re.sub(r"\W", "", shape.name) + ".jpg"
md_content += "\n![" + alt_text + "](" + filename + ")\n"
blob = shape.image.blob
content_type = shape.image.content_type or "image/png"
b64_string = base64.b64encode(blob).decode("utf-8")
md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n"
# Tables
if self._is_table(shape):

View file

@ -505,92 +505,49 @@ wheels = [
name = "markitup"
source = { editable = "." }
dependencies = [
{ name = "beautifulsoup4" },
{ name = "charset-normalizer" },
{ name = "magika" },
{ name = "markdownify" },
{ name = "requests" },
]
[package.optional-dependencies]
all = [
{ name = "azure-ai-documentintelligence" },
{ name = "azure-identity" },
{ name = "beautifulsoup4" },
{ name = "charset-normalizer" },
{ name = "lxml" },
{ name = "magika" },
{ name = "mammoth" },
{ name = "markdownify" },
{ name = "olefile" },
{ name = "openpyxl" },
{ name = "pandas" },
{ name = "pdfminer-six" },
{ name = "pydub" },
{ name = "python-magic" },
{ name = "python-pptx" },
{ name = "requests" },
{ name = "speechrecognition" },
{ name = "xlrd" },
{ name = "youtube-transcript-api" },
]
audio-transcription = [
{ name = "pydub" },
{ name = "speechrecognition" },
]
docx = [
{ name = "lxml" },
{ name = "mammoth" },
]
outlook = [
{ name = "olefile" },
]
pdf = [
{ name = "pdfminer-six" },
]
pptx = [
{ name = "python-pptx" },
]
xls = [
{ name = "pandas" },
{ name = "xlrd" },
]
xlsx = [
{ name = "openpyxl" },
{ name = "pandas" },
]
youtube-transcription = [
{ name = "youtube-transcript-api" },
]
[package.metadata]
requires-dist = [
{ name = "azure-ai-documentintelligence", marker = "extra == 'all'" },
{ name = "azure-identity", marker = "extra == 'all'" },
{ name = "azure-ai-documentintelligence" },
{ name = "azure-identity" },
{ name = "beautifulsoup4" },
{ name = "charset-normalizer" },
{ name = "lxml", marker = "extra == 'all'" },
{ name = "lxml", marker = "extra == 'docx'" },
{ name = "lxml" },
{ name = "magika", specifier = "~=0.6.1" },
{ name = "mammoth", marker = "extra == 'all'" },
{ name = "mammoth", marker = "extra == 'docx'" },
{ name = "mammoth" },
{ name = "markdownify" },
{ name = "olefile", marker = "extra == 'all'" },
{ name = "olefile", marker = "extra == 'outlook'" },
{ name = "openpyxl", marker = "extra == 'all'" },
{ name = "openpyxl", marker = "extra == 'xlsx'" },
{ name = "pandas", marker = "extra == 'all'" },
{ name = "pandas", marker = "extra == 'xls'" },
{ name = "pandas", marker = "extra == 'xlsx'" },
{ name = "pdfminer-six", marker = "extra == 'all'" },
{ name = "pdfminer-six", marker = "extra == 'pdf'" },
{ name = "pydub", marker = "extra == 'all'" },
{ name = "pydub", marker = "extra == 'audio-transcription'" },
{ name = "python-pptx", marker = "extra == 'all'" },
{ name = "python-pptx", marker = "extra == 'pptx'" },
{ name = "olefile" },
{ name = "openpyxl" },
{ name = "pandas" },
{ name = "pdfminer-six" },
{ name = "pydub" },
{ name = "python-magic", specifier = ">=0.4.27" },
{ name = "python-pptx" },
{ name = "requests" },
{ name = "speechrecognition", marker = "extra == 'all'" },
{ name = "speechrecognition", marker = "extra == 'audio-transcription'" },
{ name = "xlrd", marker = "extra == 'all'" },
{ name = "xlrd", marker = "extra == 'xls'" },
{ name = "youtube-transcript-api", marker = "extra == 'all'", specifier = "~=1.0.0" },
{ name = "youtube-transcript-api", marker = "extra == 'youtube-transcription'" },
{ name = "speechrecognition" },
{ name = "xlrd" },
{ name = "youtube-transcript-api", specifier = "~=1.0.0" },
]
provides-extras = ["all", "audio-transcription", "docx", "outlook", "pdf", "pptx", "xls", "xlsx", "youtube-transcription"]
[[package]]
name = "mpmath"
@ -966,6 +923,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/1e/18/98a99ad95133c6a6e2005fe89faedf294a748bd5dc803008059409ac9b1e/python_dotenv-1.1.0-py3-none-any.whl", hash = "sha256:d7c01d9e2293916c18baf562d95698754b0dbbb5e74d457c45d4f6561fb9d55d", size = 20256 },
]
[[package]]
name = "python-magic"
version = "0.4.27"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/da/db/0b3e28ac047452d079d375ec6798bf76a036a08182dbb39ed38116a49130/python-magic-0.4.27.tar.gz", hash = "sha256:c1ba14b08e4a5f5c31a302b7721239695b2f0f058d125bd5ce1ee36b9d9d3c3b", size = 14677 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/6c/73/9f872cb81fc5c3bb48f7227872c28975f998f3e7c2b1c16e95e6432bbb90/python_magic-0.4.27-py2.py3-none-any.whl", hash = "sha256:c212960ad306f700aa0d01e5d7a325d20548ff97eb9920dcd29513174f0294d3", size = 13840 },
]
[[package]]
name = "python-pptx"
version = "1.0.2"