Merge pull request #2 from pathintegral-institute/rong/tech-139-modality-conversion

Rong/tech 139 modality conversion
This commit is contained in:
rong-xyz 2025-04-22 19:30:24 +08:00 committed by GitHub
commit cd85971867
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
21 changed files with 257 additions and 470 deletions

View file

@ -32,8 +32,6 @@ dependencies = [
"python-pptx", "python-pptx",
"mammoth", "mammoth",
"pandas", "pandas",
"openpyxl",
"xlrd",
"lxml", "lxml",
"olefile", "olefile",
"pydub", "pydub",

View file

@ -7,9 +7,8 @@ from ._markitup import (
MarkItUp, MarkItUp,
) )
from ._base_converter import DocumentConverterResult, DocumentConverter from ._base_converter import DocumentConverterResult, DocumentConverter
from ._stream_info import StreamInfo from ._schemas import StreamInfo, Config
from ._exceptions import ( from ._exceptions import (
MarkItUpException,
MissingDependencyException, MissingDependencyException,
FailedConversionAttempt, FailedConversionAttempt,
FileConversionException, FileConversionException,
@ -21,10 +20,10 @@ __all__ = [
"MarkItUp", "MarkItUp",
"DocumentConverter", "DocumentConverter",
"DocumentConverterResult", "DocumentConverterResult",
"MarkItUpException",
"MissingDependencyException", "MissingDependencyException",
"FailedConversionAttempt", "FailedConversionAttempt",
"FileConversionException", "FileConversionException",
"UnsupportedFormatException", "UnsupportedFormatException",
"StreamInfo", "StreamInfo",
"Config"
] ]

View file

@ -2,8 +2,9 @@ import os
import tempfile import tempfile
from warnings import warn from warnings import warn
from typing import Any, Union, BinaryIO, Optional, List, Dict from typing import Any, Union, BinaryIO, Optional, List, Dict
from ._stream_info import StreamInfo from ._schemas import StreamInfo
import re import re
import base64
class DocumentConverterResult: class DocumentConverterResult:
@ -11,9 +12,11 @@ class DocumentConverterResult:
def __init__( def __init__(
self, self,
markdown: str, markdown: str = "",
*, *,
title: Optional[str] = None, title: Optional[str] = None,
audio_stream: Optional[BinaryIO] = None,
stream_info: Optional[StreamInfo] = None,
): ):
""" """
Initialize the DocumentConverterResult. Initialize the DocumentConverterResult.
@ -26,7 +29,9 @@ class DocumentConverterResult:
- title: Optional title of the document. - title: Optional title of the document.
""" """
self.markdown = markdown self.markdown = markdown
self.audio_stream = audio_stream
self.title = title self.title = title
self.stream_info = stream_info
def to_llm(self) -> List[Dict[str, Any]]: def to_llm(self) -> List[Dict[str, Any]]:
""" """
@ -40,7 +45,6 @@ class DocumentConverterResult:
(text and images) in their original order. (text and images) in their original order.
""" """
# Pattern to match markdown image syntax with base64 data # Pattern to match markdown image syntax with base64 data
pattern = r'!\[(.*?)\]\(data:(.*?);base64,(.*?)\)' pattern = r'!\[(.*?)\]\(data:(.*?);base64,(.*?)\)'
@ -80,7 +84,14 @@ class DocumentConverterResult:
"type": "text", "type": "text",
"text": text_chunk "text": text_chunk
}) })
if self.audio_stream:
audio_b64 = base64.b64encode(
self.audio_stream.read()).decode('utf-8')
content.append({
"type": "media",
"mime_type": self.stream_info.magic_type,
"data": audio_b64
})
return content return content
@property @property
@ -105,7 +116,7 @@ class DocumentConverter:
self, self,
file_stream: BinaryIO, file_stream: BinaryIO,
stream_info: StreamInfo, stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter ** kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult: ) -> DocumentConverterResult:
""" """
Convert a document to Markdown text. Convert a document to Markdown text.

View file

@ -8,15 +8,7 @@ MISSING_DEPENDENCY_MESSAGE = """{converter} recognized the input as a potential
* etc.""" * etc."""
class MarkItUpException(Exception): class MissingDependencyException(Exception):
"""
Base exception class for MarkItUp.
"""
pass
class MissingDependencyException(MarkItUpException):
""" """
Converters shipped with MarkItUp may depend on optional Converters shipped with MarkItUp may depend on optional
dependencies. This exception is thrown when a converter's dependencies. This exception is thrown when a converter's
@ -31,7 +23,7 @@ class MissingDependencyException(MarkItUpException):
pass pass
class UnsupportedFormatException(MarkItUpException): class UnsupportedFormatException(Exception):
""" """
Thrown when no suitable converter was found for the given file. Thrown when no suitable converter was found for the given file.
""" """
@ -39,17 +31,16 @@ class UnsupportedFormatException(MarkItUpException):
pass pass
class FailedConversionAttempt(object): class FailedConversionAttempt(Exception):
""" """
Represents an a single attempt to convert a file. Represents a single attempt to convert a file.
""" """
def __init__(self, converter: Any, exc_info: Optional[tuple] = None): def __init__(self):
self.converter = converter super().__init__(f"Conversion attempt failed!")
self.exc_info = exc_info
class FileConversionException(MarkItUpException): class FileConversionException(Exception):
""" """
Thrown when a suitable converter was found, but the conversion Thrown when a suitable converter was found, but the conversion
process fails for any reason. process fails for any reason.

View file

@ -4,7 +4,7 @@ from urllib.parse import urlparse
from warnings import warn from warnings import warn
import magic import magic
from ._stream_info import StreamInfo from ._schemas import StreamInfo, Config
from .converters import ( from .converters import (
PlainTextConverter, PlainTextConverter,
@ -14,7 +14,7 @@ from .converters import (
XlsxConverter, XlsxConverter,
XlsConverter, XlsConverter,
PptxConverter, PptxConverter,
# AudioConverter, AudioConverter,
CsvConverter, CsvConverter,
) )
@ -33,30 +33,42 @@ class MarkItUp:
def __init__( def __init__(
self, self,
config: Optional[Dict[str, Any]] = None, config: Config = Config(),
): ):
self.config = config self.config = config
def convert(self, stream: BinaryIO) -> Dict[DocumentConverterResult, StreamInfo]: def convert(self, stream: BinaryIO) -> Dict[DocumentConverterResult, StreamInfo]:
stream_info: StreamInfo = self._get_stream_info(stream) stream_info: StreamInfo = self._get_stream_info(stream)
# Deal with unsupported file types # Deal with unsupported file types
match stream_info.category:
case "ppt":
raise UnsupportedFormatException(".ppt files are not supported, try .pptx instead")
case "other":
raise UnsupportedFormatException(f"{stream_info.magic_type} files are not supported")
try: try:
match stream_info.category: match stream_info.category:
case "text": case "text":
return PlainTextConverter().convert(stream, stream_info), stream_info return PlainTextConverter().convert(stream, stream_info), stream_info
case "pptx": case "pptx":
return PptxConverter().convert(stream, stream_info), stream_info return PptxConverter(config=self.config).convert(stream, stream_info), stream_info
case "pdf": case "pdf":
return PdfConverter().convert(stream, stream_info), stream_info return PdfConverter(config=self.config).convert(stream, stream_info), stream_info
case "audio":
return AudioConverter(config=self.config).convert(stream, stream_info), stream_info
case "xlsx":
return XlsxConverter(config=self.config).convert(stream, stream_info), stream_info
case "xls":
return XlsConverter(config=self.config).convert(stream, stream_info), stream_info
case "csv":
return CsvConverter().convert(stream, stream_info), stream_info
case "docx":
return DocxConverter(config=self.config).convert(stream, stream_info), stream_info
case _:
match stream_info.category:
case "ppt":
raise UnsupportedFormatException(
".ppt files are not supported, try .pptx instead")
case "other":
raise UnsupportedFormatException(
f"{stream_info.magic_type} files are not supported")
except FailedConversionAttempt: except FailedConversionAttempt:
raise FileConversionException(f"Failed to convert file of type {stream_info.magic_type}") raise FileConversionException(
return stream_info f"Failed to convert file of type {stream_info.magic_type}")
def _get_stream_info(self, byte_stream: BinaryIO) -> StreamInfo: def _get_stream_info(self, byte_stream: BinaryIO) -> StreamInfo:
original_position = byte_stream.tell() original_position = byte_stream.tell()
@ -91,7 +103,12 @@ class MarkItUp:
category = "docx" category = "docx"
elif magic_type == "application/pdf": elif magic_type == "application/pdf":
category = "pdf" category = "pdf"
elif magic_type == "application/csv":
category = "csv"
elif magic_type.startswith("text/"): elif magic_type.startswith("text/"):
if magic_type == "text/csv":
category = "csv"
else:
category = "text" category = "text"
else: else:
category = "other" category = "other"

View file

@ -0,0 +1,15 @@
from dataclasses import dataclass, asdict, field
from typing import Optional, List, Literal
@dataclass
class StreamInfo:
    """Content-detection result describing a byte stream.

    Both fields are optional and default to ``None``.
    """

    # MIME type string for the stream, e.g. "application/pdf" or "text/csv".
    magic_type: Optional[str] = None
    # Coarse label derived from the MIME type, e.g. "pdf", "csv", "text",
    # "other"; presumably used to pick a converter — verify against caller.
    category: Optional[str] = None
@dataclass
class Config:
    """Conversion options.

    ``modalities`` lists the non-text modalities ("image", "audio") that
    are enabled; by default both are.
    """

    # default_factory builds a fresh list per instance, so mutating one
    # Config's modalities never affects another Config.
    modalities: List[Literal["image", "audio"]] = field(
        default_factory=lambda: ["image", "audio"]
    )

View file

@ -1,8 +0,0 @@
from dataclasses import dataclass, asdict
from typing import Optional
@dataclass
class StreamInfo:
magic_type: Optional[str] = None
category: Optional[str] = None

View file

@ -1,7 +1,11 @@
import os import os
from io import BytesIO from io import BytesIO
from markitup._stream_info import StreamInfo from markitup._schemas import StreamInfo
import magic import magic
import speech_recognition as sr
import pydub
import io
from typing import BinaryIO
def read_files_to_bytestreams(folder_path="packages/markitup/tests/test_files"): def read_files_to_bytestreams(folder_path="packages/markitup/tests/test_files"):
@ -38,65 +42,23 @@ def read_files_to_bytestreams(folder_path="packages/markitup/tests/test_files"):
return byte_streams return byte_streams
def detect_file_types(file_dict): def transcribe_audio(file_stream: BinaryIO, *, magic_type: str = "audio/mpeg") -> str:
""" audio_format = 'mp3' if magic_type == 'audio/mpeg' else 'wav' if magic_type == 'audio/x-wav' else None
Detects file types for a dictionary of {filename: BytesIO} pairs
using only magic type (content-based detection)
Args: match audio_format:
file_dict (dict): Dictionary with filenames as keys and BytesIO objects as values case 'mp3':
audio_segment = pydub.AudioSegment.from_file(
file_stream, format=audio_format)
audio_source = io.BytesIO()
audio_segment.export(audio_source, format="wav")
audio_source.seek(0)
case 'wav':
audio_source = file_stream
case _:
raise ValueError(f"Unsupported audio format: {magic_type}")
Returns: recognizer = sr.Recognizer()
dict: Dictionary with filenames as keys and file type information as values with sr.AudioFile(audio_source) as source:
""" audio = recognizer.record(source)
result = {} transcript = recognizer.recognize_google(audio).strip()
return "[No speech detected]" if transcript == "" else transcript
for filename, byte_stream in file_dict.items():
# Get the original position to reset later
original_position = byte_stream.tell()
# Reset stream position to beginning
byte_stream.seek(0)
# Get file content for analysis
file_content = byte_stream.read()
# Use python-magic to determine file type based on content
magic_type = magic.from_buffer(file_content, mime=True)
# Determine file category based on magic_type
if magic_type.startswith("image/"):
category = "image"
elif magic_type.startswith("audio/"):
category = "audio"
elif magic_type.startswith("video/"):
category = "video"
elif (
magic_type.startswith("application/vnd.ms-excel")
or magic_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
):
category = "xls"
elif (
magic_type.startswith("application/vnd.ms-powerpoint")
or magic_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
):
category = "ppt"
elif (
magic_type.startswith("application/msword")
or magic_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
):
category = "doc"
elif magic_type == "application/pdf":
category = "pdf"
elif magic_type.startswith("text/"):
category = "text"
else:
category = "other"
# Store the results
result[filename] = StreamInfo(magic_type=magic_type, category=category)
# Reset stream position
byte_stream.seek(original_position)
return result

View file

@ -8,7 +8,7 @@ from ._pdf_converter import PdfConverter
from ._docx_converter import DocxConverter from ._docx_converter import DocxConverter
from ._xlsx_converter import XlsxConverter, XlsConverter from ._xlsx_converter import XlsxConverter, XlsConverter
from ._pptx_converter import PptxConverter from ._pptx_converter import PptxConverter
# from ._audio_converter import AudioConverter from ._audio_converter import AudioConverter
from ._csv_converter import CsvConverter from ._csv_converter import CsvConverter
from ._markdownify import _CustomMarkdownify from ._markdownify import _CustomMarkdownify
@ -27,7 +27,7 @@ __all__ = [
"XlsConverter", "XlsConverter",
"PptxConverter", "PptxConverter",
"ImageConverter", "ImageConverter",
# "AudioConverter", "AudioConverter",
"OutlookMsgConverter", "OutlookMsgConverter",
"ZipConverter", "ZipConverter",
"DocumentIntelligenceConverter", "DocumentIntelligenceConverter",

View file

@ -1,23 +1,10 @@
import io import io
from typing import Any, BinaryIO, Optional from typing import Any, BinaryIO, Optional, Tuple
from ._exiftool import exiftool_metadata
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo from .._schemas import StreamInfo, Config
from .._exceptions import MissingDependencyException from .._exceptions import MissingDependencyException
from ..converter_utils.utils import transcribe_audio
ACCEPTED_MIME_TYPE_PREFIXES = [
"audio/x-wav",
"audio/mpeg",
"video/mp4",
]
ACCEPTED_FILE_EXTENSIONS = [
".wav",
".mp3",
".m4a",
".mp4",
]
class AudioConverter(DocumentConverter): class AudioConverter(DocumentConverter):
@ -25,77 +12,25 @@ class AudioConverter(DocumentConverter):
Converts audio files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed). Converts audio files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
""" """
def accepts( def __init__(self, config: Config):
self, self.config = config
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
if extension in ACCEPTED_FILE_EXTENSIONS:
return True
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return True
return False
def convert( def convert(
self, self,
file_stream: BinaryIO, file_stream: BinaryIO,
stream_info: StreamInfo, stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter ** kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult: ) -> Tuple[DocumentConverterResult, StreamInfo]:
md_content = "" md_content = ""
# Add metadata
metadata = exiftool_metadata(
file_stream, exiftool_path=kwargs.get("exiftool_path")
)
if metadata:
for f in [
"Title",
"Artist",
"Author",
"Band",
"Album",
"Genre",
"Track",
"DateTimeOriginal",
"CreateDate",
# "Duration", -- Wrong values when read from memory
"NumChannels",
"SampleRate",
"AvgBytesPerSec",
"BitsPerSample",
]:
if f in metadata:
md_content += f"{f}: {metadata[f]}\n"
# Figure out the audio format for transcription
if stream_info.extension == ".wav" or stream_info.mimetype == "audio/x-wav":
audio_format = "wav"
elif stream_info.extension == ".mp3" or stream_info.mimetype == "audio/mpeg":
audio_format = "mp3"
elif (
stream_info.extension in [".mp4", ".m4a"]
or stream_info.mimetype == "video/mp4"
):
audio_format = "mp4"
else:
audio_format = None
# Transcribe # Transcribe
if audio_format: if 'audio' not in self.config.modalities:
try: transcript = transcribe_audio(
transcript = transcribe_audio(file_stream, audio_format=audio_format) file_stream, magic_type=stream_info.magic_type)
if transcript: if transcript:
md_content += "\n\n### Audio Transcript:\n" + transcript md_content += "\n\n### Audio Transcript:\n" + transcript
except MissingDependencyException: return DocumentConverterResult(markdown=md_content.strip())
pass else:
return DocumentConverterResult(audio_stream=file_stream, stream_info=stream_info)
# Return the result # Return the result
return DocumentConverterResult(markdown=md_content.strip())

View file

@ -3,15 +3,8 @@ import csv
import io import io
from typing import BinaryIO, Any from typing import BinaryIO, Any
from charset_normalizer import from_bytes from charset_normalizer import from_bytes
from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo from .._schemas import StreamInfo
ACCEPTED_MIME_TYPE_PREFIXES = [
"text/csv",
"application/csv",
]
ACCEPTED_FILE_EXTENSIONS = [".csv"]
class CsvConverter(DocumentConverter): class CsvConverter(DocumentConverter):
@ -19,24 +12,6 @@ class CsvConverter(DocumentConverter):
Converts CSV files to Markdown tables. Converts CSV files to Markdown tables.
""" """
def __init__(self):
super().__init__()
def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
if extension in ACCEPTED_FILE_EXTENSIONS:
return True
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return True
return False
def convert( def convert(
self, self,
file_stream: BinaryIO, file_stream: BinaryIO,
@ -44,9 +19,6 @@ class CsvConverter(DocumentConverter):
**kwargs: Any, # Options to pass to the converter **kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult: ) -> DocumentConverterResult:
# Read the file content # Read the file content
if stream_info.charset:
content = file_stream.read().decode(stream_info.charset)
else:
content = str(from_bytes(file_stream.read()).best()) content = str(from_bytes(file_stream.read()).best())
# Parse CSV content # Parse CSV content

View file

@ -5,24 +5,8 @@ from typing import BinaryIO, Any
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
from ..converter_utils.docx.pre_process import pre_process_docx from ..converter_utils.docx.pre_process import pre_process_docx
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo from .._schemas import StreamInfo, Config
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE import mammoth
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_dependency_exc_info = None
try:
import mammoth
except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()
ACCEPTED_MIME_TYPE_PREFIXES = [
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
]
ACCEPTED_FILE_EXTENSIONS = [".docx"]
class DocxConverter(HtmlConverter): class DocxConverter(HtmlConverter):
@ -30,27 +14,8 @@ class DocxConverter(HtmlConverter):
Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible. Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
""" """
def __init__(self): def __init__(self, config: Config):
super().__init__() self._html_converter = HtmlConverter(config=config)
self._html_converter = HtmlConverter()
def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
if extension in ACCEPTED_FILE_EXTENSIONS:
return True
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return True
return False
def convert( def convert(
self, self,
@ -58,23 +23,11 @@ class DocxConverter(HtmlConverter):
stream_info: StreamInfo, stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter **kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult: ) -> DocumentConverterResult:
# Check: the dependencies
if _dependency_exc_info is not None:
raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format(
converter=type(self).__name__,
extension=".docx",
feature="docx",
)
) from _dependency_exc_info[
1
].with_traceback( # type: ignore[union-attr]
_dependency_exc_info[2]
)
style_map = kwargs.get("style_map", None) style_map = kwargs.get("style_map", None)
pre_process_stream = pre_process_docx(file_stream) pre_process_stream = pre_process_docx(file_stream)
return self._html_converter.convert_string( return self._html_converter.convert_string(
mammoth.convert_to_html(pre_process_stream, style_map=style_map).value, mammoth.convert_to_html(
pre_process_stream, style_map=style_map).value,
**kwargs, **kwargs,
) )

View file

@ -3,7 +3,7 @@ from typing import Any, BinaryIO, Optional
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo from .._schemas import StreamInfo, Config
from ._markdownify import _CustomMarkdownify from ._markdownify import _CustomMarkdownify
ACCEPTED_MAGIC_TYPE_PREFIXES = [ ACCEPTED_MAGIC_TYPE_PREFIXES = [
@ -19,6 +19,10 @@ ACCEPTED_FILE_CATEGORY = [
class HtmlConverter(DocumentConverter): class HtmlConverter(DocumentConverter):
"""Anything with content type text/html""" """Anything with content type text/html"""
def __init__(self, config: Config):
self.config = config
def convert( def convert(
self, self,
file_stream: BinaryIO, file_stream: BinaryIO,
@ -27,7 +31,8 @@ class HtmlConverter(DocumentConverter):
) -> DocumentConverterResult: ) -> DocumentConverterResult:
# Parse the stream # Parse the stream
encoding = "utf-8" encoding = "utf-8"
soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding) soup = BeautifulSoup(file_stream, "html.parser",
from_encoding=encoding)
# Remove javascript and style blocks # Remove javascript and style blocks
for script in soup(["script", "style"]): for script in soup(["script", "style"]):
@ -37,15 +42,17 @@ class HtmlConverter(DocumentConverter):
body_elm = soup.find("body") body_elm = soup.find("body")
webpage_text = "" webpage_text = ""
if body_elm: if body_elm:
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm) webpage_text = _CustomMarkdownify(
config=self.config, **kwargs).convert_soup(body_elm)
else: else:
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup) webpage_text = _CustomMarkdownify(
config=self.config, **kwargs).convert_soup(soup)
assert isinstance(webpage_text, str) assert isinstance(webpage_text, str)
# remove leading and trailing \n # remove leading and trailing \n
webpage_text = webpage_text.strip() webpage_text = webpage_text.strip()
print(webpage_text)
return DocumentConverterResult( return DocumentConverterResult(
markdown=webpage_text, markdown=webpage_text,
title=None if soup.title is None else soup.title.string, title=None if soup.title is None else soup.title.string,

View file

@ -3,6 +3,7 @@ import markdownify
from typing import Any, Optional from typing import Any, Optional
from urllib.parse import quote, unquote, urlparse, urlunparse from urllib.parse import quote, unquote, urlparse, urlunparse
from .._schemas import Config
class _CustomMarkdownify(markdownify.MarkdownConverter): class _CustomMarkdownify(markdownify.MarkdownConverter):
@ -15,11 +16,13 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
- Ensuring URIs are properly escaped, and do not conflict with Markdown syntax - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
""" """
def __init__(self, **options: Any): def __init__(self, config: Config, **options: Any):
options["heading_style"] = options.get("heading_style", markdownify.ATX) options["heading_style"] = options.get(
"heading_style", markdownify.ATX)
options["keep_data_uris"] = options.get("keep_data_uris", False) options["keep_data_uris"] = options.get("keep_data_uris", False)
# Explicitly cast options to the expected type if necessary # Explicitly cast options to the expected type if necessary
super().__init__(**options) super().__init__(**options)
self.config = config
def convert_hn( def convert_hn(
self, self,
@ -58,9 +61,11 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
if href: if href:
try: try:
parsed_url = urlparse(href) # type: ignore parsed_url = urlparse(href) # type: ignore
if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]: # type: ignore # type: ignore
if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]:
return "%s%s%s" % (prefix, text, suffix) return "%s%s%s" % (prefix, text, suffix)
href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path)))) # type: ignore href = urlunparse(parsed_url._replace(
path=quote(unquote(parsed_url.path)))) # type: ignore
except ValueError: # It's not clear if this ever gets thrown except ValueError: # It's not clear if this ever gets thrown
return "%s%s%s" % (prefix, text, suffix) return "%s%s%s" % (prefix, text, suffix)
@ -95,17 +100,11 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
src = el.attrs.get("src", None) or "" src = el.attrs.get("src", None) or ""
title = el.attrs.get("title", None) or "" title = el.attrs.get("title", None) or ""
title_part = ' "%s"' % title.replace('"', r"\"") if title else "" title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
if (
convert_as_inline
and el.parent.name not in self.options["keep_inline_images_in"]
):
return alt
# Remove dataURIs
if src.startswith("data:") and not self.options["keep_data_uris"]:
src = src.split(",")[0] + "..."
if "image" in self.config.modalities:
return "![%s](%s%s)" % (alt, src, title_part) return "![%s](%s%s)" % (alt, src, title_part)
else:
return alt
def convert_soup(self, soup: Any) -> str: def convert_soup(self, soup: Any) -> str:
return super().convert_soup(soup) # type: ignore return super().convert_soup(soup) # type: ignore

View file

@ -3,7 +3,7 @@ import io
import base64 import base64
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo from .._schemas import StreamInfo, Config
import fitz import fitz
@ -13,6 +13,9 @@ class PdfConverter(DocumentConverter):
Converts PDFs to Markdown with embedded images. Converts PDFs to Markdown with embedded images.
""" """
def __init__(self, config: Config):
self.config = config
def convert( def convert(
self, self,
file_stream: BinaryIO, file_stream: BinaryIO,
@ -36,7 +39,7 @@ class PdfConverter(DocumentConverter):
# Extract images from the page # Extract images from the page
image_list = page.get_images(full=True) image_list = page.get_images(full=True)
if 'image' in self.config.modalities:
for img_index, img_info in enumerate(image_list): for img_index, img_info in enumerate(image_list):
xref = img_info[0] # Get the image reference xref = img_info[0] # Get the image reference
base_image = doc.extract_image(xref) base_image = doc.extract_image(xref)
@ -47,16 +50,17 @@ class PdfConverter(DocumentConverter):
try: try:
# Convert image to base64 for markdown embedding # Convert image to base64 for markdown embedding
img_base64 = base64.b64encode(image_bytes).decode('utf-8') img_base64 = base64.b64encode(
image_bytes).decode('utf-8')
# Add image to markdown with a unique identifier # Add image to markdown with a unique identifier
image_count += 1 image_count += 1
markdown_content += f"![Image {image_count}](data:image/{image_ext};base64,{img_base64})\n\n" markdown_content += f"![Image {image_count}](data:image/{image_ext};base64,{img_base64})\n\n"
except Exception as e: except Exception as e:
markdown_content += f"*[Error processing image {image_count}: {str(e)}]*\n\n" markdown_content += f"*[Error processing image {image_count}: {str(e)}]*\n\n"
else:
markdown_content += f"{len(image_list)} images not shown here due to model not supporting image input\n\n"
# Close the document to free resources # Close the document to free resources
doc.close() doc.close()
print(markdown_content)
return DocumentConverterResult( return DocumentConverterResult(
markdown=markdown_content, markdown=markdown_content,
) )

View file

@ -1,11 +1,12 @@
from typing import BinaryIO, Any from typing import BinaryIO, Any
from charset_normalizer import from_bytes from charset_normalizer import from_bytes
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo from .._schemas import StreamInfo
class PlainTextConverter(DocumentConverter): class PlainTextConverter(DocumentConverter):
"""Anything with content type text/plain""" """Anything with content type text/plain"""
def convert( def convert(
self, self,
file_stream: BinaryIO, file_stream: BinaryIO,

View file

@ -10,7 +10,7 @@ from operator import attrgetter
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo from .._schemas import StreamInfo, Config
import pptx import pptx
@ -26,9 +26,9 @@ class PptxConverter(DocumentConverter):
Converts PPTX files to Markdown. Supports heading, tables and images with alt text. Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
""" """
def __init__(self): def __init__(self, config: Config):
super().__init__() self._html_converter = HtmlConverter(config=config)
self._html_converter = HtmlConverter() self.config = config
def convert( def convert(
self, self,
@ -58,7 +58,8 @@ class PptxConverter(DocumentConverter):
# Also grab any description embedded in the deck # Also grab any description embedded in the deck
try: try:
alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "") alt_text = shape._element._nvXxPr.cNvPr.attrib.get(
"descr", "")
except Exception: except Exception:
# Unable to get alt text # Unable to get alt text
pass pass
@ -69,16 +70,20 @@ class PptxConverter(DocumentConverter):
alt_text = re.sub(r"\s+", " ", alt_text).strip() alt_text = re.sub(r"\s+", " ", alt_text).strip()
# If keep_data_uris is True, use base64 encoding for images # If keep_data_uris is True, use base64 encoding for images
if 'image' in self.config.modalities:
blob = shape.image.blob blob = shape.image.blob
content_type = shape.image.content_type or "image/png" content_type = shape.image.content_type or "image/png"
b64_string = base64.b64encode(blob).decode("utf-8") b64_string = base64.b64encode(blob).decode("utf-8")
md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n" md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n"
else:
filename = re.sub(r"\W", "", shape.name) + ".jpg"
md_content += "\n![" + alt_text + \
"](" + filename + ")\n"
# Tables # Tables
if self._is_table(shape): if self._is_table(shape):
md_content += self._convert_table_to_markdown(shape.table, **kwargs) md_content += self._convert_table_to_markdown(
shape.table, **kwargs)
# Charts # Charts
if shape.has_chart: if shape.has_chart:
@ -93,7 +98,8 @@ class PptxConverter(DocumentConverter):
# Group Shapes # Group Shapes
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP: if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP:
sorted_shapes = sorted(shape.shapes, key=attrgetter("top", "left")) sorted_shapes = sorted(
shape.shapes, key=attrgetter("top", "left"))
for subshape in sorted_shapes: for subshape in sorted_shapes:
get_shape_content(subshape, **kwargs) get_shape_content(subshape, **kwargs)
@ -141,7 +147,8 @@ class PptxConverter(DocumentConverter):
html_table += "</table></body></html>" html_table += "</table></body></html>"
return ( return (
self._html_converter.convert_string(html_table, **kwargs).markdown.strip() self._html_converter.convert_string(
html_table, **kwargs).markdown.strip()
+ "\n" + "\n"
) )

View file

@ -1,36 +1,8 @@
import sys
from typing import BinaryIO, Any from typing import BinaryIO, Any
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE from .._schemas import StreamInfo, Config
from .._stream_info import StreamInfo import pandas as pd
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_xlsx_dependency_exc_info = None
try:
import pandas as pd
import openpyxl
except ImportError:
_xlsx_dependency_exc_info = sys.exc_info()
_xls_dependency_exc_info = None
try:
import pandas as pd
import xlrd
except ImportError:
_xls_dependency_exc_info = sys.exc_info()
ACCEPTED_XLSX_MIME_TYPE_PREFIXES = [
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
]
ACCEPTED_XLSX_FILE_EXTENSIONS = [".xlsx"]
ACCEPTED_XLS_MIME_TYPE_PREFIXES = [
"application/vnd.ms-excel",
"application/excel",
]
ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]
class XlsxConverter(DocumentConverter): class XlsxConverter(DocumentConverter):
@ -38,27 +10,8 @@ class XlsxConverter(DocumentConverter):
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table. Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
""" """
def __init__(self): def __init__(self, config: Config):
super().__init__() self._html_converter = HtmlConverter(config=config)
self._html_converter = HtmlConverter()
def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
if extension in ACCEPTED_XLSX_FILE_EXTENSIONS:
return True
for prefix in ACCEPTED_XLSX_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return True
return False
def convert( def convert(
self, self,
@ -66,20 +19,6 @@ class XlsxConverter(DocumentConverter):
stream_info: StreamInfo, stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter **kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult: ) -> DocumentConverterResult:
# Check the dependencies
if _xlsx_dependency_exc_info is not None:
raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format(
converter=type(self).__name__,
extension=".xlsx",
feature="xlsx",
)
) from _xlsx_dependency_exc_info[
1
].with_traceback( # type: ignore[union-attr]
_xlsx_dependency_exc_info[2]
)
sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl") sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
md_content = "" md_content = ""
for s in sheets: for s in sheets:
@ -100,27 +39,8 @@ class XlsConverter(DocumentConverter):
Converts XLS files to Markdown, with each sheet presented as a separate Markdown table. Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
""" """
def __init__(self): def __init__(self, config: Config):
super().__init__() self._html_converter = HtmlConverter(config=config)
self._html_converter = HtmlConverter()
def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
if extension in ACCEPTED_XLS_FILE_EXTENSIONS:
return True
for prefix in ACCEPTED_XLS_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return True
return False
def convert( def convert(
self, self,
@ -128,19 +48,6 @@ class XlsConverter(DocumentConverter):
stream_info: StreamInfo, stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter **kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult: ) -> DocumentConverterResult:
# Load the dependencies
if _xls_dependency_exc_info is not None:
raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format(
converter=type(self).__name__,
extension=".xls",
feature="xls",
)
) from _xls_dependency_exc_info[
1
].with_traceback( # type: ignore[union-attr]
_xls_dependency_exc_info[2]
)
sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd") sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd")
md_content = "" md_content = ""

View file

@ -0,0 +1,51 @@
ID,Name,Age,Country,Email
1,Name_1,62,Country_1,email_1@example.com
2,Name_2,48,Country_2,email_2@example.com
3,Name_3,61,Country_3,email_3@example.com
4,Name_4,32,Country_4,email_4@example.com
5,Name_5,69,Country_5,email_5@example.com
6,Name_6,32,Country_6,email_6@example.com
7,Name_7,62,Country_7,email_7@example.com
8,Name_8,39,Country_8,email_8@example.com
9,Name_9,40,Country_9,email_9@example.com
10,Name_10,32,Country_0,email_10@example.com
11,Name_11,24,Country_1,email_11@example.com
12,Name_12,45,Country_2,email_12@example.com
13,Name_13,39,Country_3,email_13@example.com
14,Name_14,18,Country_4,email_14@example.com
15,Name_15,66,Country_5,email_15@example.com
16,Name_16,48,Country_6,email_16@example.com
17,Name_17,60,Country_7,email_17@example.com
18,Name_18,31,Country_8,email_18@example.com
19,Name_19,43,Country_9,email_19@example.com
20,Name_20,33,Country_0,email_20@example.com
21,Name_21,32,Country_1,email_21@example.com
22,Name_22,68,Country_2,email_22@example.com
23,Name_23,44,Country_3,email_23@example.com
24,Name_24,32,Country_4,email_24@example.com
25,Name_25,33,Country_5,email_25@example.com
26,Name_26,46,Country_6,email_26@example.com
27,Name_27,38,Country_7,email_27@example.com
28,Name_28,50,Country_8,email_28@example.com
29,Name_29,68,Country_9,email_29@example.com
30,Name_30,66,Country_0,email_30@example.com
31,Name_31,60,Country_1,email_31@example.com
32,Name_32,53,Country_2,email_32@example.com
33,Name_33,30,Country_3,email_33@example.com
34,Name_34,30,Country_4,email_34@example.com
35,Name_35,43,Country_5,email_35@example.com
36,Name_36,44,Country_6,email_36@example.com
37,Name_37,31,Country_7,email_37@example.com
38,Name_38,35,Country_8,email_38@example.com
39,Name_39,56,Country_9,email_39@example.com
40,Name_40,35,Country_0,email_40@example.com
41,Name_41,62,Country_1,email_41@example.com
42,Name_42,63,Country_2,email_42@example.com
43,Name_43,51,Country_3,email_43@example.com
44,Name_44,52,Country_4,email_44@example.com
45,Name_45,66,Country_5,email_45@example.com
46,Name_46,69,Country_6,email_46@example.com
47,Name_47,68,Country_7,email_47@example.com
48,Name_48,68,Country_8,email_48@example.com
49,Name_49,69,Country_9,email_49@example.com
50,Name_50,46,Country_0,email_50@example.com
1 ID Name Age Country Email
2 1 Name_1 62 Country_1 email_1@example.com
3 2 Name_2 48 Country_2 email_2@example.com
4 3 Name_3 61 Country_3 email_3@example.com
5 4 Name_4 32 Country_4 email_4@example.com
6 5 Name_5 69 Country_5 email_5@example.com
7 6 Name_6 32 Country_6 email_6@example.com
8 7 Name_7 62 Country_7 email_7@example.com
9 8 Name_8 39 Country_8 email_8@example.com
10 9 Name_9 40 Country_9 email_9@example.com
11 10 Name_10 32 Country_0 email_10@example.com
12 11 Name_11 24 Country_1 email_11@example.com
13 12 Name_12 45 Country_2 email_12@example.com
14 13 Name_13 39 Country_3 email_13@example.com
15 14 Name_14 18 Country_4 email_14@example.com
16 15 Name_15 66 Country_5 email_15@example.com
17 16 Name_16 48 Country_6 email_16@example.com
18 17 Name_17 60 Country_7 email_17@example.com
19 18 Name_18 31 Country_8 email_18@example.com
20 19 Name_19 43 Country_9 email_19@example.com
21 20 Name_20 33 Country_0 email_20@example.com
22 21 Name_21 32 Country_1 email_21@example.com
23 22 Name_22 68 Country_2 email_22@example.com
24 23 Name_23 44 Country_3 email_23@example.com
25 24 Name_24 32 Country_4 email_24@example.com
26 25 Name_25 33 Country_5 email_25@example.com
27 26 Name_26 46 Country_6 email_26@example.com
28 27 Name_27 38 Country_7 email_27@example.com
29 28 Name_28 50 Country_8 email_28@example.com
30 29 Name_29 68 Country_9 email_29@example.com
31 30 Name_30 66 Country_0 email_30@example.com
32 31 Name_31 60 Country_1 email_31@example.com
33 32 Name_32 53 Country_2 email_32@example.com
34 33 Name_33 30 Country_3 email_33@example.com
35 34 Name_34 30 Country_4 email_34@example.com
36 35 Name_35 43 Country_5 email_35@example.com
37 36 Name_36 44 Country_6 email_36@example.com
38 37 Name_37 31 Country_7 email_37@example.com
39 38 Name_38 35 Country_8 email_38@example.com
40 39 Name_39 56 Country_9 email_39@example.com
41 40 Name_40 35 Country_0 email_40@example.com
42 41 Name_41 62 Country_1 email_41@example.com
43 42 Name_42 63 Country_2 email_42@example.com
44 43 Name_43 51 Country_3 email_43@example.com
45 44 Name_44 52 Country_4 email_44@example.com
46 45 Name_45 66 Country_5 email_45@example.com
47 46 Name_46 69 Country_6 email_46@example.com
48 47 Name_47 68 Country_7 email_47@example.com
49 48 Name_48 68 Country_8 email_48@example.com
50 49 Name_49 69 Country_9 email_49@example.com
51 50 Name_50 46 Country_0 email_50@example.com

BIN
packages/markitup/tests/test_files/test.docx Executable file → Normal file

Binary file not shown.

View file

@ -173,15 +173,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934", size = 46018 }, { url = "https://files.pythonhosted.org/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934", size = 46018 },
] ]
[[package]]
name = "et-xmlfile"
version = "2.0.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059 },
]
[[package]] [[package]]
name = "flatbuffers" name = "flatbuffers"
version = "25.2.10" version = "25.2.10"
@ -348,7 +339,6 @@ dependencies = [
{ name = "mammoth" }, { name = "mammoth" },
{ name = "markdownify" }, { name = "markdownify" },
{ name = "olefile" }, { name = "olefile" },
{ name = "openpyxl" },
{ name = "pandas" }, { name = "pandas" },
{ name = "pydub" }, { name = "pydub" },
{ name = "pymupdf" }, { name = "pymupdf" },
@ -356,7 +346,6 @@ dependencies = [
{ name = "python-pptx" }, { name = "python-pptx" },
{ name = "requests" }, { name = "requests" },
{ name = "speechrecognition" }, { name = "speechrecognition" },
{ name = "xlrd" },
] ]
[package.metadata] [package.metadata]
@ -368,7 +357,6 @@ requires-dist = [
{ name = "mammoth" }, { name = "mammoth" },
{ name = "markdownify" }, { name = "markdownify" },
{ name = "olefile" }, { name = "olefile" },
{ name = "openpyxl" },
{ name = "pandas" }, { name = "pandas" },
{ name = "pydub" }, { name = "pydub" },
{ name = "pymupdf", specifier = ">=1.25.5" }, { name = "pymupdf", specifier = ">=1.25.5" },
@ -376,7 +364,6 @@ requires-dist = [
{ name = "python-pptx" }, { name = "python-pptx" },
{ name = "requests" }, { name = "requests" },
{ name = "speechrecognition" }, { name = "speechrecognition" },
{ name = "xlrd" },
] ]
[[package]] [[package]]
@ -492,18 +479,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/3a/72/5ff85c540fd6a465610ce47e4cee8fccb472952fc1d589112f51ae2520a5/onnxruntime-1.21.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5c9e4571ff5b2a5d377d414bc85cd9450ba233a9a92f766493874f1093976453", size = 15990556 }, { url = "https://files.pythonhosted.org/packages/3a/72/5ff85c540fd6a465610ce47e4cee8fccb472952fc1d589112f51ae2520a5/onnxruntime-1.21.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5c9e4571ff5b2a5d377d414bc85cd9450ba233a9a92f766493874f1093976453", size = 15990556 },
] ]
[[package]]
name = "openpyxl"
version = "3.1.5"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "et-xmlfile" },
]
sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910 },
]
[[package]] [[package]]
name = "packaging" name = "packaging"
version = "25.0" version = "25.0"
@ -847,15 +822,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/6b/11/cc635220681e93a0183390e26485430ca2c7b5f9d33b15c74c2861cb8091/urllib3-2.4.0-py3-none-any.whl", hash = "sha256:4e16665048960a0900c702d4a66415956a584919c03361cac9f1df5c5dd7e813", size = 128680 }, { url = "https://files.pythonhosted.org/packages/6b/11/cc635220681e93a0183390e26485430ca2c7b5f9d33b15c74c2861cb8091/urllib3-2.4.0-py3-none-any.whl", hash = "sha256:4e16665048960a0900c702d4a66415956a584919c03361cac9f1df5c5dd7e813", size = 128680 },
] ]
[[package]]
name = "xlrd"
version = "2.0.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/a6/b3/19a2540d21dea5f908304375bd43f5ed7a4c28a370dc9122c565423e6b44/xlrd-2.0.1.tar.gz", hash = "sha256:f72f148f54442c6b056bf931dbc34f986fd0c3b0b6b5a58d013c9aef274d0c88", size = 100259 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/a6/0c/c2a72d51fe56e08a08acc85d13013558a2d793028ae7385448a6ccdfae64/xlrd-2.0.1-py2.py3-none-any.whl", hash = "sha256:6a33ee89877bd9abc1158129f6e94be74e2679636b8a205b43b85206c3f0bbdd", size = 96531 },
]
[[package]] [[package]]
name = "xlsxwriter" name = "xlsxwriter"
version = "3.2.3" version = "3.2.3"