Merge pull request #2 from pathintegral-institute/rong/tech-139-modality-conversion
Rong/tech 139 modality conversion
This commit is contained in:
commit
cd85971867
21 changed files with 257 additions and 470 deletions
|
|
@ -32,8 +32,6 @@ dependencies = [
|
||||||
"python-pptx",
|
"python-pptx",
|
||||||
"mammoth",
|
"mammoth",
|
||||||
"pandas",
|
"pandas",
|
||||||
"openpyxl",
|
|
||||||
"xlrd",
|
|
||||||
"lxml",
|
"lxml",
|
||||||
"olefile",
|
"olefile",
|
||||||
"pydub",
|
"pydub",
|
||||||
|
|
|
||||||
|
|
@ -7,9 +7,8 @@ from ._markitup import (
|
||||||
MarkItUp,
|
MarkItUp,
|
||||||
)
|
)
|
||||||
from ._base_converter import DocumentConverterResult, DocumentConverter
|
from ._base_converter import DocumentConverterResult, DocumentConverter
|
||||||
from ._stream_info import StreamInfo
|
from ._schemas import StreamInfo, Config
|
||||||
from ._exceptions import (
|
from ._exceptions import (
|
||||||
MarkItUpException,
|
|
||||||
MissingDependencyException,
|
MissingDependencyException,
|
||||||
FailedConversionAttempt,
|
FailedConversionAttempt,
|
||||||
FileConversionException,
|
FileConversionException,
|
||||||
|
|
@ -21,10 +20,10 @@ __all__ = [
|
||||||
"MarkItUp",
|
"MarkItUp",
|
||||||
"DocumentConverter",
|
"DocumentConverter",
|
||||||
"DocumentConverterResult",
|
"DocumentConverterResult",
|
||||||
"MarkItUpException",
|
|
||||||
"MissingDependencyException",
|
"MissingDependencyException",
|
||||||
"FailedConversionAttempt",
|
"FailedConversionAttempt",
|
||||||
"FileConversionException",
|
"FileConversionException",
|
||||||
"UnsupportedFormatException",
|
"UnsupportedFormatException",
|
||||||
"StreamInfo",
|
"StreamInfo",
|
||||||
|
"Config"
|
||||||
]
|
]
|
||||||
|
|
|
||||||
|
|
@ -2,8 +2,9 @@ import os
|
||||||
import tempfile
|
import tempfile
|
||||||
from warnings import warn
|
from warnings import warn
|
||||||
from typing import Any, Union, BinaryIO, Optional, List, Dict
|
from typing import Any, Union, BinaryIO, Optional, List, Dict
|
||||||
from ._stream_info import StreamInfo
|
from ._schemas import StreamInfo
|
||||||
import re
|
import re
|
||||||
|
import base64
|
||||||
|
|
||||||
|
|
||||||
class DocumentConverterResult:
|
class DocumentConverterResult:
|
||||||
|
|
@ -11,9 +12,11 @@ class DocumentConverterResult:
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
markdown: str,
|
markdown: str = "",
|
||||||
*,
|
*,
|
||||||
title: Optional[str] = None,
|
title: Optional[str] = None,
|
||||||
|
audio_stream: Optional[BinaryIO] = None,
|
||||||
|
stream_info: Optional[StreamInfo] = None,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Initialize the DocumentConverterResult.
|
Initialize the DocumentConverterResult.
|
||||||
|
|
@ -26,7 +29,9 @@ class DocumentConverterResult:
|
||||||
- title: Optional title of the document.
|
- title: Optional title of the document.
|
||||||
"""
|
"""
|
||||||
self.markdown = markdown
|
self.markdown = markdown
|
||||||
|
self.audio_stream = audio_stream
|
||||||
self.title = title
|
self.title = title
|
||||||
|
self.stream_info = stream_info
|
||||||
|
|
||||||
def to_llm(self) -> List[Dict[str, Any]]:
|
def to_llm(self) -> List[Dict[str, Any]]:
|
||||||
"""
|
"""
|
||||||
|
|
@ -40,7 +45,6 @@ class DocumentConverterResult:
|
||||||
(text and images) in their original order.
|
(text and images) in their original order.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
# Pattern to match markdown image syntax with base64 data
|
# Pattern to match markdown image syntax with base64 data
|
||||||
pattern = r'!\[(.*?)\]\(data:(.*?);base64,(.*?)\)'
|
pattern = r'!\[(.*?)\]\(data:(.*?);base64,(.*?)\)'
|
||||||
|
|
||||||
|
|
@ -80,7 +84,14 @@ class DocumentConverterResult:
|
||||||
"type": "text",
|
"type": "text",
|
||||||
"text": text_chunk
|
"text": text_chunk
|
||||||
})
|
})
|
||||||
|
if self.audio_stream:
|
||||||
|
audio_b64 = base64.b64encode(
|
||||||
|
self.audio_stream.read()).decode('utf-8')
|
||||||
|
content.append({
|
||||||
|
"type": "media",
|
||||||
|
"mime_type": self.stream_info.magic_type,
|
||||||
|
"data": audio_b64
|
||||||
|
})
|
||||||
return content
|
return content
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|
|
||||||
|
|
@ -8,15 +8,7 @@ MISSING_DEPENDENCY_MESSAGE = """{converter} recognized the input as a potential
|
||||||
* etc."""
|
* etc."""
|
||||||
|
|
||||||
|
|
||||||
class MarkItUpException(Exception):
|
class MissingDependencyException(Exception):
|
||||||
"""
|
|
||||||
Base exception class for MarkItUp.
|
|
||||||
"""
|
|
||||||
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class MissingDependencyException(MarkItUpException):
|
|
||||||
"""
|
"""
|
||||||
Converters shipped with MarkItUp may depend on optional
|
Converters shipped with MarkItUp may depend on optional
|
||||||
dependencies. This exception is thrown when a converter's
|
dependencies. This exception is thrown when a converter's
|
||||||
|
|
@ -31,7 +23,7 @@ class MissingDependencyException(MarkItUpException):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class UnsupportedFormatException(MarkItUpException):
|
class UnsupportedFormatException(Exception):
|
||||||
"""
|
"""
|
||||||
Thrown when no suitable converter was found for the given file.
|
Thrown when no suitable converter was found for the given file.
|
||||||
"""
|
"""
|
||||||
|
|
@ -39,17 +31,16 @@ class UnsupportedFormatException(MarkItUpException):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class FailedConversionAttempt(object):
|
class FailedConversionAttempt(Exception):
|
||||||
"""
|
"""
|
||||||
Represents an a single attempt to convert a file.
|
Represents a single attempt to convert a file.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, converter: Any, exc_info: Optional[tuple] = None):
|
def __init__(self):
|
||||||
self.converter = converter
|
super().__init__(f"Conversion attempt failed!")
|
||||||
self.exc_info = exc_info
|
|
||||||
|
|
||||||
|
|
||||||
class FileConversionException(MarkItUpException):
|
class FileConversionException(Exception):
|
||||||
"""
|
"""
|
||||||
Thrown when a suitable converter was found, but the conversion
|
Thrown when a suitable converter was found, but the conversion
|
||||||
process fails for any reason.
|
process fails for any reason.
|
||||||
|
|
|
||||||
|
|
@ -4,7 +4,7 @@ from urllib.parse import urlparse
|
||||||
from warnings import warn
|
from warnings import warn
|
||||||
import magic
|
import magic
|
||||||
|
|
||||||
from ._stream_info import StreamInfo
|
from ._schemas import StreamInfo, Config
|
||||||
|
|
||||||
from .converters import (
|
from .converters import (
|
||||||
PlainTextConverter,
|
PlainTextConverter,
|
||||||
|
|
@ -14,7 +14,7 @@ from .converters import (
|
||||||
XlsxConverter,
|
XlsxConverter,
|
||||||
XlsConverter,
|
XlsConverter,
|
||||||
PptxConverter,
|
PptxConverter,
|
||||||
# AudioConverter,
|
AudioConverter,
|
||||||
CsvConverter,
|
CsvConverter,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -33,30 +33,42 @@ class MarkItUp:
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
config: Optional[Dict[str, Any]] = None,
|
config: Config = Config(),
|
||||||
):
|
):
|
||||||
self.config = config
|
self.config = config
|
||||||
|
|
||||||
def convert(self, stream: BinaryIO) -> Dict[DocumentConverterResult, StreamInfo]:
|
def convert(self, stream: BinaryIO) -> Dict[DocumentConverterResult, StreamInfo]:
|
||||||
stream_info: StreamInfo = self._get_stream_info(stream)
|
stream_info: StreamInfo = self._get_stream_info(stream)
|
||||||
# Deal with unsupported file types
|
# Deal with unsupported file types
|
||||||
match stream_info.category:
|
|
||||||
case "ppt":
|
|
||||||
raise UnsupportedFormatException(".ppt files are not supported, try .pptx instead")
|
|
||||||
case "other":
|
|
||||||
raise UnsupportedFormatException(f"{stream_info.magic_type} files are not supported")
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
match stream_info.category:
|
match stream_info.category:
|
||||||
case "text":
|
case "text":
|
||||||
return PlainTextConverter().convert(stream, stream_info), stream_info
|
return PlainTextConverter().convert(stream, stream_info), stream_info
|
||||||
case "pptx":
|
case "pptx":
|
||||||
return PptxConverter().convert(stream, stream_info), stream_info
|
return PptxConverter(config=self.config).convert(stream, stream_info), stream_info
|
||||||
case "pdf":
|
case "pdf":
|
||||||
return PdfConverter().convert(stream, stream_info), stream_info
|
return PdfConverter(config=self.config).convert(stream, stream_info), stream_info
|
||||||
|
case "audio":
|
||||||
|
return AudioConverter(config=self.config).convert(stream, stream_info), stream_info
|
||||||
|
case "xlsx":
|
||||||
|
return XlsxConverter(config=self.config).convert(stream, stream_info), stream_info
|
||||||
|
case "xls":
|
||||||
|
return XlsConverter(config=self.config).convert(stream, stream_info), stream_info
|
||||||
|
case "csv":
|
||||||
|
return CsvConverter().convert(stream, stream_info), stream_info
|
||||||
|
case "docx":
|
||||||
|
return DocxConverter(config=self.config).convert(stream, stream_info), stream_info
|
||||||
|
case _:
|
||||||
|
match stream_info.category:
|
||||||
|
case "ppt":
|
||||||
|
raise UnsupportedFormatException(
|
||||||
|
".ppt files are not supported, try .pptx instead")
|
||||||
|
case "other":
|
||||||
|
raise UnsupportedFormatException(
|
||||||
|
f"{stream_info.magic_type} files are not supported")
|
||||||
except FailedConversionAttempt:
|
except FailedConversionAttempt:
|
||||||
raise FileConversionException(f"Failed to convert file of type {stream_info.magic_type}")
|
raise FileConversionException(
|
||||||
return stream_info
|
f"Failed to convert file of type {stream_info.magic_type}")
|
||||||
|
|
||||||
def _get_stream_info(self, byte_stream: BinaryIO) -> StreamInfo:
|
def _get_stream_info(self, byte_stream: BinaryIO) -> StreamInfo:
|
||||||
original_position = byte_stream.tell()
|
original_position = byte_stream.tell()
|
||||||
|
|
@ -91,7 +103,12 @@ class MarkItUp:
|
||||||
category = "docx"
|
category = "docx"
|
||||||
elif magic_type == "application/pdf":
|
elif magic_type == "application/pdf":
|
||||||
category = "pdf"
|
category = "pdf"
|
||||||
|
elif magic_type == "application/csv":
|
||||||
|
category = "csv"
|
||||||
elif magic_type.startswith("text/"):
|
elif magic_type.startswith("text/"):
|
||||||
|
if magic_type == "text/csv":
|
||||||
|
category = "csv"
|
||||||
|
else:
|
||||||
category = "text"
|
category = "text"
|
||||||
else:
|
else:
|
||||||
category = "other"
|
category = "other"
|
||||||
|
|
|
||||||
15
packages/markitup/src/markitup/_schemas.py
Normal file
15
packages/markitup/src/markitup/_schemas.py
Normal file
|
|
@ -0,0 +1,15 @@
|
||||||
|
from dataclasses import dataclass, asdict, field
|
||||||
|
from typing import Optional, List, Literal
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class StreamInfo:
|
||||||
|
magic_type: Optional[str] = None
|
||||||
|
category: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Config:
|
||||||
|
modalities: List[Literal["image", "audio"]] = field(
|
||||||
|
default_factory=lambda: ["image", "audio"]
|
||||||
|
)
|
||||||
|
|
@ -1,8 +0,0 @@
|
||||||
from dataclasses import dataclass, asdict
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class StreamInfo:
|
|
||||||
magic_type: Optional[str] = None
|
|
||||||
category: Optional[str] = None
|
|
||||||
|
|
@ -1,7 +1,11 @@
|
||||||
import os
|
import os
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from markitup._stream_info import StreamInfo
|
from markitup._schemas import StreamInfo
|
||||||
import magic
|
import magic
|
||||||
|
import speech_recognition as sr
|
||||||
|
import pydub
|
||||||
|
import io
|
||||||
|
from typing import BinaryIO
|
||||||
|
|
||||||
|
|
||||||
def read_files_to_bytestreams(folder_path="packages/markitup/tests/test_files"):
|
def read_files_to_bytestreams(folder_path="packages/markitup/tests/test_files"):
|
||||||
|
|
@ -38,65 +42,23 @@ def read_files_to_bytestreams(folder_path="packages/markitup/tests/test_files"):
|
||||||
return byte_streams
|
return byte_streams
|
||||||
|
|
||||||
|
|
||||||
def detect_file_types(file_dict):
|
def transcribe_audio(file_stream: BinaryIO, *, magic_type: str = "audio/mpeg") -> str:
|
||||||
"""
|
audio_format = 'mp3' if magic_type == 'audio/mpeg' else 'wav' if magic_type == 'audio/x-wav' else None
|
||||||
Detects file types for a dictionary of {filename: BytesIO} pairs
|
|
||||||
using only magic type (content-based detection)
|
|
||||||
|
|
||||||
Args:
|
match audio_format:
|
||||||
file_dict (dict): Dictionary with filenames as keys and BytesIO objects as values
|
case 'mp3':
|
||||||
|
audio_segment = pydub.AudioSegment.from_file(
|
||||||
|
file_stream, format=audio_format)
|
||||||
|
audio_source = io.BytesIO()
|
||||||
|
audio_segment.export(audio_source, format="wav")
|
||||||
|
audio_source.seek(0)
|
||||||
|
case 'wav':
|
||||||
|
audio_source = file_stream
|
||||||
|
case _:
|
||||||
|
raise ValueError(f"Unsupported audio format: {magic_type}")
|
||||||
|
|
||||||
Returns:
|
recognizer = sr.Recognizer()
|
||||||
dict: Dictionary with filenames as keys and file type information as values
|
with sr.AudioFile(audio_source) as source:
|
||||||
"""
|
audio = recognizer.record(source)
|
||||||
result = {}
|
transcript = recognizer.recognize_google(audio).strip()
|
||||||
|
return "[No speech detected]" if transcript == "" else transcript
|
||||||
for filename, byte_stream in file_dict.items():
|
|
||||||
# Get the original position to reset later
|
|
||||||
original_position = byte_stream.tell()
|
|
||||||
|
|
||||||
# Reset stream position to beginning
|
|
||||||
byte_stream.seek(0)
|
|
||||||
|
|
||||||
# Get file content for analysis
|
|
||||||
file_content = byte_stream.read()
|
|
||||||
|
|
||||||
# Use python-magic to determine file type based on content
|
|
||||||
magic_type = magic.from_buffer(file_content, mime=True)
|
|
||||||
|
|
||||||
# Determine file category based on magic_type
|
|
||||||
if magic_type.startswith("image/"):
|
|
||||||
category = "image"
|
|
||||||
elif magic_type.startswith("audio/"):
|
|
||||||
category = "audio"
|
|
||||||
elif magic_type.startswith("video/"):
|
|
||||||
category = "video"
|
|
||||||
elif (
|
|
||||||
magic_type.startswith("application/vnd.ms-excel")
|
|
||||||
or magic_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
|
||||||
):
|
|
||||||
category = "xls"
|
|
||||||
elif (
|
|
||||||
magic_type.startswith("application/vnd.ms-powerpoint")
|
|
||||||
or magic_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
|
||||||
):
|
|
||||||
category = "ppt"
|
|
||||||
elif (
|
|
||||||
magic_type.startswith("application/msword")
|
|
||||||
or magic_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
|
||||||
):
|
|
||||||
category = "doc"
|
|
||||||
elif magic_type == "application/pdf":
|
|
||||||
category = "pdf"
|
|
||||||
elif magic_type.startswith("text/"):
|
|
||||||
category = "text"
|
|
||||||
else:
|
|
||||||
category = "other"
|
|
||||||
|
|
||||||
# Store the results
|
|
||||||
result[filename] = StreamInfo(magic_type=magic_type, category=category)
|
|
||||||
|
|
||||||
# Reset stream position
|
|
||||||
byte_stream.seek(original_position)
|
|
||||||
|
|
||||||
return result
|
|
||||||
|
|
|
||||||
|
|
@ -8,7 +8,7 @@ from ._pdf_converter import PdfConverter
|
||||||
from ._docx_converter import DocxConverter
|
from ._docx_converter import DocxConverter
|
||||||
from ._xlsx_converter import XlsxConverter, XlsConverter
|
from ._xlsx_converter import XlsxConverter, XlsConverter
|
||||||
from ._pptx_converter import PptxConverter
|
from ._pptx_converter import PptxConverter
|
||||||
# from ._audio_converter import AudioConverter
|
from ._audio_converter import AudioConverter
|
||||||
from ._csv_converter import CsvConverter
|
from ._csv_converter import CsvConverter
|
||||||
from ._markdownify import _CustomMarkdownify
|
from ._markdownify import _CustomMarkdownify
|
||||||
|
|
||||||
|
|
@ -27,7 +27,7 @@ __all__ = [
|
||||||
"XlsConverter",
|
"XlsConverter",
|
||||||
"PptxConverter",
|
"PptxConverter",
|
||||||
"ImageConverter",
|
"ImageConverter",
|
||||||
# "AudioConverter",
|
"AudioConverter",
|
||||||
"OutlookMsgConverter",
|
"OutlookMsgConverter",
|
||||||
"ZipConverter",
|
"ZipConverter",
|
||||||
"DocumentIntelligenceConverter",
|
"DocumentIntelligenceConverter",
|
||||||
|
|
|
||||||
|
|
@ -1,23 +1,10 @@
|
||||||
import io
|
import io
|
||||||
from typing import Any, BinaryIO, Optional
|
from typing import Any, BinaryIO, Optional, Tuple
|
||||||
|
|
||||||
from ._exiftool import exiftool_metadata
|
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._stream_info import StreamInfo
|
from .._schemas import StreamInfo, Config
|
||||||
from .._exceptions import MissingDependencyException
|
from .._exceptions import MissingDependencyException
|
||||||
|
from ..converter_utils.utils import transcribe_audio
|
||||||
ACCEPTED_MIME_TYPE_PREFIXES = [
|
|
||||||
"audio/x-wav",
|
|
||||||
"audio/mpeg",
|
|
||||||
"video/mp4",
|
|
||||||
]
|
|
||||||
|
|
||||||
ACCEPTED_FILE_EXTENSIONS = [
|
|
||||||
".wav",
|
|
||||||
".mp3",
|
|
||||||
".m4a",
|
|
||||||
".mp4",
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
class AudioConverter(DocumentConverter):
|
class AudioConverter(DocumentConverter):
|
||||||
|
|
@ -25,77 +12,25 @@ class AudioConverter(DocumentConverter):
|
||||||
Converts audio files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
|
Converts audio files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def accepts(
|
def __init__(self, config: Config):
|
||||||
self,
|
self.config = config
|
||||||
file_stream: BinaryIO,
|
|
||||||
stream_info: StreamInfo,
|
|
||||||
**kwargs: Any, # Options to pass to the converter
|
|
||||||
) -> bool:
|
|
||||||
mimetype = (stream_info.mimetype or "").lower()
|
|
||||||
extension = (stream_info.extension or "").lower()
|
|
||||||
|
|
||||||
if extension in ACCEPTED_FILE_EXTENSIONS:
|
|
||||||
return True
|
|
||||||
|
|
||||||
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
|
|
||||||
if mimetype.startswith(prefix):
|
|
||||||
return True
|
|
||||||
|
|
||||||
return False
|
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self,
|
self,
|
||||||
file_stream: BinaryIO,
|
file_stream: BinaryIO,
|
||||||
stream_info: StreamInfo,
|
stream_info: StreamInfo,
|
||||||
** kwargs: Any, # Options to pass to the converter
|
** kwargs: Any, # Options to pass to the converter
|
||||||
) -> DocumentConverterResult:
|
) -> Tuple[DocumentConverterResult, StreamInfo]:
|
||||||
md_content = ""
|
md_content = ""
|
||||||
|
|
||||||
# Add metadata
|
|
||||||
metadata = exiftool_metadata(
|
|
||||||
file_stream, exiftool_path=kwargs.get("exiftool_path")
|
|
||||||
)
|
|
||||||
if metadata:
|
|
||||||
for f in [
|
|
||||||
"Title",
|
|
||||||
"Artist",
|
|
||||||
"Author",
|
|
||||||
"Band",
|
|
||||||
"Album",
|
|
||||||
"Genre",
|
|
||||||
"Track",
|
|
||||||
"DateTimeOriginal",
|
|
||||||
"CreateDate",
|
|
||||||
# "Duration", -- Wrong values when read from memory
|
|
||||||
"NumChannels",
|
|
||||||
"SampleRate",
|
|
||||||
"AvgBytesPerSec",
|
|
||||||
"BitsPerSample",
|
|
||||||
]:
|
|
||||||
if f in metadata:
|
|
||||||
md_content += f"{f}: {metadata[f]}\n"
|
|
||||||
|
|
||||||
# Figure out the audio format for transcription
|
|
||||||
if stream_info.extension == ".wav" or stream_info.mimetype == "audio/x-wav":
|
|
||||||
audio_format = "wav"
|
|
||||||
elif stream_info.extension == ".mp3" or stream_info.mimetype == "audio/mpeg":
|
|
||||||
audio_format = "mp3"
|
|
||||||
elif (
|
|
||||||
stream_info.extension in [".mp4", ".m4a"]
|
|
||||||
or stream_info.mimetype == "video/mp4"
|
|
||||||
):
|
|
||||||
audio_format = "mp4"
|
|
||||||
else:
|
|
||||||
audio_format = None
|
|
||||||
|
|
||||||
# Transcribe
|
# Transcribe
|
||||||
if audio_format:
|
if 'audio' not in self.config.modalities:
|
||||||
try:
|
transcript = transcribe_audio(
|
||||||
transcript = transcribe_audio(file_stream, audio_format=audio_format)
|
file_stream, magic_type=stream_info.magic_type)
|
||||||
if transcript:
|
if transcript:
|
||||||
md_content += "\n\n### Audio Transcript:\n" + transcript
|
md_content += "\n\n### Audio Transcript:\n" + transcript
|
||||||
except MissingDependencyException:
|
return DocumentConverterResult(markdown=md_content.strip())
|
||||||
pass
|
else:
|
||||||
|
return DocumentConverterResult(audio_stream=file_stream, stream_info=stream_info)
|
||||||
|
|
||||||
# Return the result
|
# Return the result
|
||||||
return DocumentConverterResult(markdown=md_content.strip())
|
|
||||||
|
|
|
||||||
|
|
@ -3,15 +3,8 @@ import csv
|
||||||
import io
|
import io
|
||||||
from typing import BinaryIO, Any
|
from typing import BinaryIO, Any
|
||||||
from charset_normalizer import from_bytes
|
from charset_normalizer import from_bytes
|
||||||
from ._html_converter import HtmlConverter
|
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._stream_info import StreamInfo
|
from .._schemas import StreamInfo
|
||||||
|
|
||||||
ACCEPTED_MIME_TYPE_PREFIXES = [
|
|
||||||
"text/csv",
|
|
||||||
"application/csv",
|
|
||||||
]
|
|
||||||
ACCEPTED_FILE_EXTENSIONS = [".csv"]
|
|
||||||
|
|
||||||
|
|
||||||
class CsvConverter(DocumentConverter):
|
class CsvConverter(DocumentConverter):
|
||||||
|
|
@ -19,24 +12,6 @@ class CsvConverter(DocumentConverter):
|
||||||
Converts CSV files to Markdown tables.
|
Converts CSV files to Markdown tables.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
super().__init__()
|
|
||||||
|
|
||||||
def accepts(
|
|
||||||
self,
|
|
||||||
file_stream: BinaryIO,
|
|
||||||
stream_info: StreamInfo,
|
|
||||||
**kwargs: Any, # Options to pass to the converter
|
|
||||||
) -> bool:
|
|
||||||
mimetype = (stream_info.mimetype or "").lower()
|
|
||||||
extension = (stream_info.extension or "").lower()
|
|
||||||
if extension in ACCEPTED_FILE_EXTENSIONS:
|
|
||||||
return True
|
|
||||||
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
|
|
||||||
if mimetype.startswith(prefix):
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self,
|
self,
|
||||||
file_stream: BinaryIO,
|
file_stream: BinaryIO,
|
||||||
|
|
@ -44,9 +19,6 @@ class CsvConverter(DocumentConverter):
|
||||||
**kwargs: Any, # Options to pass to the converter
|
**kwargs: Any, # Options to pass to the converter
|
||||||
) -> DocumentConverterResult:
|
) -> DocumentConverterResult:
|
||||||
# Read the file content
|
# Read the file content
|
||||||
if stream_info.charset:
|
|
||||||
content = file_stream.read().decode(stream_info.charset)
|
|
||||||
else:
|
|
||||||
content = str(from_bytes(file_stream.read()).best())
|
content = str(from_bytes(file_stream.read()).best())
|
||||||
|
|
||||||
# Parse CSV content
|
# Parse CSV content
|
||||||
|
|
|
||||||
|
|
@ -5,24 +5,8 @@ from typing import BinaryIO, Any
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
from ..converter_utils.docx.pre_process import pre_process_docx
|
from ..converter_utils.docx.pre_process import pre_process_docx
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._stream_info import StreamInfo
|
from .._schemas import StreamInfo, Config
|
||||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
|
||||||
|
|
||||||
# Try loading optional (but in this case, required) dependencies
|
|
||||||
# Save reporting of any exceptions for later
|
|
||||||
_dependency_exc_info = None
|
|
||||||
try:
|
|
||||||
import mammoth
|
import mammoth
|
||||||
except ImportError:
|
|
||||||
# Preserve the error and stack trace for later
|
|
||||||
_dependency_exc_info = sys.exc_info()
|
|
||||||
|
|
||||||
|
|
||||||
ACCEPTED_MIME_TYPE_PREFIXES = [
|
|
||||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
||||||
]
|
|
||||||
|
|
||||||
ACCEPTED_FILE_EXTENSIONS = [".docx"]
|
|
||||||
|
|
||||||
|
|
||||||
class DocxConverter(HtmlConverter):
|
class DocxConverter(HtmlConverter):
|
||||||
|
|
@ -30,27 +14,8 @@ class DocxConverter(HtmlConverter):
|
||||||
Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
|
Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, config: Config):
|
||||||
super().__init__()
|
self._html_converter = HtmlConverter(config=config)
|
||||||
self._html_converter = HtmlConverter()
|
|
||||||
|
|
||||||
def accepts(
|
|
||||||
self,
|
|
||||||
file_stream: BinaryIO,
|
|
||||||
stream_info: StreamInfo,
|
|
||||||
**kwargs: Any, # Options to pass to the converter
|
|
||||||
) -> bool:
|
|
||||||
mimetype = (stream_info.mimetype or "").lower()
|
|
||||||
extension = (stream_info.extension or "").lower()
|
|
||||||
|
|
||||||
if extension in ACCEPTED_FILE_EXTENSIONS:
|
|
||||||
return True
|
|
||||||
|
|
||||||
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
|
|
||||||
if mimetype.startswith(prefix):
|
|
||||||
return True
|
|
||||||
|
|
||||||
return False
|
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self,
|
self,
|
||||||
|
|
@ -58,23 +23,11 @@ class DocxConverter(HtmlConverter):
|
||||||
stream_info: StreamInfo,
|
stream_info: StreamInfo,
|
||||||
**kwargs: Any, # Options to pass to the converter
|
**kwargs: Any, # Options to pass to the converter
|
||||||
) -> DocumentConverterResult:
|
) -> DocumentConverterResult:
|
||||||
# Check: the dependencies
|
|
||||||
if _dependency_exc_info is not None:
|
|
||||||
raise MissingDependencyException(
|
|
||||||
MISSING_DEPENDENCY_MESSAGE.format(
|
|
||||||
converter=type(self).__name__,
|
|
||||||
extension=".docx",
|
|
||||||
feature="docx",
|
|
||||||
)
|
|
||||||
) from _dependency_exc_info[
|
|
||||||
1
|
|
||||||
].with_traceback( # type: ignore[union-attr]
|
|
||||||
_dependency_exc_info[2]
|
|
||||||
)
|
|
||||||
|
|
||||||
style_map = kwargs.get("style_map", None)
|
style_map = kwargs.get("style_map", None)
|
||||||
pre_process_stream = pre_process_docx(file_stream)
|
pre_process_stream = pre_process_docx(file_stream)
|
||||||
return self._html_converter.convert_string(
|
return self._html_converter.convert_string(
|
||||||
mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
|
mammoth.convert_to_html(
|
||||||
|
pre_process_stream, style_map=style_map).value,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@ from typing import Any, BinaryIO, Optional
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._stream_info import StreamInfo
|
from .._schemas import StreamInfo, Config
|
||||||
from ._markdownify import _CustomMarkdownify
|
from ._markdownify import _CustomMarkdownify
|
||||||
|
|
||||||
ACCEPTED_MAGIC_TYPE_PREFIXES = [
|
ACCEPTED_MAGIC_TYPE_PREFIXES = [
|
||||||
|
|
@ -19,6 +19,10 @@ ACCEPTED_FILE_CATEGORY = [
|
||||||
|
|
||||||
class HtmlConverter(DocumentConverter):
|
class HtmlConverter(DocumentConverter):
|
||||||
"""Anything with content type text/html"""
|
"""Anything with content type text/html"""
|
||||||
|
|
||||||
|
def __init__(self, config: Config):
|
||||||
|
self.config = config
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self,
|
self,
|
||||||
file_stream: BinaryIO,
|
file_stream: BinaryIO,
|
||||||
|
|
@ -27,7 +31,8 @@ class HtmlConverter(DocumentConverter):
|
||||||
) -> DocumentConverterResult:
|
) -> DocumentConverterResult:
|
||||||
# Parse the stream
|
# Parse the stream
|
||||||
encoding = "utf-8"
|
encoding = "utf-8"
|
||||||
soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
|
soup = BeautifulSoup(file_stream, "html.parser",
|
||||||
|
from_encoding=encoding)
|
||||||
|
|
||||||
# Remove javascript and style blocks
|
# Remove javascript and style blocks
|
||||||
for script in soup(["script", "style"]):
|
for script in soup(["script", "style"]):
|
||||||
|
|
@ -37,15 +42,17 @@ class HtmlConverter(DocumentConverter):
|
||||||
body_elm = soup.find("body")
|
body_elm = soup.find("body")
|
||||||
webpage_text = ""
|
webpage_text = ""
|
||||||
if body_elm:
|
if body_elm:
|
||||||
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
|
webpage_text = _CustomMarkdownify(
|
||||||
|
config=self.config, **kwargs).convert_soup(body_elm)
|
||||||
else:
|
else:
|
||||||
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
|
webpage_text = _CustomMarkdownify(
|
||||||
|
config=self.config, **kwargs).convert_soup(soup)
|
||||||
|
|
||||||
assert isinstance(webpage_text, str)
|
assert isinstance(webpage_text, str)
|
||||||
|
|
||||||
# remove leading and trailing \n
|
# remove leading and trailing \n
|
||||||
webpage_text = webpage_text.strip()
|
webpage_text = webpage_text.strip()
|
||||||
|
print(webpage_text)
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
markdown=webpage_text,
|
markdown=webpage_text,
|
||||||
title=None if soup.title is None else soup.title.string,
|
title=None if soup.title is None else soup.title.string,
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@ import markdownify
|
||||||
|
|
||||||
from typing import Any, Optional
|
from typing import Any, Optional
|
||||||
from urllib.parse import quote, unquote, urlparse, urlunparse
|
from urllib.parse import quote, unquote, urlparse, urlunparse
|
||||||
|
from .._schemas import Config
|
||||||
|
|
||||||
|
|
||||||
class _CustomMarkdownify(markdownify.MarkdownConverter):
|
class _CustomMarkdownify(markdownify.MarkdownConverter):
|
||||||
|
|
@ -15,11 +16,13 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
||||||
- Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
|
- Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, **options: Any):
|
def __init__(self, config: Config, **options: Any):
|
||||||
options["heading_style"] = options.get("heading_style", markdownify.ATX)
|
options["heading_style"] = options.get(
|
||||||
|
"heading_style", markdownify.ATX)
|
||||||
options["keep_data_uris"] = options.get("keep_data_uris", False)
|
options["keep_data_uris"] = options.get("keep_data_uris", False)
|
||||||
# Explicitly cast options to the expected type if necessary
|
# Explicitly cast options to the expected type if necessary
|
||||||
super().__init__(**options)
|
super().__init__(**options)
|
||||||
|
self.config = config
|
||||||
|
|
||||||
def convert_hn(
|
def convert_hn(
|
||||||
self,
|
self,
|
||||||
|
|
@ -58,9 +61,11 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
||||||
if href:
|
if href:
|
||||||
try:
|
try:
|
||||||
parsed_url = urlparse(href) # type: ignore
|
parsed_url = urlparse(href) # type: ignore
|
||||||
if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]: # type: ignore
|
# type: ignore
|
||||||
|
if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]:
|
||||||
return "%s%s%s" % (prefix, text, suffix)
|
return "%s%s%s" % (prefix, text, suffix)
|
||||||
href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path)))) # type: ignore
|
href = urlunparse(parsed_url._replace(
|
||||||
|
path=quote(unquote(parsed_url.path)))) # type: ignore
|
||||||
except ValueError: # It's not clear if this ever gets thrown
|
except ValueError: # It's not clear if this ever gets thrown
|
||||||
return "%s%s%s" % (prefix, text, suffix)
|
return "%s%s%s" % (prefix, text, suffix)
|
||||||
|
|
||||||
|
|
@ -95,17 +100,11 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
|
||||||
src = el.attrs.get("src", None) or ""
|
src = el.attrs.get("src", None) or ""
|
||||||
title = el.attrs.get("title", None) or ""
|
title = el.attrs.get("title", None) or ""
|
||||||
title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
|
title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
|
||||||
if (
|
|
||||||
convert_as_inline
|
|
||||||
and el.parent.name not in self.options["keep_inline_images_in"]
|
|
||||||
):
|
|
||||||
return alt
|
|
||||||
|
|
||||||
# Remove dataURIs
|
|
||||||
if src.startswith("data:") and not self.options["keep_data_uris"]:
|
|
||||||
src = src.split(",")[0] + "..."
|
|
||||||
|
|
||||||
|
if "image" in self.config.modalities:
|
||||||
return "" % (alt, src, title_part)
|
return "" % (alt, src, title_part)
|
||||||
|
else:
|
||||||
|
return alt
|
||||||
|
|
||||||
def convert_soup(self, soup: Any) -> str:
|
def convert_soup(self, soup: Any) -> str:
|
||||||
return super().convert_soup(soup) # type: ignore
|
return super().convert_soup(soup) # type: ignore
|
||||||
|
|
@ -3,7 +3,7 @@ import io
|
||||||
import base64
|
import base64
|
||||||
|
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._stream_info import StreamInfo
|
from .._schemas import StreamInfo, Config
|
||||||
|
|
||||||
import fitz
|
import fitz
|
||||||
|
|
||||||
|
|
@ -13,6 +13,9 @@ class PdfConverter(DocumentConverter):
|
||||||
Converts PDFs to Markdown with embedded images.
|
Converts PDFs to Markdown with embedded images.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def __init__(self, config: Config):
|
||||||
|
self.config = config
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self,
|
self,
|
||||||
file_stream: BinaryIO,
|
file_stream: BinaryIO,
|
||||||
|
|
@ -36,7 +39,7 @@ class PdfConverter(DocumentConverter):
|
||||||
|
|
||||||
# Extract images from the page
|
# Extract images from the page
|
||||||
image_list = page.get_images(full=True)
|
image_list = page.get_images(full=True)
|
||||||
|
if 'image' in self.config.modalities:
|
||||||
for img_index, img_info in enumerate(image_list):
|
for img_index, img_info in enumerate(image_list):
|
||||||
xref = img_info[0] # Get the image reference
|
xref = img_info[0] # Get the image reference
|
||||||
base_image = doc.extract_image(xref)
|
base_image = doc.extract_image(xref)
|
||||||
|
|
@ -47,16 +50,17 @@ class PdfConverter(DocumentConverter):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Convert image to base64 for markdown embedding
|
# Convert image to base64 for markdown embedding
|
||||||
img_base64 = base64.b64encode(image_bytes).decode('utf-8')
|
img_base64 = base64.b64encode(
|
||||||
|
image_bytes).decode('utf-8')
|
||||||
# Add image to markdown with a unique identifier
|
# Add image to markdown with a unique identifier
|
||||||
image_count += 1
|
image_count += 1
|
||||||
markdown_content += f"\n\n"
|
markdown_content += f"\n\n"
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
markdown_content += f"*[Error processing image {image_count}: {str(e)}]*\n\n"
|
markdown_content += f"*[Error processing image {image_count}: {str(e)}]*\n\n"
|
||||||
|
else:
|
||||||
|
markdown_content += f"{len(image_list)} images not shown here due to model not supporting image input\n\n"
|
||||||
# Close the document to free resources
|
# Close the document to free resources
|
||||||
doc.close()
|
doc.close()
|
||||||
print(markdown_content)
|
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
markdown=markdown_content,
|
markdown=markdown_content,
|
||||||
)
|
)
|
||||||
|
|
@ -1,11 +1,12 @@
|
||||||
from typing import BinaryIO, Any
|
from typing import BinaryIO, Any
|
||||||
from charset_normalizer import from_bytes
|
from charset_normalizer import from_bytes
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._stream_info import StreamInfo
|
from .._schemas import StreamInfo
|
||||||
|
|
||||||
|
|
||||||
class PlainTextConverter(DocumentConverter):
|
class PlainTextConverter(DocumentConverter):
|
||||||
"""Anything with content type text/plain"""
|
"""Anything with content type text/plain"""
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self,
|
self,
|
||||||
file_stream: BinaryIO,
|
file_stream: BinaryIO,
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,7 @@ from operator import attrgetter
|
||||||
|
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._stream_info import StreamInfo
|
from .._schemas import StreamInfo, Config
|
||||||
import pptx
|
import pptx
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -26,9 +26,9 @@ class PptxConverter(DocumentConverter):
|
||||||
Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
|
Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, config: Config):
|
||||||
super().__init__()
|
self._html_converter = HtmlConverter(config=config)
|
||||||
self._html_converter = HtmlConverter()
|
self.config = config
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self,
|
self,
|
||||||
|
|
@ -58,7 +58,8 @@ class PptxConverter(DocumentConverter):
|
||||||
|
|
||||||
# Also grab any description embedded in the deck
|
# Also grab any description embedded in the deck
|
||||||
try:
|
try:
|
||||||
alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
|
alt_text = shape._element._nvXxPr.cNvPr.attrib.get(
|
||||||
|
"descr", "")
|
||||||
except Exception:
|
except Exception:
|
||||||
# Unable to get alt text
|
# Unable to get alt text
|
||||||
pass
|
pass
|
||||||
|
|
@ -69,16 +70,20 @@ class PptxConverter(DocumentConverter):
|
||||||
alt_text = re.sub(r"\s+", " ", alt_text).strip()
|
alt_text = re.sub(r"\s+", " ", alt_text).strip()
|
||||||
|
|
||||||
# If keep_data_uris is True, use base64 encoding for images
|
# If keep_data_uris is True, use base64 encoding for images
|
||||||
|
if 'image' in self.config.modalities:
|
||||||
blob = shape.image.blob
|
blob = shape.image.blob
|
||||||
content_type = shape.image.content_type or "image/png"
|
content_type = shape.image.content_type or "image/png"
|
||||||
b64_string = base64.b64encode(blob).decode("utf-8")
|
b64_string = base64.b64encode(blob).decode("utf-8")
|
||||||
md_content += f"\n\n"
|
md_content += f"\n\n"
|
||||||
|
else:
|
||||||
|
filename = re.sub(r"\W", "", shape.name) + ".jpg"
|
||||||
|
md_content += "\n\n"
|
||||||
|
|
||||||
# Tables
|
# Tables
|
||||||
if self._is_table(shape):
|
if self._is_table(shape):
|
||||||
md_content += self._convert_table_to_markdown(shape.table, **kwargs)
|
md_content += self._convert_table_to_markdown(
|
||||||
|
shape.table, **kwargs)
|
||||||
|
|
||||||
# Charts
|
# Charts
|
||||||
if shape.has_chart:
|
if shape.has_chart:
|
||||||
|
|
@ -93,7 +98,8 @@ class PptxConverter(DocumentConverter):
|
||||||
|
|
||||||
# Group Shapes
|
# Group Shapes
|
||||||
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP:
|
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP:
|
||||||
sorted_shapes = sorted(shape.shapes, key=attrgetter("top", "left"))
|
sorted_shapes = sorted(
|
||||||
|
shape.shapes, key=attrgetter("top", "left"))
|
||||||
for subshape in sorted_shapes:
|
for subshape in sorted_shapes:
|
||||||
get_shape_content(subshape, **kwargs)
|
get_shape_content(subshape, **kwargs)
|
||||||
|
|
||||||
|
|
@ -141,7 +147,8 @@ class PptxConverter(DocumentConverter):
|
||||||
html_table += "</table></body></html>"
|
html_table += "</table></body></html>"
|
||||||
|
|
||||||
return (
|
return (
|
||||||
self._html_converter.convert_string(html_table, **kwargs).markdown.strip()
|
self._html_converter.convert_string(
|
||||||
|
html_table, **kwargs).markdown.strip()
|
||||||
+ "\n"
|
+ "\n"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,36 +1,8 @@
|
||||||
import sys
|
|
||||||
from typing import BinaryIO, Any
|
from typing import BinaryIO, Any
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
from .._schemas import StreamInfo, Config
|
||||||
from .._stream_info import StreamInfo
|
|
||||||
|
|
||||||
# Try loading optional (but in this case, required) dependencies
|
|
||||||
# Save reporting of any exceptions for later
|
|
||||||
_xlsx_dependency_exc_info = None
|
|
||||||
try:
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import openpyxl
|
|
||||||
except ImportError:
|
|
||||||
_xlsx_dependency_exc_info = sys.exc_info()
|
|
||||||
|
|
||||||
_xls_dependency_exc_info = None
|
|
||||||
try:
|
|
||||||
import pandas as pd
|
|
||||||
import xlrd
|
|
||||||
except ImportError:
|
|
||||||
_xls_dependency_exc_info = sys.exc_info()
|
|
||||||
|
|
||||||
ACCEPTED_XLSX_MIME_TYPE_PREFIXES = [
|
|
||||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
|
||||||
]
|
|
||||||
ACCEPTED_XLSX_FILE_EXTENSIONS = [".xlsx"]
|
|
||||||
|
|
||||||
ACCEPTED_XLS_MIME_TYPE_PREFIXES = [
|
|
||||||
"application/vnd.ms-excel",
|
|
||||||
"application/excel",
|
|
||||||
]
|
|
||||||
ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]
|
|
||||||
|
|
||||||
|
|
||||||
class XlsxConverter(DocumentConverter):
|
class XlsxConverter(DocumentConverter):
|
||||||
|
|
@ -38,27 +10,8 @@ class XlsxConverter(DocumentConverter):
|
||||||
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
|
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, config: Config):
|
||||||
super().__init__()
|
self._html_converter = HtmlConverter(config=config)
|
||||||
self._html_converter = HtmlConverter()
|
|
||||||
|
|
||||||
def accepts(
|
|
||||||
self,
|
|
||||||
file_stream: BinaryIO,
|
|
||||||
stream_info: StreamInfo,
|
|
||||||
**kwargs: Any, # Options to pass to the converter
|
|
||||||
) -> bool:
|
|
||||||
mimetype = (stream_info.mimetype or "").lower()
|
|
||||||
extension = (stream_info.extension or "").lower()
|
|
||||||
|
|
||||||
if extension in ACCEPTED_XLSX_FILE_EXTENSIONS:
|
|
||||||
return True
|
|
||||||
|
|
||||||
for prefix in ACCEPTED_XLSX_MIME_TYPE_PREFIXES:
|
|
||||||
if mimetype.startswith(prefix):
|
|
||||||
return True
|
|
||||||
|
|
||||||
return False
|
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self,
|
self,
|
||||||
|
|
@ -66,20 +19,6 @@ class XlsxConverter(DocumentConverter):
|
||||||
stream_info: StreamInfo,
|
stream_info: StreamInfo,
|
||||||
**kwargs: Any, # Options to pass to the converter
|
**kwargs: Any, # Options to pass to the converter
|
||||||
) -> DocumentConverterResult:
|
) -> DocumentConverterResult:
|
||||||
# Check the dependencies
|
|
||||||
if _xlsx_dependency_exc_info is not None:
|
|
||||||
raise MissingDependencyException(
|
|
||||||
MISSING_DEPENDENCY_MESSAGE.format(
|
|
||||||
converter=type(self).__name__,
|
|
||||||
extension=".xlsx",
|
|
||||||
feature="xlsx",
|
|
||||||
)
|
|
||||||
) from _xlsx_dependency_exc_info[
|
|
||||||
1
|
|
||||||
].with_traceback( # type: ignore[union-attr]
|
|
||||||
_xlsx_dependency_exc_info[2]
|
|
||||||
)
|
|
||||||
|
|
||||||
sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
|
sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
|
||||||
md_content = ""
|
md_content = ""
|
||||||
for s in sheets:
|
for s in sheets:
|
||||||
|
|
@ -100,27 +39,8 @@ class XlsConverter(DocumentConverter):
|
||||||
Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
|
Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, config: Config):
|
||||||
super().__init__()
|
self._html_converter = HtmlConverter(config=config)
|
||||||
self._html_converter = HtmlConverter()
|
|
||||||
|
|
||||||
def accepts(
|
|
||||||
self,
|
|
||||||
file_stream: BinaryIO,
|
|
||||||
stream_info: StreamInfo,
|
|
||||||
**kwargs: Any, # Options to pass to the converter
|
|
||||||
) -> bool:
|
|
||||||
mimetype = (stream_info.mimetype or "").lower()
|
|
||||||
extension = (stream_info.extension or "").lower()
|
|
||||||
|
|
||||||
if extension in ACCEPTED_XLS_FILE_EXTENSIONS:
|
|
||||||
return True
|
|
||||||
|
|
||||||
for prefix in ACCEPTED_XLS_MIME_TYPE_PREFIXES:
|
|
||||||
if mimetype.startswith(prefix):
|
|
||||||
return True
|
|
||||||
|
|
||||||
return False
|
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self,
|
self,
|
||||||
|
|
@ -128,19 +48,6 @@ class XlsConverter(DocumentConverter):
|
||||||
stream_info: StreamInfo,
|
stream_info: StreamInfo,
|
||||||
**kwargs: Any, # Options to pass to the converter
|
**kwargs: Any, # Options to pass to the converter
|
||||||
) -> DocumentConverterResult:
|
) -> DocumentConverterResult:
|
||||||
# Load the dependencies
|
|
||||||
if _xls_dependency_exc_info is not None:
|
|
||||||
raise MissingDependencyException(
|
|
||||||
MISSING_DEPENDENCY_MESSAGE.format(
|
|
||||||
converter=type(self).__name__,
|
|
||||||
extension=".xls",
|
|
||||||
feature="xls",
|
|
||||||
)
|
|
||||||
) from _xls_dependency_exc_info[
|
|
||||||
1
|
|
||||||
].with_traceback( # type: ignore[union-attr]
|
|
||||||
_xls_dependency_exc_info[2]
|
|
||||||
)
|
|
||||||
|
|
||||||
sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd")
|
sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd")
|
||||||
md_content = ""
|
md_content = ""
|
||||||
|
|
|
||||||
51
packages/markitup/tests/test_files/test.csv
Normal file
51
packages/markitup/tests/test_files/test.csv
Normal file
|
|
@ -0,0 +1,51 @@
|
||||||
|
ID,Name,Age,Country,Email
|
||||||
|
1,Name_1,62,Country_1,email_1@example.com
|
||||||
|
2,Name_2,48,Country_2,email_2@example.com
|
||||||
|
3,Name_3,61,Country_3,email_3@example.com
|
||||||
|
4,Name_4,32,Country_4,email_4@example.com
|
||||||
|
5,Name_5,69,Country_5,email_5@example.com
|
||||||
|
6,Name_6,32,Country_6,email_6@example.com
|
||||||
|
7,Name_7,62,Country_7,email_7@example.com
|
||||||
|
8,Name_8,39,Country_8,email_8@example.com
|
||||||
|
9,Name_9,40,Country_9,email_9@example.com
|
||||||
|
10,Name_10,32,Country_0,email_10@example.com
|
||||||
|
11,Name_11,24,Country_1,email_11@example.com
|
||||||
|
12,Name_12,45,Country_2,email_12@example.com
|
||||||
|
13,Name_13,39,Country_3,email_13@example.com
|
||||||
|
14,Name_14,18,Country_4,email_14@example.com
|
||||||
|
15,Name_15,66,Country_5,email_15@example.com
|
||||||
|
16,Name_16,48,Country_6,email_16@example.com
|
||||||
|
17,Name_17,60,Country_7,email_17@example.com
|
||||||
|
18,Name_18,31,Country_8,email_18@example.com
|
||||||
|
19,Name_19,43,Country_9,email_19@example.com
|
||||||
|
20,Name_20,33,Country_0,email_20@example.com
|
||||||
|
21,Name_21,32,Country_1,email_21@example.com
|
||||||
|
22,Name_22,68,Country_2,email_22@example.com
|
||||||
|
23,Name_23,44,Country_3,email_23@example.com
|
||||||
|
24,Name_24,32,Country_4,email_24@example.com
|
||||||
|
25,Name_25,33,Country_5,email_25@example.com
|
||||||
|
26,Name_26,46,Country_6,email_26@example.com
|
||||||
|
27,Name_27,38,Country_7,email_27@example.com
|
||||||
|
28,Name_28,50,Country_8,email_28@example.com
|
||||||
|
29,Name_29,68,Country_9,email_29@example.com
|
||||||
|
30,Name_30,66,Country_0,email_30@example.com
|
||||||
|
31,Name_31,60,Country_1,email_31@example.com
|
||||||
|
32,Name_32,53,Country_2,email_32@example.com
|
||||||
|
33,Name_33,30,Country_3,email_33@example.com
|
||||||
|
34,Name_34,30,Country_4,email_34@example.com
|
||||||
|
35,Name_35,43,Country_5,email_35@example.com
|
||||||
|
36,Name_36,44,Country_6,email_36@example.com
|
||||||
|
37,Name_37,31,Country_7,email_37@example.com
|
||||||
|
38,Name_38,35,Country_8,email_38@example.com
|
||||||
|
39,Name_39,56,Country_9,email_39@example.com
|
||||||
|
40,Name_40,35,Country_0,email_40@example.com
|
||||||
|
41,Name_41,62,Country_1,email_41@example.com
|
||||||
|
42,Name_42,63,Country_2,email_42@example.com
|
||||||
|
43,Name_43,51,Country_3,email_43@example.com
|
||||||
|
44,Name_44,52,Country_4,email_44@example.com
|
||||||
|
45,Name_45,66,Country_5,email_45@example.com
|
||||||
|
46,Name_46,69,Country_6,email_46@example.com
|
||||||
|
47,Name_47,68,Country_7,email_47@example.com
|
||||||
|
48,Name_48,68,Country_8,email_48@example.com
|
||||||
|
49,Name_49,69,Country_9,email_49@example.com
|
||||||
|
50,Name_50,46,Country_0,email_50@example.com
|
||||||
|
BIN
packages/markitup/tests/test_files/test.docx
Executable file → Normal file
BIN
packages/markitup/tests/test_files/test.docx
Executable file → Normal file
Binary file not shown.
|
|
@ -173,15 +173,6 @@ wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934", size = 46018 },
|
{ url = "https://files.pythonhosted.org/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934", size = 46018 },
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "et-xmlfile"
|
|
||||||
version = "2.0.0"
|
|
||||||
source = { registry = "https://pypi.org/simple" }
|
|
||||||
sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234 }
|
|
||||||
wheels = [
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059 },
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "flatbuffers"
|
name = "flatbuffers"
|
||||||
version = "25.2.10"
|
version = "25.2.10"
|
||||||
|
|
@ -348,7 +339,6 @@ dependencies = [
|
||||||
{ name = "mammoth" },
|
{ name = "mammoth" },
|
||||||
{ name = "markdownify" },
|
{ name = "markdownify" },
|
||||||
{ name = "olefile" },
|
{ name = "olefile" },
|
||||||
{ name = "openpyxl" },
|
|
||||||
{ name = "pandas" },
|
{ name = "pandas" },
|
||||||
{ name = "pydub" },
|
{ name = "pydub" },
|
||||||
{ name = "pymupdf" },
|
{ name = "pymupdf" },
|
||||||
|
|
@ -356,7 +346,6 @@ dependencies = [
|
||||||
{ name = "python-pptx" },
|
{ name = "python-pptx" },
|
||||||
{ name = "requests" },
|
{ name = "requests" },
|
||||||
{ name = "speechrecognition" },
|
{ name = "speechrecognition" },
|
||||||
{ name = "xlrd" },
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.metadata]
|
[package.metadata]
|
||||||
|
|
@ -368,7 +357,6 @@ requires-dist = [
|
||||||
{ name = "mammoth" },
|
{ name = "mammoth" },
|
||||||
{ name = "markdownify" },
|
{ name = "markdownify" },
|
||||||
{ name = "olefile" },
|
{ name = "olefile" },
|
||||||
{ name = "openpyxl" },
|
|
||||||
{ name = "pandas" },
|
{ name = "pandas" },
|
||||||
{ name = "pydub" },
|
{ name = "pydub" },
|
||||||
{ name = "pymupdf", specifier = ">=1.25.5" },
|
{ name = "pymupdf", specifier = ">=1.25.5" },
|
||||||
|
|
@ -376,7 +364,6 @@ requires-dist = [
|
||||||
{ name = "python-pptx" },
|
{ name = "python-pptx" },
|
||||||
{ name = "requests" },
|
{ name = "requests" },
|
||||||
{ name = "speechrecognition" },
|
{ name = "speechrecognition" },
|
||||||
{ name = "xlrd" },
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
|
@ -492,18 +479,6 @@ wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/3a/72/5ff85c540fd6a465610ce47e4cee8fccb472952fc1d589112f51ae2520a5/onnxruntime-1.21.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5c9e4571ff5b2a5d377d414bc85cd9450ba233a9a92f766493874f1093976453", size = 15990556 },
|
{ url = "https://files.pythonhosted.org/packages/3a/72/5ff85c540fd6a465610ce47e4cee8fccb472952fc1d589112f51ae2520a5/onnxruntime-1.21.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5c9e4571ff5b2a5d377d414bc85cd9450ba233a9a92f766493874f1093976453", size = 15990556 },
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "openpyxl"
|
|
||||||
version = "3.1.5"
|
|
||||||
source = { registry = "https://pypi.org/simple" }
|
|
||||||
dependencies = [
|
|
||||||
{ name = "et-xmlfile" },
|
|
||||||
]
|
|
||||||
sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464 }
|
|
||||||
wheels = [
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910 },
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "packaging"
|
name = "packaging"
|
||||||
version = "25.0"
|
version = "25.0"
|
||||||
|
|
@ -847,15 +822,6 @@ wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/6b/11/cc635220681e93a0183390e26485430ca2c7b5f9d33b15c74c2861cb8091/urllib3-2.4.0-py3-none-any.whl", hash = "sha256:4e16665048960a0900c702d4a66415956a584919c03361cac9f1df5c5dd7e813", size = 128680 },
|
{ url = "https://files.pythonhosted.org/packages/6b/11/cc635220681e93a0183390e26485430ca2c7b5f9d33b15c74c2861cb8091/urllib3-2.4.0-py3-none-any.whl", hash = "sha256:4e16665048960a0900c702d4a66415956a584919c03361cac9f1df5c5dd7e813", size = 128680 },
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "xlrd"
|
|
||||||
version = "2.0.1"
|
|
||||||
source = { registry = "https://pypi.org/simple" }
|
|
||||||
sdist = { url = "https://files.pythonhosted.org/packages/a6/b3/19a2540d21dea5f908304375bd43f5ed7a4c28a370dc9122c565423e6b44/xlrd-2.0.1.tar.gz", hash = "sha256:f72f148f54442c6b056bf931dbc34f986fd0c3b0b6b5a58d013c9aef274d0c88", size = 100259 }
|
|
||||||
wheels = [
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/a6/0c/c2a72d51fe56e08a08acc85d13013558a2d793028ae7385448a6ccdfae64/xlrd-2.0.1-py2.py3-none-any.whl", hash = "sha256:6a33ee89877bd9abc1158129f6e94be74e2679636b8a205b43b85206c3f0bbdd", size = 96531 },
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "xlsxwriter"
|
name = "xlsxwriter"
|
||||||
version = "3.2.3"
|
version = "3.2.3"
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue