This commit is contained in:
rong-xyz 2025-04-22 07:00:30 +00:00
parent e729da2b38
commit 03f3fa9829
14 changed files with 66 additions and 47 deletions

View file

@ -7,7 +7,7 @@ from ._markitup import (
MarkItUp, MarkItUp,
) )
from ._base_converter import DocumentConverterResult, DocumentConverter from ._base_converter import DocumentConverterResult, DocumentConverter
from ._stream_info import StreamInfo from ._schemas import StreamInfo, Config
from ._exceptions import ( from ._exceptions import (
MarkItUpException, MarkItUpException,
MissingDependencyException, MissingDependencyException,
@ -27,4 +27,5 @@ __all__ = [
"FileConversionException", "FileConversionException",
"UnsupportedFormatException", "UnsupportedFormatException",
"StreamInfo", "StreamInfo",
"Config"
] ]

View file

@ -2,7 +2,7 @@ import os
import tempfile import tempfile
from warnings import warn from warnings import warn
from typing import Any, Union, BinaryIO, Optional, List, Dict from typing import Any, Union, BinaryIO, Optional, List, Dict
from ._stream_info import StreamInfo from ._schemas import StreamInfo
import re import re
@ -40,7 +40,6 @@ class DocumentConverterResult:
(text and images) in their original order. (text and images) in their original order.
""" """
# Pattern to match markdown image syntax with base64 data # Pattern to match markdown image syntax with base64 data
pattern = r'!\[(.*?)\]\(data:(.*?);base64,(.*?)\)' pattern = r'!\[(.*?)\]\(data:(.*?);base64,(.*?)\)'

View file

@ -4,7 +4,7 @@ from urllib.parse import urlparse
from warnings import warn from warnings import warn
import magic import magic
from ._stream_info import StreamInfo from ._schemas import StreamInfo, Config
from .converters import ( from .converters import (
PlainTextConverter, PlainTextConverter,
@ -33,7 +33,7 @@ class MarkItUp:
def __init__(
    self,
    config: Optional[Config] = None,
):
    """Store the configuration used by this instance.

    Args:
        config: Settings for this instance. When omitted or ``None``,
            a fresh ``Config()`` is created for this instance.
    """
    # A ``Config()`` written as the parameter default is evaluated only
    # once, when the function is defined, so every instance created
    # without an argument would share (and could mutate) one object.
    # Building it here gives each instance its own configuration.
    self.config = Config() if config is None else config
@ -42,9 +42,11 @@ class MarkItUp:
# Deal with unsupported file types # Deal with unsupported file types
match stream_info.category: match stream_info.category:
case "ppt": case "ppt":
raise UnsupportedFormatException(".ppt files are not supported, try .pptx instead") raise UnsupportedFormatException(
".ppt files are not supported, try .pptx instead")
case "other": case "other":
raise UnsupportedFormatException(f"{stream_info.magic_type} files are not supported") raise UnsupportedFormatException(
f"{stream_info.magic_type} files are not supported")
try: try:
match stream_info.category: match stream_info.category:
@ -55,7 +57,8 @@ class MarkItUp:
case "pdf": case "pdf":
return PdfConverter().convert(stream, stream_info), stream_info return PdfConverter().convert(stream, stream_info), stream_info
except FailedConversionAttempt: except FailedConversionAttempt:
raise FileConversionException(f"Failed to convert file of type {stream_info.magic_type}") raise FileConversionException(
f"Failed to convert file of type {stream_info.magic_type}")
return stream_info return stream_info
def _get_stream_info(self, byte_stream: BinaryIO) -> StreamInfo: def _get_stream_info(self, byte_stream: BinaryIO) -> StreamInfo:

View file

@ -0,0 +1,15 @@
from dataclasses import dataclass, asdict, field
from typing import Optional, List, Literal
@dataclass
class StreamInfo:
    """Describe what kind of content a byte stream holds.

    Both fields are optional and default to ``None``.
    """

    # Type string for the stream; it is interpolated into the
    # "... files are not supported" / "Failed to convert ..." messages.
    magic_type: Optional[str] = field(default=None)
    # Coarse bucket the dispatcher matches on (values such as "ppt",
    # "pdf" and "other" are compared against it).
    category: Optional[str] = field(default=None)
@dataclass
class Config:
    """Settings object accepted by ``MarkItUp`` and kept as ``self.config``.

    NOTE(review): ``modality`` presumably selects which non-text content
    kinds are processed -- confirm against the code that reads it.
    """

    # A factory (rather than a plain list default) so that every Config
    # starts with its own independent ["image", "audio"] list.
    modality: List[Literal["image", "audio"]] = field(
        default_factory=lambda: list(("image", "audio"))
    )

View file

@ -1,8 +0,0 @@
from dataclasses import dataclass, asdict
from typing import Optional
@dataclass
class StreamInfo:
magic_type: Optional[str] = None
category: Optional[str] = None

View file

@ -1,6 +1,6 @@
import os import os
from io import BytesIO from io import BytesIO
from markitup._stream_info import StreamInfo from markitup._schemas import StreamInfo
import magic import magic

View file

@ -3,7 +3,7 @@ from typing import Any, BinaryIO, Optional
from ._exiftool import exiftool_metadata from ._exiftool import exiftool_metadata
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo from .._schemas import StreamInfo
from .._exceptions import MissingDependencyException from .._exceptions import MissingDependencyException
ACCEPTED_MIME_TYPE_PREFIXES = [ ACCEPTED_MIME_TYPE_PREFIXES = [
@ -91,7 +91,8 @@ class AudioConverter(DocumentConverter):
# Transcribe # Transcribe
if audio_format: if audio_format:
try: try:
transcript = transcribe_audio(file_stream, audio_format=audio_format) transcript = transcribe_audio(
file_stream, audio_format=audio_format)
if transcript: if transcript:
md_content += "\n\n### Audio Transcript:\n" + transcript md_content += "\n\n### Audio Transcript:\n" + transcript
except MissingDependencyException: except MissingDependencyException:

View file

@ -5,7 +5,7 @@ from typing import BinaryIO, Any
from charset_normalizer import from_bytes from charset_normalizer import from_bytes
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo from .._schemas import StreamInfo
ACCEPTED_MIME_TYPE_PREFIXES = [ ACCEPTED_MIME_TYPE_PREFIXES = [
"text/csv", "text/csv",

View file

@ -5,7 +5,7 @@ from typing import BinaryIO, Any
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
from ..converter_utils.docx.pre_process import pre_process_docx from ..converter_utils.docx.pre_process import pre_process_docx
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo from .._schemas import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies # Try loading optional (but in this case, required) dependencies
@ -75,6 +75,7 @@ class DocxConverter(HtmlConverter):
style_map = kwargs.get("style_map", None) style_map = kwargs.get("style_map", None)
pre_process_stream = pre_process_docx(file_stream) pre_process_stream = pre_process_docx(file_stream)
return self._html_converter.convert_string( return self._html_converter.convert_string(
mammoth.convert_to_html(pre_process_stream, style_map=style_map).value, mammoth.convert_to_html(
pre_process_stream, style_map=style_map).value,
**kwargs, **kwargs,
) )

View file

@ -3,7 +3,7 @@ from typing import Any, BinaryIO, Optional
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo from .._schemas import StreamInfo
from ._markdownify import _CustomMarkdownify from ._markdownify import _CustomMarkdownify
ACCEPTED_MAGIC_TYPE_PREFIXES = [ ACCEPTED_MAGIC_TYPE_PREFIXES = [
@ -19,6 +19,7 @@ ACCEPTED_FILE_CATEGORY = [
class HtmlConverter(DocumentConverter): class HtmlConverter(DocumentConverter):
"""Anything with content type text/html""" """Anything with content type text/html"""
def convert( def convert(
self, self,
file_stream: BinaryIO, file_stream: BinaryIO,
@ -27,7 +28,8 @@ class HtmlConverter(DocumentConverter):
) -> DocumentConverterResult: ) -> DocumentConverterResult:
# Parse the stream # Parse the stream
encoding = "utf-8" encoding = "utf-8"
soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding) soup = BeautifulSoup(file_stream, "html.parser",
from_encoding=encoding)
# Remove javascript and style blocks # Remove javascript and style blocks
for script in soup(["script", "style"]): for script in soup(["script", "style"]):

View file

@ -3,7 +3,7 @@ import io
import base64 import base64
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo from .._schemas import StreamInfo
import fitz import fitz
@ -47,7 +47,8 @@ class PdfConverter(DocumentConverter):
try: try:
# Convert image to base64 for markdown embedding # Convert image to base64 for markdown embedding
img_base64 = base64.b64encode(image_bytes).decode('utf-8') img_base64 = base64.b64encode(
image_bytes).decode('utf-8')
# Add image to markdown with a unique identifier # Add image to markdown with a unique identifier
image_count += 1 image_count += 1
markdown_content += f"![Image {image_count}](data:image/{image_ext};base64,{img_base64})\n\n" markdown_content += f"![Image {image_count}](data:image/{image_ext};base64,{img_base64})\n\n"

View file

@ -1,11 +1,12 @@
from typing import BinaryIO, Any from typing import BinaryIO, Any
from charset_normalizer import from_bytes from charset_normalizer import from_bytes
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo from .._schemas import StreamInfo
class PlainTextConverter(DocumentConverter): class PlainTextConverter(DocumentConverter):
"""Anything with content type text/plain""" """Anything with content type text/plain"""
def convert( def convert(
self, self,
file_stream: BinaryIO, file_stream: BinaryIO,

View file

@ -10,7 +10,7 @@ from operator import attrgetter
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo from .._schemas import StreamInfo
import pptx import pptx
@ -58,7 +58,8 @@ class PptxConverter(DocumentConverter):
# Also grab any description embedded in the deck # Also grab any description embedded in the deck
try: try:
alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "") alt_text = shape._element._nvXxPr.cNvPr.attrib.get(
"descr", "")
except Exception: except Exception:
# Unable to get alt text # Unable to get alt text
pass pass
@ -75,10 +76,10 @@ class PptxConverter(DocumentConverter):
b64_string = base64.b64encode(blob).decode("utf-8") b64_string = base64.b64encode(blob).decode("utf-8")
md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n" md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n"
# Tables # Tables
if self._is_table(shape): if self._is_table(shape):
md_content += self._convert_table_to_markdown(shape.table, **kwargs) md_content += self._convert_table_to_markdown(
shape.table, **kwargs)
# Charts # Charts
if shape.has_chart: if shape.has_chart:
@ -93,7 +94,8 @@ class PptxConverter(DocumentConverter):
# Group Shapes # Group Shapes
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP: if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP:
sorted_shapes = sorted(shape.shapes, key=attrgetter("top", "left")) sorted_shapes = sorted(
shape.shapes, key=attrgetter("top", "left"))
for subshape in sorted_shapes: for subshape in sorted_shapes:
get_shape_content(subshape, **kwargs) get_shape_content(subshape, **kwargs)
@ -141,7 +143,8 @@ class PptxConverter(DocumentConverter):
html_table += "</table></body></html>" html_table += "</table></body></html>"
return ( return (
self._html_converter.convert_string(html_table, **kwargs).markdown.strip() self._html_converter.convert_string(
html_table, **kwargs).markdown.strip()
+ "\n" + "\n"
) )

View file

@ -3,7 +3,7 @@ from typing import BinaryIO, Any
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
from .._stream_info import StreamInfo from .._schemas import StreamInfo
# Try loading optional (but in this case, required) dependencies # Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later # Save reporting of any exceptions for later