finished audio transcription

This commit is contained in:
rong-xyz 2025-04-22 09:30:07 +00:00
parent 03f3fa9829
commit f33a0ed922
10 changed files with 105 additions and 135 deletions

View file

@ -9,7 +9,6 @@ from ._markitup import (
from ._base_converter import DocumentConverterResult, DocumentConverter from ._base_converter import DocumentConverterResult, DocumentConverter
from ._schemas import StreamInfo, Config from ._schemas import StreamInfo, Config
from ._exceptions import ( from ._exceptions import (
MarkItUpException,
MissingDependencyException, MissingDependencyException,
FailedConversionAttempt, FailedConversionAttempt,
FileConversionException, FileConversionException,
@ -21,7 +20,6 @@ __all__ = [
"MarkItUp", "MarkItUp",
"DocumentConverter", "DocumentConverter",
"DocumentConverterResult", "DocumentConverterResult",
"MarkItUpException",
"MissingDependencyException", "MissingDependencyException",
"FailedConversionAttempt", "FailedConversionAttempt",
"FileConversionException", "FileConversionException",

View file

@ -4,6 +4,7 @@ from warnings import warn
from typing import Any, Union, BinaryIO, Optional, List, Dict from typing import Any, Union, BinaryIO, Optional, List, Dict
from ._schemas import StreamInfo from ._schemas import StreamInfo
import re import re
import base64
class DocumentConverterResult: class DocumentConverterResult:
@ -11,9 +12,11 @@ class DocumentConverterResult:
def __init__( def __init__(
self, self,
markdown: str, markdown: str = "",
*, *,
title: Optional[str] = None, title: Optional[str] = None,
audio_stream: Optional[BinaryIO] = None,
stream_info: Optional[StreamInfo] = None,
): ):
""" """
Initialize the DocumentConverterResult. Initialize the DocumentConverterResult.
@ -26,7 +29,9 @@ class DocumentConverterResult:
- title: Optional title of the document. - title: Optional title of the document.
""" """
self.markdown = markdown self.markdown = markdown
self.audio_stream = audio_stream
self.title = title self.title = title
self.stream_info = stream_info
def to_llm(self) -> List[Dict[str, Any]]: def to_llm(self) -> List[Dict[str, Any]]:
""" """
@ -79,7 +84,14 @@ class DocumentConverterResult:
"type": "text", "type": "text",
"text": text_chunk "text": text_chunk
}) })
if self.audio_stream:
audio_b64 = base64.b64encode(
self.audio_stream.read()).decode('utf-8')
content.append({
"type": "media",
"mime_type": self.stream_info.magic_type,
"data": audio_b64
})
return content return content
@property @property

View file

@ -8,15 +8,7 @@ MISSING_DEPENDENCY_MESSAGE = """{converter} recognized the input as a potential
* etc.""" * etc."""
class MarkItUpException(Exception): class MissingDependencyException(Exception):
"""
Base exception class for MarkItUp.
"""
pass
class MissingDependencyException(MarkItUpException):
""" """
Converters shipped with MarkItUp may depend on optional Converters shipped with MarkItUp may depend on optional
dependencies. This exception is thrown when a converter's dependencies. This exception is thrown when a converter's
@ -31,7 +23,7 @@ class MissingDependencyException(MarkItUpException):
pass pass
class UnsupportedFormatException(MarkItUpException): class UnsupportedFormatException(Exception):
""" """
Thrown when no suitable converter was found for the given file. Thrown when no suitable converter was found for the given file.
""" """
@ -39,17 +31,16 @@ class UnsupportedFormatException(MarkItUpException):
pass pass
class FailedConversionAttempt(object): class FailedConversionAttempt(Exception):
""" """
Represents an a single attempt to convert a file. Represents a single attempt to convert a file.
""" """
def __init__(self, converter: Any, exc_info: Optional[tuple] = None): def __init__(self):
self.converter = converter super().__init__(f"Conversion attempt failed!")
self.exc_info = exc_info
class FileConversionException(MarkItUpException): class FileConversionException(Exception):
""" """
Thrown when a suitable converter was found, but the conversion Thrown when a suitable converter was found, but the conversion
process fails for any reason. process fails for any reason.

View file

@ -14,7 +14,7 @@ from .converters import (
XlsxConverter, XlsxConverter,
XlsConverter, XlsConverter,
PptxConverter, PptxConverter,
# AudioConverter, AudioConverter,
CsvConverter, CsvConverter,
) )
@ -53,9 +53,11 @@ class MarkItUp:
case "text": case "text":
return PlainTextConverter().convert(stream, stream_info), stream_info return PlainTextConverter().convert(stream, stream_info), stream_info
case "pptx": case "pptx":
return PptxConverter().convert(stream, stream_info), stream_info return PptxConverter(config=self.config).convert(stream, stream_info), stream_info
case "pdf": case "pdf":
return PdfConverter().convert(stream, stream_info), stream_info return PdfConverter(config=self.config).convert(stream, stream_info), stream_info
case "audio":
return AudioConverter(config=self.config).convert(stream, stream_info), stream_info
except FailedConversionAttempt: except FailedConversionAttempt:
raise FileConversionException( raise FileConversionException(
f"Failed to convert file of type {stream_info.magic_type}") f"Failed to convert file of type {stream_info.magic_type}")

View file

@ -10,6 +10,6 @@ class StreamInfo:
@dataclass @dataclass
class Config: class Config:
modality: List[Literal["image", "audio"]] = field( modalities: List[Literal["image", "audio"]] = field(
default_factory=lambda: ["image", "audio"] default_factory=lambda: ["image", "audio"]
) )

View file

@ -2,6 +2,10 @@ import os
from io import BytesIO from io import BytesIO
from markitup._schemas import StreamInfo from markitup._schemas import StreamInfo
import magic import magic
import speech_recognition as sr
import pydub
import io
from typing import BinaryIO
def read_files_to_bytestreams(folder_path="packages/markitup/tests/test_files"): def read_files_to_bytestreams(folder_path="packages/markitup/tests/test_files"):
@ -100,3 +104,25 @@ def detect_file_types(file_dict):
byte_stream.seek(original_position) byte_stream.seek(original_position)
return result return result
def transcribe_audio(file_stream: BinaryIO, *, magic_type: str = "audio/mpeg") -> str:
    """
    Transcribe the speech contained in an audio stream.

    Parameters:
    - file_stream: Binary stream holding the audio data.
    - magic_type: MIME type of the stream. Only "audio/mpeg" (MP3) and
      "audio/x-wav" (WAV) are supported.

    Returns:
    - The recognized transcript, or "[No speech detected]" when the
      recognizer produced no text or could not understand the audio.

    Raises:
    - ValueError: if `magic_type` is not one of the supported MIME types.
    """
    audio_format = {"audio/mpeg": "mp3", "audio/x-wav": "wav"}.get(magic_type)
    if audio_format is None:
        raise ValueError(f"Unsupported audio format: {magic_type}")

    if audio_format == "mp3":
        # MP3 input is re-encoded to an in-memory WAV before it is handed
        # to sr.AudioFile; WAV input is passed through untouched.
        audio_segment = pydub.AudioSegment.from_file(
            file_stream, format=audio_format)
        audio_source = io.BytesIO()
        audio_segment.export(audio_source, format="wav")
        audio_source.seek(0)
    else:
        audio_source = file_stream

    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_source) as source:
        audio = recognizer.record(source)

    try:
        transcript = recognizer.recognize_google(audio).strip()
    except sr.UnknownValueError:
        # recognize_google signals unintelligible/silent audio by raising
        # rather than returning an empty string; report it the same way
        # as an empty transcript instead of letting the error escape.
        return "[No speech detected]"
    return transcript or "[No speech detected]"

View file

@ -8,7 +8,7 @@ from ._pdf_converter import PdfConverter
from ._docx_converter import DocxConverter from ._docx_converter import DocxConverter
from ._xlsx_converter import XlsxConverter, XlsConverter from ._xlsx_converter import XlsxConverter, XlsConverter
from ._pptx_converter import PptxConverter from ._pptx_converter import PptxConverter
# from ._audio_converter import AudioConverter from ._audio_converter import AudioConverter
from ._csv_converter import CsvConverter from ._csv_converter import CsvConverter
from ._markdownify import _CustomMarkdownify from ._markdownify import _CustomMarkdownify
@ -27,7 +27,7 @@ __all__ = [
"XlsConverter", "XlsConverter",
"PptxConverter", "PptxConverter",
"ImageConverter", "ImageConverter",
# "AudioConverter", "AudioConverter",
"OutlookMsgConverter", "OutlookMsgConverter",
"ZipConverter", "ZipConverter",
"DocumentIntelligenceConverter", "DocumentIntelligenceConverter",

View file

@ -1,23 +1,10 @@
import io import io
from typing import Any, BinaryIO, Optional from typing import Any, BinaryIO, Optional, Tuple
from ._exiftool import exiftool_metadata
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._schemas import StreamInfo from .._schemas import StreamInfo, Config
from .._exceptions import MissingDependencyException from .._exceptions import MissingDependencyException
from ..converter_utils.utils import transcribe_audio
ACCEPTED_MIME_TYPE_PREFIXES = [
"audio/x-wav",
"audio/mpeg",
"video/mp4",
]
ACCEPTED_FILE_EXTENSIONS = [
".wav",
".mp3",
".m4a",
".mp4",
]
class AudioConverter(DocumentConverter): class AudioConverter(DocumentConverter):
@ -25,78 +12,25 @@ class AudioConverter(DocumentConverter):
Converts audio files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed). Converts audio files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
""" """
def accepts( def __init__(self, config: Config):
self, self.config = config
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
if extension in ACCEPTED_FILE_EXTENSIONS:
return True
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return True
return False
def convert( def convert(
self, self,
file_stream: BinaryIO, file_stream: BinaryIO,
stream_info: StreamInfo, stream_info: StreamInfo,
** kwargs: Any, # Options to pass to the converter ** kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult: ) -> Tuple[DocumentConverterResult, StreamInfo]:
md_content = "" md_content = ""
# Add metadata
metadata = exiftool_metadata(
file_stream, exiftool_path=kwargs.get("exiftool_path")
)
if metadata:
for f in [
"Title",
"Artist",
"Author",
"Band",
"Album",
"Genre",
"Track",
"DateTimeOriginal",
"CreateDate",
# "Duration", -- Wrong values when read from memory
"NumChannels",
"SampleRate",
"AvgBytesPerSec",
"BitsPerSample",
]:
if f in metadata:
md_content += f"{f}: {metadata[f]}\n"
# Figure out the audio format for transcription
if stream_info.extension == ".wav" or stream_info.mimetype == "audio/x-wav":
audio_format = "wav"
elif stream_info.extension == ".mp3" or stream_info.mimetype == "audio/mpeg":
audio_format = "mp3"
elif (
stream_info.extension in [".mp4", ".m4a"]
or stream_info.mimetype == "video/mp4"
):
audio_format = "mp4"
else:
audio_format = None
# Transcribe # Transcribe
if audio_format: if 'audio' not in self.config.modalities:
try:
transcript = transcribe_audio( transcript = transcribe_audio(
file_stream, audio_format=audio_format) file_stream, magic_type=stream_info.magic_type)
if transcript: if transcript:
md_content += "\n\n### Audio Transcript:\n" + transcript md_content += "\n\n### Audio Transcript:\n" + transcript
except MissingDependencyException: return DocumentConverterResult(markdown=md_content.strip())
pass else:
return DocumentConverterResult(audio_stream=file_stream, stream_info=stream_info)
# Return the result # Return the result
return DocumentConverterResult(markdown=md_content.strip())

View file

@ -3,7 +3,7 @@ import io
import base64 import base64
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._schemas import StreamInfo from .._schemas import StreamInfo, Config
import fitz import fitz
@ -13,6 +13,9 @@ class PdfConverter(DocumentConverter):
Converts PDFs to Markdown with embedded images. Converts PDFs to Markdown with embedded images.
""" """
def __init__(self, config: Config):
self.config = config
def convert( def convert(
self, self,
file_stream: BinaryIO, file_stream: BinaryIO,
@ -36,7 +39,7 @@ class PdfConverter(DocumentConverter):
# Extract images from the page # Extract images from the page
image_list = page.get_images(full=True) image_list = page.get_images(full=True)
if 'image' in self.config.modalities:
for img_index, img_info in enumerate(image_list): for img_index, img_info in enumerate(image_list):
xref = img_info[0] # Get the image reference xref = img_info[0] # Get the image reference
base_image = doc.extract_image(xref) base_image = doc.extract_image(xref)
@ -54,10 +57,10 @@ class PdfConverter(DocumentConverter):
markdown_content += f"![Image {image_count}](data:image/{image_ext};base64,{img_base64})\n\n" markdown_content += f"![Image {image_count}](data:image/{image_ext};base64,{img_base64})\n\n"
except Exception as e: except Exception as e:
markdown_content += f"*[Error processing image {image_count}: {str(e)}]*\n\n" markdown_content += f"*[Error processing image {image_count}: {str(e)}]*\n\n"
else:
markdown_content += f"{len(image_list)} images not shown here due to model not supporting image input\n\n"
# Close the document to free resources # Close the document to free resources
doc.close() doc.close()
print(markdown_content)
return DocumentConverterResult( return DocumentConverterResult(
markdown=markdown_content, markdown=markdown_content,
) )

View file

@ -10,7 +10,7 @@ from operator import attrgetter
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._schemas import StreamInfo from .._schemas import StreamInfo, Config
import pptx import pptx
@ -26,9 +26,9 @@ class PptxConverter(DocumentConverter):
Converts PPTX files to Markdown. Supports heading, tables and images with alt text. Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
""" """
def __init__(self): def __init__(self, config: Config):
super().__init__()
self._html_converter = HtmlConverter() self._html_converter = HtmlConverter()
self.config = config
def convert( def convert(
self, self,
@ -70,11 +70,15 @@ class PptxConverter(DocumentConverter):
alt_text = re.sub(r"\s+", " ", alt_text).strip() alt_text = re.sub(r"\s+", " ", alt_text).strip()
# If keep_data_uris is True, use base64 encoding for images # If keep_data_uris is True, use base64 encoding for images
if 'image' in self.config.modalities:
blob = shape.image.blob blob = shape.image.blob
content_type = shape.image.content_type or "image/png" content_type = shape.image.content_type or "image/png"
b64_string = base64.b64encode(blob).decode("utf-8") b64_string = base64.b64encode(blob).decode("utf-8")
md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n" md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n"
else:
filename = re.sub(r"\W", "", shape.name) + ".jpg"
md_content += "\n![" + alt_text + \
"](" + filename + ")\n"
# Tables # Tables
if self._is_table(shape): if self._is_table(shape):