finished audio transcription
This commit is contained in:
parent
03f3fa9829
commit
f33a0ed922
10 changed files with 105 additions and 135 deletions
|
|
@ -9,7 +9,6 @@ from ._markitup import (
|
||||||
from ._base_converter import DocumentConverterResult, DocumentConverter
|
from ._base_converter import DocumentConverterResult, DocumentConverter
|
||||||
from ._schemas import StreamInfo, Config
|
from ._schemas import StreamInfo, Config
|
||||||
from ._exceptions import (
|
from ._exceptions import (
|
||||||
MarkItUpException,
|
|
||||||
MissingDependencyException,
|
MissingDependencyException,
|
||||||
FailedConversionAttempt,
|
FailedConversionAttempt,
|
||||||
FileConversionException,
|
FileConversionException,
|
||||||
|
|
@ -21,7 +20,6 @@ __all__ = [
|
||||||
"MarkItUp",
|
"MarkItUp",
|
||||||
"DocumentConverter",
|
"DocumentConverter",
|
||||||
"DocumentConverterResult",
|
"DocumentConverterResult",
|
||||||
"MarkItUpException",
|
|
||||||
"MissingDependencyException",
|
"MissingDependencyException",
|
||||||
"FailedConversionAttempt",
|
"FailedConversionAttempt",
|
||||||
"FileConversionException",
|
"FileConversionException",
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,7 @@ from warnings import warn
|
||||||
from typing import Any, Union, BinaryIO, Optional, List, Dict
|
from typing import Any, Union, BinaryIO, Optional, List, Dict
|
||||||
from ._schemas import StreamInfo
|
from ._schemas import StreamInfo
|
||||||
import re
|
import re
|
||||||
|
import base64
|
||||||
|
|
||||||
|
|
||||||
class DocumentConverterResult:
|
class DocumentConverterResult:
|
||||||
|
|
@ -11,9 +12,11 @@ class DocumentConverterResult:
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
markdown: str,
|
markdown: str = "",
|
||||||
*,
|
*,
|
||||||
title: Optional[str] = None,
|
title: Optional[str] = None,
|
||||||
|
audio_stream: Optional[BinaryIO] = None,
|
||||||
|
stream_info: Optional[StreamInfo] = None,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Initialize the DocumentConverterResult.
|
Initialize the DocumentConverterResult.
|
||||||
|
|
@ -26,7 +29,9 @@ class DocumentConverterResult:
|
||||||
- title: Optional title of the document.
|
- title: Optional title of the document.
|
||||||
"""
|
"""
|
||||||
self.markdown = markdown
|
self.markdown = markdown
|
||||||
|
self.audio_stream = audio_stream
|
||||||
self.title = title
|
self.title = title
|
||||||
|
self.stream_info = stream_info
|
||||||
|
|
||||||
def to_llm(self) -> List[Dict[str, Any]]:
|
def to_llm(self) -> List[Dict[str, Any]]:
|
||||||
"""
|
"""
|
||||||
|
|
@ -79,7 +84,14 @@ class DocumentConverterResult:
|
||||||
"type": "text",
|
"type": "text",
|
||||||
"text": text_chunk
|
"text": text_chunk
|
||||||
})
|
})
|
||||||
|
if self.audio_stream:
|
||||||
|
audio_b64 = base64.b64encode(
|
||||||
|
self.audio_stream.read()).decode('utf-8')
|
||||||
|
content.append({
|
||||||
|
"type": "media",
|
||||||
|
"mime_type": self.stream_info.magic_type,
|
||||||
|
"data": audio_b64
|
||||||
|
})
|
||||||
return content
|
return content
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|
@ -104,7 +116,7 @@ class DocumentConverter:
|
||||||
self,
|
self,
|
||||||
file_stream: BinaryIO,
|
file_stream: BinaryIO,
|
||||||
stream_info: StreamInfo,
|
stream_info: StreamInfo,
|
||||||
**kwargs: Any, # Options to pass to the converter
|
** kwargs: Any, # Options to pass to the converter
|
||||||
) -> DocumentConverterResult:
|
) -> DocumentConverterResult:
|
||||||
"""
|
"""
|
||||||
Convert a document to Markdown text.
|
Convert a document to Markdown text.
|
||||||
|
|
|
||||||
|
|
@ -8,15 +8,7 @@ MISSING_DEPENDENCY_MESSAGE = """{converter} recognized the input as a potential
|
||||||
* etc."""
|
* etc."""
|
||||||
|
|
||||||
|
|
||||||
class MarkItUpException(Exception):
|
class MissingDependencyException(Exception):
|
||||||
"""
|
|
||||||
Base exception class for MarkItUp.
|
|
||||||
"""
|
|
||||||
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class MissingDependencyException(MarkItUpException):
|
|
||||||
"""
|
"""
|
||||||
Converters shipped with MarkItUp may depend on optional
|
Converters shipped with MarkItUp may depend on optional
|
||||||
dependencies. This exception is thrown when a converter's
|
dependencies. This exception is thrown when a converter's
|
||||||
|
|
@ -31,7 +23,7 @@ class MissingDependencyException(MarkItUpException):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class UnsupportedFormatException(MarkItUpException):
|
class UnsupportedFormatException(Exception):
|
||||||
"""
|
"""
|
||||||
Thrown when no suitable converter was found for the given file.
|
Thrown when no suitable converter was found for the given file.
|
||||||
"""
|
"""
|
||||||
|
|
@ -39,17 +31,16 @@ class UnsupportedFormatException(MarkItUpException):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class FailedConversionAttempt(object):
|
class FailedConversionAttempt(Exception):
|
||||||
"""
|
"""
|
||||||
Represents an a single attempt to convert a file.
|
Represents a single attempt to convert a file.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, converter: Any, exc_info: Optional[tuple] = None):
|
def __init__(self):
|
||||||
self.converter = converter
|
super().__init__(f"Conversion attempt failed!")
|
||||||
self.exc_info = exc_info
|
|
||||||
|
|
||||||
|
|
||||||
class FileConversionException(MarkItUpException):
|
class FileConversionException(Exception):
|
||||||
"""
|
"""
|
||||||
Thrown when a suitable converter was found, but the conversion
|
Thrown when a suitable converter was found, but the conversion
|
||||||
process fails for any reason.
|
process fails for any reason.
|
||||||
|
|
|
||||||
|
|
@ -14,7 +14,7 @@ from .converters import (
|
||||||
XlsxConverter,
|
XlsxConverter,
|
||||||
XlsConverter,
|
XlsConverter,
|
||||||
PptxConverter,
|
PptxConverter,
|
||||||
# AudioConverter,
|
AudioConverter,
|
||||||
CsvConverter,
|
CsvConverter,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -53,9 +53,11 @@ class MarkItUp:
|
||||||
case "text":
|
case "text":
|
||||||
return PlainTextConverter().convert(stream, stream_info), stream_info
|
return PlainTextConverter().convert(stream, stream_info), stream_info
|
||||||
case "pptx":
|
case "pptx":
|
||||||
return PptxConverter().convert(stream, stream_info), stream_info
|
return PptxConverter(config=self.config).convert(stream, stream_info), stream_info
|
||||||
case "pdf":
|
case "pdf":
|
||||||
return PdfConverter().convert(stream, stream_info), stream_info
|
return PdfConverter(config=self.config).convert(stream, stream_info), stream_info
|
||||||
|
case "audio":
|
||||||
|
return AudioConverter(config=self.config).convert(stream, stream_info), stream_info
|
||||||
except FailedConversionAttempt:
|
except FailedConversionAttempt:
|
||||||
raise FileConversionException(
|
raise FileConversionException(
|
||||||
f"Failed to convert file of type {stream_info.magic_type}")
|
f"Failed to convert file of type {stream_info.magic_type}")
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,6 @@ class StreamInfo:
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Config:
|
class Config:
|
||||||
modality: List[Literal["image", "audio"]] = field(
|
modalities: List[Literal["image", "audio"]] = field(
|
||||||
default_factory=lambda: ["image", "audio"]
|
default_factory=lambda: ["image", "audio"]
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,10 @@ import os
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from markitup._schemas import StreamInfo
|
from markitup._schemas import StreamInfo
|
||||||
import magic
|
import magic
|
||||||
|
import speech_recognition as sr
|
||||||
|
import pydub
|
||||||
|
import io
|
||||||
|
from typing import BinaryIO
|
||||||
|
|
||||||
|
|
||||||
def read_files_to_bytestreams(folder_path="packages/markitup/tests/test_files"):
|
def read_files_to_bytestreams(folder_path="packages/markitup/tests/test_files"):
|
||||||
|
|
@ -100,3 +104,25 @@ def detect_file_types(file_dict):
|
||||||
byte_stream.seek(original_position)
|
byte_stream.seek(original_position)
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def transcribe_audio(file_stream: BinaryIO, *, magic_type: str = "audio/mpeg") -> str:
|
||||||
|
audio_format = 'mp3' if magic_type == 'audio/mpeg' else 'wav' if magic_type == 'audio/x-wav' else None
|
||||||
|
|
||||||
|
match audio_format:
|
||||||
|
case 'mp3':
|
||||||
|
audio_segment = pydub.AudioSegment.from_file(
|
||||||
|
file_stream, format=audio_format)
|
||||||
|
audio_source = io.BytesIO()
|
||||||
|
audio_segment.export(audio_source, format="wav")
|
||||||
|
audio_source.seek(0)
|
||||||
|
case 'wav':
|
||||||
|
audio_source = file_stream
|
||||||
|
case _:
|
||||||
|
raise ValueError(f"Unsupported audio format: {magic_type}")
|
||||||
|
|
||||||
|
recognizer = sr.Recognizer()
|
||||||
|
with sr.AudioFile(audio_source) as source:
|
||||||
|
audio = recognizer.record(source)
|
||||||
|
transcript = recognizer.recognize_google(audio).strip()
|
||||||
|
return "[No speech detected]" if transcript == "" else transcript
|
||||||
|
|
|
||||||
|
|
@ -8,7 +8,7 @@ from ._pdf_converter import PdfConverter
|
||||||
from ._docx_converter import DocxConverter
|
from ._docx_converter import DocxConverter
|
||||||
from ._xlsx_converter import XlsxConverter, XlsConverter
|
from ._xlsx_converter import XlsxConverter, XlsConverter
|
||||||
from ._pptx_converter import PptxConverter
|
from ._pptx_converter import PptxConverter
|
||||||
# from ._audio_converter import AudioConverter
|
from ._audio_converter import AudioConverter
|
||||||
from ._csv_converter import CsvConverter
|
from ._csv_converter import CsvConverter
|
||||||
from ._markdownify import _CustomMarkdownify
|
from ._markdownify import _CustomMarkdownify
|
||||||
|
|
||||||
|
|
@ -27,7 +27,7 @@ __all__ = [
|
||||||
"XlsConverter",
|
"XlsConverter",
|
||||||
"PptxConverter",
|
"PptxConverter",
|
||||||
"ImageConverter",
|
"ImageConverter",
|
||||||
# "AudioConverter",
|
"AudioConverter",
|
||||||
"OutlookMsgConverter",
|
"OutlookMsgConverter",
|
||||||
"ZipConverter",
|
"ZipConverter",
|
||||||
"DocumentIntelligenceConverter",
|
"DocumentIntelligenceConverter",
|
||||||
|
|
|
||||||
|
|
@ -1,23 +1,10 @@
|
||||||
import io
|
import io
|
||||||
from typing import Any, BinaryIO, Optional
|
from typing import Any, BinaryIO, Optional, Tuple
|
||||||
|
|
||||||
from ._exiftool import exiftool_metadata
|
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._schemas import StreamInfo
|
from .._schemas import StreamInfo, Config
|
||||||
from .._exceptions import MissingDependencyException
|
from .._exceptions import MissingDependencyException
|
||||||
|
from ..converter_utils.utils import transcribe_audio
|
||||||
ACCEPTED_MIME_TYPE_PREFIXES = [
|
|
||||||
"audio/x-wav",
|
|
||||||
"audio/mpeg",
|
|
||||||
"video/mp4",
|
|
||||||
]
|
|
||||||
|
|
||||||
ACCEPTED_FILE_EXTENSIONS = [
|
|
||||||
".wav",
|
|
||||||
".mp3",
|
|
||||||
".m4a",
|
|
||||||
".mp4",
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
class AudioConverter(DocumentConverter):
|
class AudioConverter(DocumentConverter):
|
||||||
|
|
@ -25,78 +12,25 @@ class AudioConverter(DocumentConverter):
|
||||||
Converts audio files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
|
Converts audio files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def accepts(
|
def __init__(self, config: Config):
|
||||||
self,
|
self.config = config
|
||||||
file_stream: BinaryIO,
|
|
||||||
stream_info: StreamInfo,
|
|
||||||
**kwargs: Any, # Options to pass to the converter
|
|
||||||
) -> bool:
|
|
||||||
mimetype = (stream_info.mimetype or "").lower()
|
|
||||||
extension = (stream_info.extension or "").lower()
|
|
||||||
|
|
||||||
if extension in ACCEPTED_FILE_EXTENSIONS:
|
|
||||||
return True
|
|
||||||
|
|
||||||
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
|
|
||||||
if mimetype.startswith(prefix):
|
|
||||||
return True
|
|
||||||
|
|
||||||
return False
|
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self,
|
self,
|
||||||
file_stream: BinaryIO,
|
file_stream: BinaryIO,
|
||||||
stream_info: StreamInfo,
|
stream_info: StreamInfo,
|
||||||
**kwargs: Any, # Options to pass to the converter
|
** kwargs: Any, # Options to pass to the converter
|
||||||
) -> DocumentConverterResult:
|
) -> Tuple[DocumentConverterResult, StreamInfo]:
|
||||||
md_content = ""
|
md_content = ""
|
||||||
|
|
||||||
# Add metadata
|
|
||||||
metadata = exiftool_metadata(
|
|
||||||
file_stream, exiftool_path=kwargs.get("exiftool_path")
|
|
||||||
)
|
|
||||||
if metadata:
|
|
||||||
for f in [
|
|
||||||
"Title",
|
|
||||||
"Artist",
|
|
||||||
"Author",
|
|
||||||
"Band",
|
|
||||||
"Album",
|
|
||||||
"Genre",
|
|
||||||
"Track",
|
|
||||||
"DateTimeOriginal",
|
|
||||||
"CreateDate",
|
|
||||||
# "Duration", -- Wrong values when read from memory
|
|
||||||
"NumChannels",
|
|
||||||
"SampleRate",
|
|
||||||
"AvgBytesPerSec",
|
|
||||||
"BitsPerSample",
|
|
||||||
]:
|
|
||||||
if f in metadata:
|
|
||||||
md_content += f"{f}: {metadata[f]}\n"
|
|
||||||
|
|
||||||
# Figure out the audio format for transcription
|
|
||||||
if stream_info.extension == ".wav" or stream_info.mimetype == "audio/x-wav":
|
|
||||||
audio_format = "wav"
|
|
||||||
elif stream_info.extension == ".mp3" or stream_info.mimetype == "audio/mpeg":
|
|
||||||
audio_format = "mp3"
|
|
||||||
elif (
|
|
||||||
stream_info.extension in [".mp4", ".m4a"]
|
|
||||||
or stream_info.mimetype == "video/mp4"
|
|
||||||
):
|
|
||||||
audio_format = "mp4"
|
|
||||||
else:
|
|
||||||
audio_format = None
|
|
||||||
|
|
||||||
# Transcribe
|
# Transcribe
|
||||||
if audio_format:
|
if 'audio' not in self.config.modalities:
|
||||||
try:
|
transcript = transcribe_audio(
|
||||||
transcript = transcribe_audio(
|
file_stream, magic_type=stream_info.magic_type)
|
||||||
file_stream, audio_format=audio_format)
|
if transcript:
|
||||||
if transcript:
|
md_content += "\n\n### Audio Transcript:\n" + transcript
|
||||||
md_content += "\n\n### Audio Transcript:\n" + transcript
|
return DocumentConverterResult(markdown=md_content.strip())
|
||||||
except MissingDependencyException:
|
else:
|
||||||
pass
|
return DocumentConverterResult(audio_stream=file_stream, stream_info=stream_info)
|
||||||
|
|
||||||
# Return the result
|
# Return the result
|
||||||
return DocumentConverterResult(markdown=md_content.strip())
|
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@ import io
|
||||||
import base64
|
import base64
|
||||||
|
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._schemas import StreamInfo
|
from .._schemas import StreamInfo, Config
|
||||||
|
|
||||||
import fitz
|
import fitz
|
||||||
|
|
||||||
|
|
@ -13,6 +13,9 @@ class PdfConverter(DocumentConverter):
|
||||||
Converts PDFs to Markdown with embedded images.
|
Converts PDFs to Markdown with embedded images.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def __init__(self, config: Config):
|
||||||
|
self.config = config
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self,
|
self,
|
||||||
file_stream: BinaryIO,
|
file_stream: BinaryIO,
|
||||||
|
|
@ -36,28 +39,28 @@ class PdfConverter(DocumentConverter):
|
||||||
|
|
||||||
# Extract images from the page
|
# Extract images from the page
|
||||||
image_list = page.get_images(full=True)
|
image_list = page.get_images(full=True)
|
||||||
|
if 'image' in self.config.modalities:
|
||||||
|
for img_index, img_info in enumerate(image_list):
|
||||||
|
xref = img_info[0] # Get the image reference
|
||||||
|
base_image = doc.extract_image(xref)
|
||||||
|
|
||||||
for img_index, img_info in enumerate(image_list):
|
if base_image:
|
||||||
xref = img_info[0] # Get the image reference
|
image_bytes = base_image["image"]
|
||||||
base_image = doc.extract_image(xref)
|
image_ext = base_image["ext"]
|
||||||
|
|
||||||
if base_image:
|
|
||||||
image_bytes = base_image["image"]
|
|
||||||
image_ext = base_image["ext"]
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Convert image to base64 for markdown embedding
|
|
||||||
img_base64 = base64.b64encode(
|
|
||||||
image_bytes).decode('utf-8')
|
|
||||||
# Add image to markdown with a unique identifier
|
|
||||||
image_count += 1
|
|
||||||
markdown_content += f"\n\n"
|
|
||||||
except Exception as e:
|
|
||||||
markdown_content += f"*[Error processing image {image_count}: {str(e)}]*\n\n"
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Convert image to base64 for markdown embedding
|
||||||
|
img_base64 = base64.b64encode(
|
||||||
|
image_bytes).decode('utf-8')
|
||||||
|
# Add image to markdown with a unique identifier
|
||||||
|
image_count += 1
|
||||||
|
markdown_content += f"\n\n"
|
||||||
|
except Exception as e:
|
||||||
|
markdown_content += f"*[Error processing image {image_count}: {str(e)}]*\n\n"
|
||||||
|
else:
|
||||||
|
markdown_content += f"{len(image_list)} images not shown here due to model not supporting image input\n\n"
|
||||||
# Close the document to free resources
|
# Close the document to free resources
|
||||||
doc.close()
|
doc.close()
|
||||||
print(markdown_content)
|
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
markdown=markdown_content,
|
markdown=markdown_content,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,7 @@ from operator import attrgetter
|
||||||
|
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._schemas import StreamInfo
|
from .._schemas import StreamInfo, Config
|
||||||
import pptx
|
import pptx
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -26,9 +26,9 @@ class PptxConverter(DocumentConverter):
|
||||||
Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
|
Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, config: Config):
|
||||||
super().__init__()
|
|
||||||
self._html_converter = HtmlConverter()
|
self._html_converter = HtmlConverter()
|
||||||
|
self.config = config
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self,
|
self,
|
||||||
|
|
@ -70,11 +70,15 @@ class PptxConverter(DocumentConverter):
|
||||||
alt_text = re.sub(r"\s+", " ", alt_text).strip()
|
alt_text = re.sub(r"\s+", " ", alt_text).strip()
|
||||||
|
|
||||||
# If keep_data_uris is True, use base64 encoding for images
|
# If keep_data_uris is True, use base64 encoding for images
|
||||||
|
if 'image' in self.config.modalities:
|
||||||
blob = shape.image.blob
|
blob = shape.image.blob
|
||||||
content_type = shape.image.content_type or "image/png"
|
content_type = shape.image.content_type or "image/png"
|
||||||
b64_string = base64.b64encode(blob).decode("utf-8")
|
b64_string = base64.b64encode(blob).decode("utf-8")
|
||||||
md_content += f"\n\n"
|
md_content += f"\n\n"
|
||||||
|
else:
|
||||||
|
filename = re.sub(r"\W", "", shape.name) + ".jpg"
|
||||||
|
md_content += "\n\n"
|
||||||
|
|
||||||
# Tables
|
# Tables
|
||||||
if self._is_table(shape):
|
if self._is_table(shape):
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue