From 993b5fc25873b73e2fb75848fc789ec5fb4f4e0e Mon Sep 17 00:00:00 2001 From: abdeladim-s Date: Fri, 20 Dec 2024 17:13:30 -0500 Subject: [PATCH] feat(converter): add video converter. --- src/markitdown/_markitdown.py | 145 +++++++++++++++++++++++++++++++++- 1 file changed, 144 insertions(+), 1 deletion(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 040a586..8000afc 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -495,7 +495,9 @@ class YouTubeConverter(DocumentConverter): "youtube_transcript_languages", ("en",) ) # Must be a single transcript. - transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=youtube_transcript_languages) # type: ignore + transcript = YouTubeTranscriptApi.get_transcript( + video_id, languages=youtube_transcript_languages + ) # type: ignore transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore # Alternative formatting: # formatter = TextFormatter() @@ -1075,6 +1077,146 @@ class ImageConverter(MediaConverter): return response.choices[0].message.content +class VideoConverter(WavConverter): + """ + Converts videos to markdown via: + * extraction of metadata (if `exiftool` is installed) + * speech transcription (if `speech_recognition` AND `pydub` are installed). + * summary via a multimodal LLM if a transcription is available and a llm_client is configured + """ + + def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: + """ + Convert a video to markdown + + Args: + local_path (str): The path to the video file + metadata_exclude: A list of metadata fields to exclude from the extracted exif metadata + metadata_title: The title of the metadata section + transcribe: Whether to transcribe the video + transcript_title: The title of the transcript section + llm_summary: Whether to generate a summary via the provided multimodal LLM client + llm_summary_title: The title of the summary section + """ + + mime_type = mimetypes.guess_type(local_path)[0] + if mime_type is None or not mime_type.startswith("video/"): + return None + + md_content = "" + + # Add metadata, let the user exclude metadata they don't want + metadata = self._get_metadata(local_path) + # Exclude these metadat by default (but allow the user to override) + # Maybe this should be moved to somewhere else + DEFAULTS_METADATA_EXCLUDE = [ + "SourceFile", + "ExifToolVersion", + "Directory", + "FileModifyDate", + "FileAccessDate", + "FileInodeChangeDate", + "FilePermissions", + ] + metadata_exclude = kwargs.get("metadata_exclude", DEFAULTS_METADATA_EXCLUDE) + metadata_title = kwargs.get("metadata_title", "### Metadata:\n") + if metadata_title is not None: + md_content += metadata_title + for f in metadata: + if not f in metadata_exclude: + md_content += f"{f}: {metadata[f]}\n" + + # Transcribe + transcribe = kwargs.get("transcribe", True) + transcript = "" + if transcribe and IS_AUDIO_TRANSCRIPTION_CAPABLE: + handle, temp_path = tempfile.mkstemp(suffix=".wav") + os.close(handle) + try: + sound = pydub.AudioSegment.from_file(local_path) + with open(temp_path, "wb") as f: + sound.export(f, format="wav") + _args = dict() + _args.update(kwargs) + _args["file_extension"] = ".wav" + + transcript_title = kwargs.get( + "transcript_title", "\n\n### Transcript:\n" + ) + try: + transcript = super()._transcribe_audio(temp_path).strip() + md_content += transcript_title + ( + "[No speech detected]" if transcript == "" else transcript + ) + except Exception: + transcript_error = kwargs.get( + "transcript_error", "Error. Could not transcribe." + ) + md_content += f"{transcript_title}{transcript_error}" + + finally: + os.unlink(temp_path) + + # LLM analysis (Optional) / not all LLMs are fully capable of analyzing video files yet, + # But for now we can use the transcript to get a summary of its content + llm_summary = kwargs.get("llm_summary", True) + llm_client = kwargs.get("llm_client") + llm_model = kwargs.get("llm_model") + if llm_summary and llm_client is not None and llm_model is not None: + if not transcribe: + print("Error: LLM summary requires transcription to be enabled.") + elif transcript == "": + print("Warning: No transcript found. Skipping LLM summary.") + else: + llm_summary_title = kwargs.get( + "llm_summary_title", "\n\n### Video Summary:\n" + ) + md_content += ( + llm_summary_title + + self._get_llm_video_summary_from_transcript( + transcript, + llm_client, + llm_model, + prompt=kwargs.get("llm_prompt"), + ) + ) + + # Return the result + return DocumentConverterResult( + title=None, + text_content=md_content.strip(), + ) + + def _get_llm_video_summary_from_transcript( + self, transcript, client, model, prompt=None + ) -> str: + """ + helper function to get a summary of the video content from the transcript + + Args: + transcript: the transcript of the video + client: the llm client + model: the llm model + prompt: the prompt to use + Returns: the summary + """ + if prompt is None or prompt.strip() == "": + prompt = "The following is video transcript, based on it, write a summary of the video content:\n" + + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + {"type": "text", "text": transcript}, + ], + } + ] + + response = client.chat.completions.create(model=model, messages=messages) + return response.choices[0].message.content + + class ZipConverter(DocumentConverter): """Converts ZIP files to markdown by extracting and converting all contained files. @@ -1281,6 +1423,7 @@ class MarkItDown: self.register_page_converter(WavConverter()) self.register_page_converter(Mp3Converter()) self.register_page_converter(ImageConverter()) + self.register_page_converter(VideoConverter()) self.register_page_converter(IpynbConverter()) self.register_page_converter(PdfConverter()) self.register_page_converter(ZipConverter())