Merge aa4a073c24 into 125e206047

2024-12-23 19:20:11 +00:00 · 2024-12-23 19:20:11 +00:00 · 98c7a0ed23
commit 98c7a0ed23
parent 125e206047 aa4a073c24
3 changed files with 158 additions and 1 deletions
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@ -496,7 +496,9 @@ class YouTubeConverter(DocumentConverter):
                        "youtube_transcript_languages", ("en",)
                    )
                    # Must be a single transcript.
-                    transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=youtube_transcript_languages)  # type: ignore
+                    transcript = YouTubeTranscriptApi.get_transcript(
+                        video_id, languages=youtube_transcript_languages
+                    )  # type: ignore
                    transcript_text = " ".join([part["text"] for part in transcript])  # type: ignore
                    # Alternative formatting:
                    # formatter = TextFormatter()
@ -1076,6 +1078,146 @@ class ImageConverter(MediaConverter):
        return response.choices[0].message.content


+class VideoConverter(WavConverter):
+    """
+    Converts videos to markdown via:
+        * extraction of metadata (if `exiftool` is installed)
+        * speech transcription (if `speech_recognition` AND `pydub` are installed).
+        * summary via a multimodal LLM if a transcription is available and a llm_client is configured
+    """
+
+    # Exif fields left out of the metadata section unless the caller supplies
+    # its own `metadata_exclude`.
+    _DEFAULT_METADATA_EXCLUDE = (
+        "SourceFile",
+        "ExifToolVersion",
+        "Directory",
+        "FileModifyDate",
+        "FileAccessDate",
+        "FileInodeChangeDate",
+        "FilePermissions",
+    )
+
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        """
+        Convert a video to markdown
+
+        Args:
+            local_path (str): The path to the video file
+            metadata_exclude: A list of metadata fields to exclude from the extracted exif metadata
+            metadata_title: The title of the metadata section (None for no title)
+            transcribe: Whether to transcribe the video
+            transcript_title: The title of the transcript section
+            transcript_error: The text written under the transcript title when
+                audio extraction or transcription fails
+            llm_summary: Whether to generate a summary via the provided multimodal LLM client
+            llm_summary_title: The title of the summary section
+            llm_client: The LLM client used for the summary
+            llm_model: The LLM model used for the summary
+            llm_prompt: Optional prompt replacing the default summary prompt
+
+        Returns:
+            None when the MIME type guessed from `local_path` is not `video/*`,
+            otherwise a DocumentConverterResult with no title and the stripped markdown.
+        """
+        # Local import keeps this block self-contained.
+        import sys
+
+        mime_type = mimetypes.guess_type(local_path)[0]
+        if mime_type is None or not mime_type.startswith("video/"):
+            return None
+
+        md_content = self._video_metadata_section(local_path, kwargs)
+
+        # Transcribe
+        transcribe = kwargs.get("transcribe", True)
+        transcript = ""
+        if transcribe and IS_AUDIO_TRANSCRIPTION_CAPABLE:
+            transcript_title = kwargs.get(
+                "transcript_title", "\n\n### Transcript:\n"
+            )
+            try:
+                transcript = self._video_transcript(local_path)
+                md_content += transcript_title + (
+                    "[No speech detected]" if transcript == "" else transcript
+                )
+            except Exception:
+                # Best effort: a failure while extracting or transcribing the audio
+                # is reported inside the document so the metadata collected above
+                # is still returned.
+                transcript_error = kwargs.get(
+                    "transcript_error", "Error. Could not transcribe."
+                )
+                md_content += f"{transcript_title}{transcript_error}"
+
+        # LLM analysis (Optional) / not all LLMs are fully capable of analyzing video files yet,
+        # But for now we can use the transcript to get a summary of its content
+        llm_summary = kwargs.get("llm_summary", True)
+        llm_client = kwargs.get("llm_client")
+        llm_model = kwargs.get("llm_model")
+        if llm_summary and llm_client is not None and llm_model is not None:
+            # Diagnostics go to stderr so they never mix with markdown that a
+            # caller writes to stdout.
+            if not transcribe:
+                print(
+                    "Error: LLM summary requires transcription to be enabled.",
+                    file=sys.stderr,
+                )
+            elif transcript == "":
+                print(
+                    "Warning: No transcript found. Skipping LLM summary.",
+                    file=sys.stderr,
+                )
+            else:
+                llm_summary_title = kwargs.get(
+                    "llm_summary_title", "\n\n### Video Summary:\n"
+                )
+                md_content += (
+                    llm_summary_title
+                    + self._get_llm_video_summary_from_transcript(
+                        transcript,
+                        llm_client,
+                        llm_model,
+                        prompt=kwargs.get("llm_prompt"),
+                    )
+                )
+
+        # Return the result
+        return DocumentConverterResult(
+            title=None,
+            text_content=md_content.strip(),
+        )
+
+    def _video_metadata_section(self, local_path, options) -> str:
+        """
+        helper function to build the metadata section of the markdown
+
+        Args:
+            local_path (str): The path to the video file
+            options: the keyword arguments given to `convert`
+                (`metadata_exclude` and `metadata_title` are read)
+        Returns: the section text, or "" when no metadata is available
+        """
+        metadata = self._get_metadata(local_path)
+        # NOTE(review): `_get_metadata` is defined outside this class; presumably it
+        # yields nothing when exiftool is unavailable - confirm. In that case skip
+        # the section instead of failing or emitting a title with no fields.
+        if not metadata:
+            return ""
+
+        # Let the user exclude metadata they don't want
+        excluded = options.get("metadata_exclude", self._DEFAULT_METADATA_EXCLUDE)
+        title = options.get("metadata_title", "### Metadata:\n")
+        fields = [
+            f"{name}: {metadata[name]}\n" for name in metadata if name not in excluded
+        ]
+        return ("" if title is None else title) + "".join(fields)
+
+    def _video_transcript(self, local_path) -> str:
+        """
+        helper function that loads the file with pydub, exports it to a temporary
+        WAV file and transcribes that file with the inherited `_transcribe_audio`
+
+        Args:
+            local_path (str): The path to the video file
+        Returns: the transcript with surrounding whitespace stripped
+        Raises: any exception from pydub or `_transcribe_audio`; the temporary
+            file is removed in either case
+        """
+        handle, temp_path = tempfile.mkstemp(suffix=".wav")
+        # mkstemp hands back an open descriptor; close it and reopen by path below.
+        os.close(handle)
+        try:
+            sound = pydub.AudioSegment.from_file(local_path)
+            with open(temp_path, "wb") as wav_file:
+                sound.export(wav_file, format="wav")
+            return super()._transcribe_audio(temp_path).strip()
+        finally:
+            os.unlink(temp_path)
+
+    def _get_llm_video_summary_from_transcript(
+        self, transcript, client, model, prompt=None
+    ) -> str:
+        """
+        helper function to get a summary of the video content from the transcript
+
+        Args:
+            transcript: the transcript of the video
+            client: the llm client
+            model: the llm model
+            prompt: the prompt to use (a default prompt is used when None or blank)
+        Returns: the summary ("" when the response carries no text content)
+        """
+        if prompt is None or prompt.strip() == "":
+            prompt = "The following is video transcript, based on it, write a summary of the video content:\n"
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": prompt},
+                    {"type": "text", "text": transcript},
+                ],
+            }
+        ]
+
+        response = client.chat.completions.create(model=model, messages=messages)
+        summary = response.choices[0].message.content
+        # The message content may be None; keep the declared `str` return type so
+        # the caller's string concatenation cannot fail.
+        return "" if summary is None else summary
+
+
 class ZipConverter(DocumentConverter):
    """Converts ZIP files to markdown by extracting and converting all contained files.

@ -1282,6 +1424,7 @@ class MarkItDown:
        self.register_page_converter(WavConverter())
        self.register_page_converter(Mp3Converter())
        self.register_page_converter(ImageConverter())
+        self.register_page_converter(VideoConverter())
        self.register_page_converter(IpynbConverter())
        self.register_page_converter(PdfConverter())
        self.register_page_converter(ZipConverter())
--- a/tests/test_files/test.mp4
+++ b/tests/test_files/test.mp4
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@ -130,6 +130,12 @@ LLM_TEST_STRINGS = [
    "5bda1dd6",
 ]

+VIDEO_TEST_EXIFTOOL = {  # fields asserted as "key: value" in the markdown produced for test.mp4
+    "Title": "Sample video test for MarkItDown",
+    "Comment": "This is a sample video created using FFmpeg, with the voice-over generated by the Parler-TTS model.",
+    "ImageSize": "1280x720",
+}
+

 # --- Helper Functions ---
 def validate_strings(result, expected_strings, exclude_strings=None):
@ -246,6 +252,14 @@ def test_markitdown_exiftool() -> None:
        target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
        assert target in result.text_content

+    # Test Video metadata
+    result = markitdown.convert(
+        os.path.join(TEST_FILES_DIR, "test.mp4"), transcribe=False, llm_summary=False
+    )
+    for key in VIDEO_TEST_EXIFTOOL:
+        target = f"{key}: {VIDEO_TEST_EXIFTOOL[key]}"
+        assert target in result.text_content
+

 def test_markitdown_deprecation() -> None:
    try: