Merge aa4a073c24 into 125e206047

2024-12-23 19:20:11 +00:00 · 2024-12-23 19:20:11 +00:00 · 98c7a0ed23
commit 98c7a0ed23
parent 125e206047 aa4a073c24
3 changed files with 158 additions and 1 deletions
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@ -496,7 +496,9 @@ class YouTubeConverter(DocumentConverter):
                        "youtube_transcript_languages", ("en",)
                    )
                    # Must be a single transcript.
-                    transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=youtube_transcript_languages)  # type: ignore
+                    transcript = YouTubeTranscriptApi.get_transcript(
                        video_id, languages=youtube_transcript_languages
                    )  # type: ignore
                    transcript_text = " ".join([part["text"] for part in transcript])  # type: ignore
                    # Alternative formatting:
                    # formatter = TextFormatter()
@ -1076,6 +1078,146 @@ class ImageConverter(MediaConverter):
        return response.choices[0].message.content
 class VideoConverter(WavConverter):
    """
    Converts videos to markdown via:
        * extraction of metadata (if `exiftool` is installed)
        * speech transcription (if `speech_recognition` AND `pydub` are installed).
        * summary via a multimodal LLM if a transcription is available and a llm_client is configured
    """

    # Metadata fields left out of the output unless the caller supplies its
    # own `metadata_exclude` collection.
    _DEFAULT_METADATA_EXCLUDE = (
        "SourceFile",
        "ExifToolVersion",
        "Directory",
        "FileModifyDate",
        "FileAccessDate",
        "FileInodeChangeDate",
        "FilePermissions",
    )

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        """
        Convert a video to markdown

        Args:
            local_path (str): The path to the video file
            metadata_exclude: A collection of metadata fields to exclude from the extracted exif metadata
            metadata_title: The title of the metadata section (None omits the title)
            transcribe: Whether to transcribe the video (default True)
            transcript_title: The title of the transcript section (None omits the title)
            transcript_error: Text written in place of the transcript when transcription fails
            llm_summary: Whether to generate a summary via the provided multimodal LLM client (default True)
            llm_summary_title: The title of the summary section (None omits the title)
            llm_client: The LLM client used for the summary; no summary is attempted without it
            llm_model: The model name passed to the LLM client; no summary is attempted without it
            llm_prompt: Optional prompt that replaces the default summary prompt

        Returns:
            None when the MIME type guessed from `local_path` is not `video/*`,
            otherwise a DocumentConverterResult whose text holds the sections
            that could be produced.
        """
        # Imported locally so this class does not rely on a module-level import.
        import warnings

        mime_type = mimetypes.guess_type(local_path)[0]
        if mime_type is None or not mime_type.startswith("video/"):
            return None

        md_content = ""

        # Metadata section. `_get_metadata` is inherited and not defined in this
        # class; the class docstring makes exiftool optional, so guard against it
        # yielding nothing instead of iterating blindly (presumably None when
        # exiftool is unavailable -- confirm against the parent class).
        metadata = self._get_metadata(local_path)
        if metadata:
            # Exclude some metadata by default (but allow the user to override)
            metadata_exclude = kwargs.get(
                "metadata_exclude", self._DEFAULT_METADATA_EXCLUDE
            )
            metadata_title = kwargs.get("metadata_title", "### Metadata:\n")
            if metadata_title is not None:
                md_content += metadata_title
            md_content += "".join(
                f"{field}: {metadata[field]}\n"
                for field in metadata
                if field not in metadata_exclude
            )

        # Transcript section
        transcribe = kwargs.get("transcribe", True)
        transcript = ""
        if transcribe and IS_AUDIO_TRANSCRIPTION_CAPABLE:
            transcript_title = kwargs.get("transcript_title", "\n\n### Transcript:\n")
            if transcript_title is None:
                # Same convention as `metadata_title`: None means "no title".
                transcript_title = ""
            try:
                transcript = self._extract_transcript(local_path)
            except Exception:
                # Transcription is best effort: a failure while extracting the
                # audio or recognising speech must not discard the sections
                # that were already built.
                section_body = kwargs.get(
                    "transcript_error", "Error. Could not transcribe."
                )
            else:
                section_body = (
                    "[No speech detected]" if transcript == "" else transcript
                )
            md_content += f"{transcript_title}{section_body}"

        # LLM analysis (Optional) / not all LLMs are fully capable of analyzing video files yet,
        # But for now we can use the transcript to get a summary of its content
        llm_summary = kwargs.get("llm_summary", True)
        llm_client = kwargs.get("llm_client")
        llm_model = kwargs.get("llm_model")
        if llm_summary and llm_client is not None and llm_model is not None:
            # Diagnostics are raised as warnings rather than printed, so they
            # never end up mixed into whatever the caller writes to stdout.
            if not transcribe:
                warnings.warn(
                    "Error: LLM summary requires transcription to be enabled.",
                    stacklevel=2,
                )
            elif transcript == "":
                warnings.warn(
                    "Warning: No transcript found. Skipping LLM summary.",
                    stacklevel=2,
                )
            else:
                llm_summary_title = kwargs.get(
                    "llm_summary_title", "\n\n### Video Summary:\n"
                )
                if llm_summary_title is None:
                    llm_summary_title = ""
                summary = self._get_llm_video_summary_from_transcript(
                    transcript,
                    llm_client,
                    llm_model,
                    prompt=kwargs.get("llm_prompt"),
                )
                md_content += f"{llm_summary_title}{summary}"

        # Return the result
        return DocumentConverterResult(
            title=None,
            text_content=md_content.strip(),
        )

    def _extract_transcript(self, local_path) -> str:
        """
        helper function to transcribe the audio of a video file
        Args:
            local_path: the path to the video file
        Returns: the transcript with surrounding whitespace stripped ("" when
            the inherited transcriber returns only whitespace)
        Raises: whatever the audio export or the inherited `_transcribe_audio`
            raises; the temporary WAV file is removed in either case
        """
        handle, temp_path = tempfile.mkstemp(suffix=".wav")
        # Only the path is needed; the file is reopened below for writing.
        os.close(handle)
        try:
            sound = pydub.AudioSegment.from_file(local_path)
            with open(temp_path, "wb") as wav_file:
                sound.export(wav_file, format="wav")
            return super()._transcribe_audio(temp_path).strip()
        finally:
            os.unlink(temp_path)

    def _get_llm_video_summary_from_transcript(
        self, transcript, client, model, prompt=None
    ) -> str:
        """
        helper function to get a summary of the video content from the transcript
        Args:
            transcript: the transcript of the video
            client: the llm client (must expose `chat.completions.create`)
            model: the llm model
            prompt: the prompt to use; None or a blank string selects the default prompt
        Returns: the summary ("" when the response carries no text content)
        Raises: whatever the client raises; errors are not handled here
        """
        if prompt is None or prompt.strip() == "":
            prompt = "The following is video transcript, based on it, write a summary of the video content:\n"
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "text", "text": transcript},
                ],
            }
        ]
        response = client.chat.completions.create(model=model, messages=messages)
        content = response.choices[0].message.content
        # NOTE(review): OpenAI-style clients may report the message content as
        # None; return "" so the declared `str` return type holds and the
        # caller can concatenate the result safely.
        return content if content is not None else ""
 class ZipConverter(DocumentConverter):
    """Converts ZIP files to markdown by extracting and converting all contained files.
@ -1282,6 +1424,7 @@ class MarkItDown:
        self.register_page_converter(WavConverter())
        self.register_page_converter(Mp3Converter())
        self.register_page_converter(ImageConverter())
        self.register_page_converter(VideoConverter())
        self.register_page_converter(IpynbConverter())
        self.register_page_converter(PdfConverter())
        self.register_page_converter(ZipConverter())
--- a/tests/test_files/test.mp4
+++ b/tests/test_files/test.mp4
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@ -130,6 +130,12 @@ LLM_TEST_STRINGS = [
    "5bda1dd6",
 ]
 # Metadata expected for tests/test_files/test.mp4: the exiftool test asserts
 # that a "key: value" line for every entry appears in the converted text.
 VIDEO_TEST_EXIFTOOL = {
    "Title": "Sample video test for MarkItDown",
    "Comment": (
        "This is a sample video created using FFmpeg, "
        "with the voice-over generated by the Parler-TTS model."
    ),
    "ImageSize": "1280x720",
 }
 # --- Helper Functions ---
 def validate_strings(result, expected_strings, exclude_strings=None):
@ -246,6 +252,14 @@ def test_markitdown_exiftool() -> None:
        target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
        assert target in result.text_content
    # Test Video metadata
    result = markitdown.convert(
        os.path.join(TEST_FILES_DIR, "test.mp4"), transcribe=False, llm_summary=False
    )
    for key in VIDEO_TEST_EXIFTOOL:
        target = f"{key}: {VIDEO_TEST_EXIFTOOL[key]}"
        assert target in result.text_content
 def test_markitdown_deprecation() -> None:
    try: