fix: implement retry logic for YouTube transcript fetching and fix URL decoding issue

2025-02-18 20:20:26 +01:00 · 2025-02-18 20:20:26 +01:00 · cc36fe9f0b
commit cc36fe9f0b
parent f712b63bf3
2 changed files with 34 additions and 10 deletions
--- a/packages/markitdown/src/markitdown/converters/_youtube_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_youtube_converter.py
@ -1,5 +1,7 @@
 import re
 import json
 import urllib.parse
 import time
 from typing import Any, Union, Dict, List
 from urllib.parse import parse_qs, urlparse
@ -25,6 +27,20 @@ class YouTubeConverter(DocumentConverter):
    ):
        super().__init__(priority=priority)
    def retry_operation(self, operation, retries=3, delay=2):
        """Retries the operation if it fails."""
        attempt = 0
        while attempt < retries:
            try:
                return operation()  # Attempt the operation
            except Exception as e:
                print(f"Attempt {attempt + 1} failed: {e}")
                if attempt < retries - 1:
                    time.sleep(delay)  # Wait before retrying
                attempt += 1
        # If all attempts fail, raise the last exception
        raise Exception(f"Operation failed after {retries} attempts.")
    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
@ -33,6 +49,10 @@ class YouTubeConverter(DocumentConverter):
        if extension.lower() not in [".html", ".htm"]:
            return None
        url = kwargs.get("url", "")
        url = urllib.parse.unquote(url)
        url = url.replace(r"\?", "?").replace(r"\=", "=")
        if not url.startswith("https://www.youtube.com/watch?"):
            return None
@ -57,7 +77,7 @@ class YouTubeConverter(DocumentConverter):
                        metadata[meta[a]] = content
                    break
-        # We can also try to read the full description. This is more prone to breaking, since it reaches into the page implementation
+        # Try reading the description
        try:
            for script in soup(["script"]):
                if not script.string:  # Skip empty scripts
@ -114,10 +134,14 @@ class YouTubeConverter(DocumentConverter):
                    youtube_transcript_languages = kwargs.get(
                        "youtube_transcript_languages", ("en",)
                    )
-                    # Must be a single transcript.
+                    # Retry the transcript fetching operation
-                    transcript = YouTubeTranscriptApi.get_transcript(
+                    transcript = self.retry_operation(
-                        video_id, languages=youtube_transcript_languages
+                        lambda: YouTubeTranscriptApi.get_transcript(
-                    )  # type: ignore
+                            video_id, languages=youtube_transcript_languages
                        ),
                        retries=3,  # Retry 3 times
                        delay=2,  # 2 seconds delay between retries
                    )
                    if transcript:
                        transcript_text = " ".join(
                            [part["text"] for part in transcript]
@ -125,8 +149,8 @@ class YouTubeConverter(DocumentConverter):
                    # Alternative formatting:
                    # formatter = TextFormatter()
                    # formatter.format_transcript(transcript)
-                except Exception:
+                except Exception as e:
-                    pass
+                    print(f"Error fetching transcript: {e}")
            if transcript_text:
                webpage_text += f"\n### Transcript\n{transcript_text}\n"
--- a/packages/markitdown/tests/test_markitdown.py
+++ b/packages/markitdown/tests/test_markitdown.py
@ -184,9 +184,9 @@ def test_markitdown_remote() -> None:
    # Youtube
    # TODO: This test randomly fails for some reason. Haven't been able to repro it yet. Disabling until I can debug the issue
-    # result = markitdown.convert(YOUTUBE_TEST_URL)
+    result = markitdown.convert(YOUTUBE_TEST_URL)
-    # for test_string in YOUTUBE_TEST_STRINGS:
+    for test_string in YOUTUBE_TEST_STRINGS:
-    #     assert test_string in result.text_content
+        assert test_string in result.text_content
 def test_markitdown_local() -> None: