From cc36fe9f0b1d8911a86d7452af5a601e0793789d Mon Sep 17 00:00:00 2001 From: Nima Date: Tue, 18 Feb 2025 20:20:26 +0100 Subject: [PATCH] fix: implement retry logic for YouTube transcript fetching and fix URL decoding issue --- .../converters/_youtube_converter.py | 38 +++++++++++++++---- packages/markitdown/tests/test_markitdown.py | 6 +-- 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/packages/markitdown/src/markitdown/converters/_youtube_converter.py b/packages/markitdown/src/markitdown/converters/_youtube_converter.py index 9b69802..e61b208 100644 --- a/packages/markitdown/src/markitdown/converters/_youtube_converter.py +++ b/packages/markitdown/src/markitdown/converters/_youtube_converter.py @@ -1,5 +1,7 @@ import re import json +import urllib.parse +import time from typing import Any, Union, Dict, List from urllib.parse import parse_qs, urlparse @@ -25,6 +27,20 @@ class YouTubeConverter(DocumentConverter): ): super().__init__(priority=priority) + def retry_operation(self, operation, retries=3, delay=2): + """Retries the operation if it fails.""" + attempt = 0 + while attempt < retries: + try: + return operation() # Attempt the operation + except Exception as e: + print(f"Attempt {attempt + 1} failed: {e}") + if attempt < retries - 1: + time.sleep(delay) # Wait before retrying + attempt += 1 + # If all attempts fail, raise the last exception + raise Exception(f"Operation failed after {retries} attempts.") + def convert( self, local_path: str, **kwargs: Any ) -> Union[None, DocumentConverterResult]: @@ -33,6 +49,10 @@ class YouTubeConverter(DocumentConverter): if extension.lower() not in [".html", ".htm"]: return None url = kwargs.get("url", "") + + url = urllib.parse.unquote(url) + url = url.replace(r"\?", "?").replace(r"\=", "=") + if not url.startswith("https://www.youtube.com/watch?"): return None @@ -57,7 +77,7 @@ class YouTubeConverter(DocumentConverter): metadata[meta[a]] = content break - # We can also try to read the full description. This is more prone to breaking, since it reaches into the page implementation + # Try reading the description try: for script in soup(["script"]): if not script.string: # Skip empty scripts @@ -114,10 +134,14 @@ class YouTubeConverter(DocumentConverter): youtube_transcript_languages = kwargs.get( "youtube_transcript_languages", ("en",) ) - # Must be a single transcript. - transcript = YouTubeTranscriptApi.get_transcript( - video_id, languages=youtube_transcript_languages - ) # type: ignore + # Retry the transcript fetching operation + transcript = self.retry_operation( + lambda: YouTubeTranscriptApi.get_transcript( + video_id, languages=youtube_transcript_languages + ), + retries=3, # Retry 3 times + delay=2, # 2 seconds delay between retries + ) if transcript: transcript_text = " ".join( [part["text"] for part in transcript] @@ -125,8 +149,8 @@ class YouTubeConverter(DocumentConverter): # Alternative formatting: # formatter = TextFormatter() # formatter.format_transcript(transcript) - except Exception: - pass + except Exception as e: + print(f"Error fetching transcript: {e}") if transcript_text: webpage_text += f"\n### Transcript\n{transcript_text}\n" diff --git a/packages/markitdown/tests/test_markitdown.py b/packages/markitdown/tests/test_markitdown.py index efd45ac..55afcc3 100644 --- a/packages/markitdown/tests/test_markitdown.py +++ b/packages/markitdown/tests/test_markitdown.py @@ -184,9 +184,9 @@ def test_markitdown_remote() -> None: # Youtube # TODO: This test randomly fails for some reason. Haven't been able to repro it yet. Disabling until I can debug the issue - # result = markitdown.convert(YOUTUBE_TEST_URL) - # for test_string in YOUTUBE_TEST_STRINGS: - # assert test_string in result.text_content + result = markitdown.convert(YOUTUBE_TEST_URL) + for test_string in YOUTUBE_TEST_STRINGS: + assert test_string in result.text_content def test_markitdown_local() -> None: