fix: implement retry logic for YouTube transcript fetching and fix URL decoding issue
This commit is contained in:
parent
f712b63bf3
commit
cc36fe9f0b
2 changed files with 34 additions and 10 deletions
|
|
@ -1,5 +1,7 @@
|
||||||
import re
|
import re
|
||||||
import json
|
import json
|
||||||
|
import urllib.parse
|
||||||
|
import time
|
||||||
|
|
||||||
from typing import Any, Union, Dict, List
|
from typing import Any, Union, Dict, List
|
||||||
from urllib.parse import parse_qs, urlparse
|
from urllib.parse import parse_qs, urlparse
|
||||||
|
|
@ -25,6 +27,20 @@ class YouTubeConverter(DocumentConverter):
|
||||||
):
|
):
|
||||||
super().__init__(priority=priority)
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
|
def retry_operation(self, operation, retries=3, delay=2):
    """Call ``operation`` until it succeeds or the attempts are used up.

    Args:
        operation: Zero-argument callable to invoke.
        retries: Maximum number of times ``operation`` is called.
        delay: Seconds to sleep between a failed attempt and the next one.

    Returns:
        Whatever ``operation()`` returns on its first successful call.

    Raises:
        Exception: If every attempt raised (or ``retries`` is not positive).
            The error raised by the final attempt is attached as
            ``__cause__`` so the real failure is not lost.
    """
    last_error = None
    for attempt in range(1, retries + 1):
        try:
            return operation()  # Success ends the retry loop immediately
        except Exception as e:
            last_error = e
            print(f"Attempt {attempt} failed: {e}")
            # No point sleeping after the final attempt; we are about to give up
            if attempt < retries:
                time.sleep(delay)
    # All attempts failed: keep the generic error type and message callers
    # already see, but chain the underlying failure for diagnosability.
    raise Exception(f"Operation failed after {retries} attempts.") from last_error
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, local_path: str, **kwargs: Any
|
self, local_path: str, **kwargs: Any
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
|
|
@ -33,6 +49,10 @@ class YouTubeConverter(DocumentConverter):
|
||||||
if extension.lower() not in [".html", ".htm"]:
|
if extension.lower() not in [".html", ".htm"]:
|
||||||
return None
|
return None
|
||||||
url = kwargs.get("url", "")
|
url = kwargs.get("url", "")
|
||||||
|
|
||||||
|
url = urllib.parse.unquote(url)
|
||||||
|
url = url.replace(r"\?", "?").replace(r"\=", "=")
|
||||||
|
|
||||||
if not url.startswith("https://www.youtube.com/watch?"):
|
if not url.startswith("https://www.youtube.com/watch?"):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
@ -57,7 +77,7 @@ class YouTubeConverter(DocumentConverter):
|
||||||
metadata[meta[a]] = content
|
metadata[meta[a]] = content
|
||||||
break
|
break
|
||||||
|
|
||||||
# We can also try to read the full description. This is more prone to breaking, since it reaches into the page implementation
|
# Try reading the description
|
||||||
try:
|
try:
|
||||||
for script in soup(["script"]):
|
for script in soup(["script"]):
|
||||||
if not script.string: # Skip empty scripts
|
if not script.string: # Skip empty scripts
|
||||||
|
|
@ -114,10 +134,14 @@ class YouTubeConverter(DocumentConverter):
|
||||||
youtube_transcript_languages = kwargs.get(
|
youtube_transcript_languages = kwargs.get(
|
||||||
"youtube_transcript_languages", ("en",)
|
"youtube_transcript_languages", ("en",)
|
||||||
)
|
)
|
||||||
# Must be a single transcript.
|
# Retry the transcript fetching operation
|
||||||
transcript = YouTubeTranscriptApi.get_transcript(
|
transcript = self.retry_operation(
|
||||||
video_id, languages=youtube_transcript_languages
|
lambda: YouTubeTranscriptApi.get_transcript(
|
||||||
) # type: ignore
|
video_id, languages=youtube_transcript_languages
|
||||||
|
),
|
||||||
|
retries=3, # Retry 3 times
|
||||||
|
delay=2, # 2 seconds delay between retries
|
||||||
|
)
|
||||||
if transcript:
|
if transcript:
|
||||||
transcript_text = " ".join(
|
transcript_text = " ".join(
|
||||||
[part["text"] for part in transcript]
|
[part["text"] for part in transcript]
|
||||||
|
|
@ -125,8 +149,8 @@ class YouTubeConverter(DocumentConverter):
|
||||||
# Alternative formatting:
|
# Alternative formatting:
|
||||||
# formatter = TextFormatter()
|
# formatter = TextFormatter()
|
||||||
# formatter.format_transcript(transcript)
|
# formatter.format_transcript(transcript)
|
||||||
except Exception:
|
except Exception as e:
|
||||||
pass
|
print(f"Error fetching transcript: {e}")
|
||||||
if transcript_text:
|
if transcript_text:
|
||||||
webpage_text += f"\n### Transcript\n{transcript_text}\n"
|
webpage_text += f"\n### Transcript\n{transcript_text}\n"
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -184,9 +184,9 @@ def test_markitdown_remote() -> None:
|
||||||
|
|
||||||
# Youtube
|
# Youtube
|
||||||
# TODO: This test randomly fails for some reason. Haven't been able to repro it yet. Disabling until I can debug the issue
|
# TODO: This test randomly fails for some reason. Haven't been able to repro it yet. Disabling until I can debug the issue
|
||||||
# result = markitdown.convert(YOUTUBE_TEST_URL)
|
result = markitdown.convert(YOUTUBE_TEST_URL)
|
||||||
# for test_string in YOUTUBE_TEST_STRINGS:
|
for test_string in YOUTUBE_TEST_STRINGS:
|
||||||
# assert test_string in result.text_content
|
assert test_string in result.text_content
|
||||||
|
|
||||||
|
|
||||||
def test_markitdown_local() -> None:
|
def test_markitdown_local() -> None:
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue