fix: improve YouTube transcript extraction reliability

This commit is contained in:
Nima 2025-02-18 19:32:19 +01:00
parent 8363f419ab
commit f712b63bf3

View file

@@ -108,8 +108,7 @@ class YouTubeConverter(DocumentConverter):
transcript_text = "" transcript_text = ""
parsed_url = urlparse(url) # type: ignore parsed_url = urlparse(url) # type: ignore
params = parse_qs(parsed_url.query) # type: ignore params = parse_qs(parsed_url.query) # type: ignore
if "v" in params: if "v" in params and params["v"][0]:
assert isinstance(params["v"][0], str)
video_id = str(params["v"][0]) video_id = str(params["v"][0])
try: try:
youtube_transcript_languages = kwargs.get( youtube_transcript_languages = kwargs.get(
@@ -119,7 +118,10 @@ class YouTubeConverter(DocumentConverter):
transcript = YouTubeTranscriptApi.get_transcript( transcript = YouTubeTranscriptApi.get_transcript(
video_id, languages=youtube_transcript_languages video_id, languages=youtube_transcript_languages
) # type: ignore ) # type: ignore
transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore if transcript:
transcript_text = " ".join(
[part["text"] for part in transcript]
) # type: ignore
# Alternative formatting: # Alternative formatting:
# formatter = TextFormatter() # formatter = TextFormatter()
# formatter.format_transcript(transcript) # formatter.format_transcript(transcript)