fix: improve YouTube transcript extraction reliability
This commit is contained in:
parent
8363f419ab
commit
f712b63bf3
1 changed file with 5 additions and 3 deletions
|
|
@@ -108,8 +108,7 @@ class YouTubeConverter(DocumentConverter):
|
||||||
transcript_text = ""
|
transcript_text = ""
|
||||||
parsed_url = urlparse(url) # type: ignore
|
parsed_url = urlparse(url) # type: ignore
|
||||||
params = parse_qs(parsed_url.query) # type: ignore
|
params = parse_qs(parsed_url.query) # type: ignore
|
||||||
if "v" in params:
|
if "v" in params and params["v"][0]:
|
||||||
assert isinstance(params["v"][0], str)
|
|
||||||
video_id = str(params["v"][0])
|
video_id = str(params["v"][0])
|
||||||
try:
|
try:
|
||||||
youtube_transcript_languages = kwargs.get(
|
youtube_transcript_languages = kwargs.get(
|
||||||
|
|
@@ -119,7 +118,10 @@ class YouTubeConverter(DocumentConverter):
|
||||||
transcript = YouTubeTranscriptApi.get_transcript(
|
transcript = YouTubeTranscriptApi.get_transcript(
|
||||||
video_id, languages=youtube_transcript_languages
|
video_id, languages=youtube_transcript_languages
|
||||||
) # type: ignore
|
) # type: ignore
|
||||||
transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore
|
if transcript:
|
||||||
|
transcript_text = " ".join(
|
||||||
|
[part["text"] for part in transcript]
|
||||||
|
) # type: ignore
|
||||||
# Alternative formatting:
|
# Alternative formatting:
|
||||||
# formatter = TextFormatter()
|
# formatter = TextFormatter()
|
||||||
# formatter.format_transcript(transcript)
|
# formatter.format_transcript(transcript)
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue