Support specifying YouTube transcript language
This commit is contained in:
parent
81e3f24acd
commit
695100d5d8
1 changed file with 5 additions and 2 deletions
|
|
@@ -344,8 +344,11 @@ class YouTubeConverter(DocumentConverter):
|
||||||
assert isinstance(params["v"][0], str)
|
assert isinstance(params["v"][0], str)
|
||||||
video_id = str(params["v"][0])
|
video_id = str(params["v"][0])
|
||||||
try:
|
try:
|
||||||
|
youtube_transcript_languages = kwargs.get(
|
||||||
|
"youtube_transcript_languages", ("en",)
|
||||||
|
)
|
||||||
# Must be a single transcript.
|
# Must be a single transcript.
|
||||||
transcript = YouTubeTranscriptApi.get_transcript(video_id) # type: ignore
|
transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=youtube_transcript_languages) # type: ignore
|
||||||
transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore
|
transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore
|
||||||
# Alternative formatting:
|
# Alternative formatting:
|
||||||
# formatter = TextFormatter()
|
# formatter = TextFormatter()
|
||||||
|
|
@@ -1003,7 +1006,7 @@ class MarkItDown:
|
||||||
self._append_ext(extensions, g)
|
self._append_ext(extensions, g)
|
||||||
|
|
||||||
# Convert
|
# Convert
|
||||||
result = self._convert(temp_path, extensions, url=response.url)
|
result = self._convert(temp_path, extensions, url=response.url, **kwargs)
|
||||||
# Clean up
|
# Clean up
|
||||||
finally:
|
finally:
|
||||||
try:
|
try:
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue