Updated versions of magika and youtube-transcript-api
This commit is contained in:
parent
e2eb82be7c
commit
a8c76a01cf
2 changed files with 12 additions and 15 deletions
|
|
@@ -27,7 +27,7 @@ dependencies = [
|
||||||
"beautifulsoup4",
|
"beautifulsoup4",
|
||||||
"requests",
|
"requests",
|
||||||
"markdownify",
|
"markdownify",
|
||||||
"magika>=0.6.1rc3",
|
"magika~=0.6.1",
|
||||||
"charset-normalizer",
|
"charset-normalizer",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
@@ -42,7 +42,7 @@ all = [
|
||||||
"olefile",
|
"olefile",
|
||||||
"pydub",
|
"pydub",
|
||||||
"SpeechRecognition",
|
"SpeechRecognition",
|
||||||
"youtube-transcript-api",
|
"youtube-transcript-api~=1.0.0",
|
||||||
"azure-ai-documentintelligence",
|
"azure-ai-documentintelligence",
|
||||||
"azure-identity"
|
"azure-identity"
|
||||||
]
|
]
|
||||||
|
|
|
||||||
|
|
@@ -4,22 +4,21 @@ import time
|
||||||
import io
|
import io
|
||||||
import re
|
import re
|
||||||
import bs4
|
import bs4
|
||||||
import warnings
|
|
||||||
from typing import Any, BinaryIO, Optional, Dict, List, Union
|
from typing import Any, BinaryIO, Optional, Dict, List, Union
|
||||||
from urllib.parse import parse_qs, urlparse, unquote
|
from urllib.parse import parse_qs, urlparse, unquote
|
||||||
|
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._stream_info import StreamInfo
|
from .._stream_info import StreamInfo
|
||||||
from ._markdownify import _CustomMarkdownify
|
|
||||||
|
|
||||||
# Optional YouTube transcription support
|
# Optional YouTube transcription support
|
||||||
try:
|
try:
|
||||||
warnings.filterwarnings(
|
# Suppress some warnings on library import
|
||||||
"ignore",
|
import warnings
|
||||||
category=SyntaxWarning,
|
|
||||||
module="youtube_transcript_api", # Patch submitted to youtube-transcript-api
|
with warnings.catch_warnings():
|
||||||
)
|
warnings.filterwarnings("ignore", category=SyntaxWarning)
|
||||||
from youtube_transcript_api import YouTubeTranscriptApi
|
# Patch submitted upstream to fix the SyntaxWarning
|
||||||
|
from youtube_transcript_api import YouTubeTranscriptApi
|
||||||
|
|
||||||
IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
|
IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
|
||||||
except ModuleNotFoundError:
|
except ModuleNotFoundError:
|
||||||
|
|
@@ -148,6 +147,7 @@ class YouTubeConverter(DocumentConverter):
|
||||||
webpage_text += f"\n### Description\n{description}\n"
|
webpage_text += f"\n### Description\n{description}\n"
|
||||||
|
|
||||||
if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
|
if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
|
||||||
|
ytt_api = YouTubeTranscriptApi()
|
||||||
transcript_text = ""
|
transcript_text = ""
|
||||||
parsed_url = urlparse(stream_info.url) # type: ignore
|
parsed_url = urlparse(stream_info.url) # type: ignore
|
||||||
params = parse_qs(parsed_url.query) # type: ignore
|
params = parse_qs(parsed_url.query) # type: ignore
|
||||||
|
|
@@ -159,7 +159,7 @@ class YouTubeConverter(DocumentConverter):
|
||||||
)
|
)
|
||||||
# Retry the transcript fetching operation
|
# Retry the transcript fetching operation
|
||||||
transcript = self._retry_operation(
|
transcript = self._retry_operation(
|
||||||
lambda: YouTubeTranscriptApi.get_transcript(
|
lambda: ytt_api.fetch(
|
||||||
video_id, languages=youtube_transcript_languages
|
video_id, languages=youtube_transcript_languages
|
||||||
),
|
),
|
||||||
retries=3, # Retry 3 times
|
retries=3, # Retry 3 times
|
||||||
|
|
@@ -167,11 +167,8 @@ class YouTubeConverter(DocumentConverter):
|
||||||
)
|
)
|
||||||
if transcript:
|
if transcript:
|
||||||
transcript_text = " ".join(
|
transcript_text = " ".join(
|
||||||
[part["text"] for part in transcript]
|
[part.text for part in transcript]
|
||||||
) # type: ignore
|
) # type: ignore
|
||||||
# Alternative formatting:
|
|
||||||
# formatter = TextFormatter()
|
|
||||||
# formatter.format_transcript(transcript)
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error fetching transcript: {e}")
|
print(f"Error fetching transcript: {e}")
|
||||||
if transcript_text:
|
if transcript_text:
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue