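Updated versions of magika and youtube-transcript-api, and migrated the YouTube converter to the youtube-transcript-api 1.0 interface (instance `fetch()` with attribute-style snippets; SyntaxWarning filter scoped to the import)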

This commit is contained in:
Adam Fourney 2025-03-19 22:06:02 -07:00
parent e2eb82be7c
commit a8c76a01cf
2 changed files with 12 additions and 15 deletions

View file

@ -27,7 +27,7 @@ dependencies = [
"beautifulsoup4", "beautifulsoup4",
"requests", "requests",
"markdownify", "markdownify",
"magika>=0.6.1rc3", "magika~=0.6.1",
"charset-normalizer", "charset-normalizer",
] ]
@ -42,7 +42,7 @@ all = [
"olefile", "olefile",
"pydub", "pydub",
"SpeechRecognition", "SpeechRecognition",
"youtube-transcript-api", "youtube-transcript-api~=1.0.0",
"azure-ai-documentintelligence", "azure-ai-documentintelligence",
"azure-identity" "azure-identity"
] ]

View file

@ -4,22 +4,21 @@ import time
import io import io
import re import re
import bs4 import bs4
import warnings
from typing import Any, BinaryIO, Optional, Dict, List, Union from typing import Any, BinaryIO, Optional, Dict, List, Union
from urllib.parse import parse_qs, urlparse, unquote from urllib.parse import parse_qs, urlparse, unquote
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo from .._stream_info import StreamInfo
from ._markdownify import _CustomMarkdownify
# Optional YouTube transcription support # Optional YouTube transcription support
try: try:
warnings.filterwarnings( # Suppress some warnings on library import
"ignore", import warnings
category=SyntaxWarning,
module="youtube_transcript_api", # Patch submitted to youtube-transcript-api with warnings.catch_warnings():
) warnings.filterwarnings("ignore", category=SyntaxWarning)
from youtube_transcript_api import YouTubeTranscriptApi # Patch submitted upstream to fix the SyntaxWarning
from youtube_transcript_api import YouTubeTranscriptApi
IS_YOUTUBE_TRANSCRIPT_CAPABLE = True IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
except ModuleNotFoundError: except ModuleNotFoundError:
@ -148,6 +147,7 @@ class YouTubeConverter(DocumentConverter):
webpage_text += f"\n### Description\n{description}\n" webpage_text += f"\n### Description\n{description}\n"
if IS_YOUTUBE_TRANSCRIPT_CAPABLE: if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
ytt_api = YouTubeTranscriptApi()
transcript_text = "" transcript_text = ""
parsed_url = urlparse(stream_info.url) # type: ignore parsed_url = urlparse(stream_info.url) # type: ignore
params = parse_qs(parsed_url.query) # type: ignore params = parse_qs(parsed_url.query) # type: ignore
@ -159,7 +159,7 @@ class YouTubeConverter(DocumentConverter):
) )
# Retry the transcript fetching operation # Retry the transcript fetching operation
transcript = self._retry_operation( transcript = self._retry_operation(
lambda: YouTubeTranscriptApi.get_transcript( lambda: ytt_api.fetch(
video_id, languages=youtube_transcript_languages video_id, languages=youtube_transcript_languages
), ),
retries=3, # Retry 3 times retries=3, # Retry 3 times
@ -167,11 +167,8 @@ class YouTubeConverter(DocumentConverter):
) )
if transcript: if transcript:
transcript_text = " ".join( transcript_text = " ".join(
[part["text"] for part in transcript] [part.text for part in transcript]
) # type: ignore ) # type: ignore
# Alternative formatting:
# formatter = TextFormatter()
# formatter.format_transcript(transcript)
except Exception as e: except Exception as e:
print(f"Error fetching transcript: {e}") print(f"Error fetching transcript: {e}")
if transcript_text: if transcript_text: