Merge branch 'main' into feat-optional_b64

This commit is contained in:
Yuzhong Zhang 2025-03-21 00:50:17 +08:00 committed by GitHub
commit 887dbbcf5c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 19 additions and 28 deletions

View file

@@ -27,7 +27,7 @@ dependencies = [
"beautifulsoup4", "beautifulsoup4",
"requests", "requests",
"markdownify", "markdownify",
"magika>=0.6.1rc3", "magika~=0.6.1",
"charset-normalizer", "charset-normalizer",
] ]
@@ -42,7 +42,7 @@ all = [
"olefile", "olefile",
"pydub", "pydub",
"SpeechRecognition", "SpeechRecognition",
"youtube-transcript-api", "youtube-transcript-api~=1.0.0",
"azure-ai-documentintelligence", "azure-ai-documentintelligence",
"azure-identity" "azure-identity"
] ]

View file

@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com> # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
# #
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
__version__ = "0.1.0a4" __version__ = "0.1.0a5"

View file

@@ -7,19 +7,13 @@ from .._exceptions import MissingDependencyException
# Save reporting of any exceptions for later # Save reporting of any exceptions for later
_dependency_exc_info = None _dependency_exc_info = None
try: try:
# Suppress some deprecation warnings from the speech_recognition library # Suppress some warnings on library import
import warnings import warnings
warnings.filterwarnings( with warnings.catch_warnings():
"ignore", category=DeprecationWarning, module="speech_recognition" warnings.filterwarnings("ignore", category=DeprecationWarning)
) warnings.filterwarnings("ignore", category=SyntaxWarning)
warnings.filterwarnings(
"ignore",
category=SyntaxWarning,
module="pydub", # TODO: Migrate away from pydub
)
import speech_recognition as sr import speech_recognition as sr
import pydub import pydub
except ImportError: except ImportError:
# Preserve the error and stack trace for later # Preserve the error and stack trace for later

View file

@@ -4,21 +4,20 @@ import time
import io import io
import re import re
import bs4 import bs4
import warnings
from typing import Any, BinaryIO, Optional, Dict, List, Union from typing import Any, BinaryIO, Optional, Dict, List, Union
from urllib.parse import parse_qs, urlparse, unquote from urllib.parse import parse_qs, urlparse, unquote
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo from .._stream_info import StreamInfo
from ._markdownify import _CustomMarkdownify
# Optional YouTube transcription support # Optional YouTube transcription support
try: try:
warnings.filterwarnings( # Suppress some warnings on library import
"ignore", import warnings
category=SyntaxWarning,
module="youtube_transcript_api", # Patch submitted to youtube-transcript-api with warnings.catch_warnings():
) warnings.filterwarnings("ignore", category=SyntaxWarning)
# Patch submitted upstream to fix the SyntaxWarning
from youtube_transcript_api import YouTubeTranscriptApi from youtube_transcript_api import YouTubeTranscriptApi
IS_YOUTUBE_TRANSCRIPT_CAPABLE = True IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
@@ -148,6 +147,7 @@ class YouTubeConverter(DocumentConverter):
webpage_text += f"\n### Description\n{description}\n" webpage_text += f"\n### Description\n{description}\n"
if IS_YOUTUBE_TRANSCRIPT_CAPABLE: if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
ytt_api = YouTubeTranscriptApi()
transcript_text = "" transcript_text = ""
parsed_url = urlparse(stream_info.url) # type: ignore parsed_url = urlparse(stream_info.url) # type: ignore
params = parse_qs(parsed_url.query) # type: ignore params = parse_qs(parsed_url.query) # type: ignore
@@ -159,7 +159,7 @@ class YouTubeConverter(DocumentConverter):
) )
# Retry the transcript fetching operation # Retry the transcript fetching operation
transcript = self._retry_operation( transcript = self._retry_operation(
lambda: YouTubeTranscriptApi.get_transcript( lambda: ytt_api.fetch(
video_id, languages=youtube_transcript_languages video_id, languages=youtube_transcript_languages
), ),
retries=3, # Retry 3 times retries=3, # Retry 3 times
@@ -167,11 +167,8 @@ class YouTubeConverter(DocumentConverter):
) )
if transcript: if transcript:
transcript_text = " ".join( transcript_text = " ".join(
[part["text"] for part in transcript] [part.text for part in transcript]
) # type: ignore ) # type: ignore
# Alternative formatting:
# formatter = TextFormatter()
# formatter.format_transcript(transcript)
except Exception as e: except Exception as e:
print(f"Error fetching transcript: {e}") print(f"Error fetching transcript: {e}")
if transcript_text: if transcript_text: