Merge branch 'main' into feat-optional_b64

2025-03-21 00:50:17 +08:00 · 2025-03-21 00:50:17 +08:00 · 887dbbcf5c
commit 887dbbcf5c
parent e952ab1189 cd6aa41361
4 changed files with 19 additions and 28 deletions
--- a/packages/markitdown/pyproject.toml
+++ b/packages/markitdown/pyproject.toml
@ -27,7 +27,7 @@ dependencies = [
  "beautifulsoup4",
  "requests",
  "markdownify",
-  "magika>=0.6.1rc3",
+  "magika~=0.6.1",
  "charset-normalizer",
 ]
@ -42,7 +42,7 @@ all = [
  "olefile",
  "pydub",
  "SpeechRecognition",
-  "youtube-transcript-api",
+  "youtube-transcript-api~=1.0.0",
  "azure-ai-documentintelligence",
  "azure-identity"
 ]
--- a/packages/markitdown/src/markitdown/about.py
+++ b/packages/markitdown/src/markitdown/about.py
@ -1,4 +1,4 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.1.0a4"
+__version__ = "0.1.0a5"
--- a/packages/markitdown/src/markitdown/converters/_transcribe_audio.py
+++ b/packages/markitdown/src/markitdown/converters/_transcribe_audio.py
@ -7,19 +7,13 @@ from .._exceptions import MissingDependencyException
 # Save reporting of any exceptions for later
 _dependency_exc_info = None
 try:
-    # Suppress some deprecation warnings from the speech_recognition library
+    # Suppress some warnings on library import
    import warnings
-    warnings.filterwarnings(
+    with warnings.catch_warnings():
-        "ignore", category=DeprecationWarning, module="speech_recognition"
+        warnings.filterwarnings("ignore", category=DeprecationWarning)
-    )
+        warnings.filterwarnings("ignore", category=SyntaxWarning)
    warnings.filterwarnings(
        "ignore",
        category=SyntaxWarning,
        module="pydub",  # TODO: Migrate away from pydub
    )
        import speech_recognition as sr
        import pydub
 except ImportError:
    # Preserve the error and stack trace for later
--- a/packages/markitdown/src/markitdown/converters/_youtube_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_youtube_converter.py
@ -4,21 +4,20 @@ import time
 import io
 import re
 import bs4
 import warnings
 from typing import Any, BinaryIO, Optional, Dict, List, Union
 from urllib.parse import parse_qs, urlparse, unquote
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
 from ._markdownify import _CustomMarkdownify
 # Optional YouTube transcription support
 try:
-    warnings.filterwarnings(
+    # Suppress some warnings on library import
-        "ignore",
+    import warnings
-        category=SyntaxWarning,
+
-        module="youtube_transcript_api",  # Patch submitted to youtube-transcript-api
+    with warnings.catch_warnings():
-    )
+        warnings.filterwarnings("ignore", category=SyntaxWarning)
        # Patch submitted upstream to fix the SyntaxWarning
        from youtube_transcript_api import YouTubeTranscriptApi
    IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
@ -148,6 +147,7 @@ class YouTubeConverter(DocumentConverter):
            webpage_text += f"\n### Description\n{description}\n"
        if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
            ytt_api = YouTubeTranscriptApi()
            transcript_text = ""
            parsed_url = urlparse(stream_info.url)  # type: ignore
            params = parse_qs(parsed_url.query)  # type: ignore
@ -159,7 +159,7 @@ class YouTubeConverter(DocumentConverter):
                    )
                    # Retry the transcript fetching operation
                    transcript = self._retry_operation(
-                        lambda: YouTubeTranscriptApi.get_transcript(
+                        lambda: ytt_api.fetch(
                            video_id, languages=youtube_transcript_languages
                        ),
                        retries=3,  # Retry 3 times
@ -167,11 +167,8 @@ class YouTubeConverter(DocumentConverter):
                    )
                    if transcript:
                        transcript_text = " ".join(
-                            [part["text"] for part in transcript]
+                            [part.text for part in transcript]
                        )  # type: ignore
                    # Alternative formatting:
                    # formatter = TextFormatter()
                    # formatter.format_transcript(transcript)
                except Exception as e:
                    print(f"Error fetching transcript: {e}")
            if transcript_text: