File size: 8,233 Bytes
8ad6537
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import re
from smolagents import tool
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound


def _extract_video_id(youtube_url: str) -> str | None:
    """
    Extracts the YouTube video ID from a URL.
    Handles standard, shortened, and embed URLs.
    """
    # Standard URL: https://www.youtube.com/watch?v=VIDEO_ID
    match = re.search(r"watch\?v=([^&]+)", youtube_url)
    if match:
        return match.group(1)

    # Shortened URL: https://youtu.be/VIDEO_ID
    match = re.search(r"youtu\.be/([^?&]+)", youtube_url)
    if match:
        return match.group(1)

    # Embed URL: https://www.youtube.com/embed/VIDEO_ID
    match = re.search(r"youtube\.com/embed/([^?&]+)", youtube_url)
    if match:
        return match.group(1)

    # Video ID directly passed
    # Basic check for a valid video ID format
    if re.fullmatch(r"^[a-zA-Z0-9_-]{11}$", youtube_url):
        return youtube_url

    return None


# English variants tried when none of the caller's preferred languages exist.
_ENGLISH_FALLBACK_LANGS = ['en', 'en-US', 'en-GB', 'en-AU', 'en-CA', 'en-IN']


def _list_transcripts(video_id: str):
    """Return the library's transcript listing for *video_id*."""
    # Older youtube_transcript_api releases expose the static
    # ``list_transcripts``; newer ones dropped it in favour of the instance
    # method ``list``. Prefer the original call when it is still available.
    if hasattr(YouTubeTranscriptApi, "list_transcripts"):
        return YouTubeTranscriptApi.list_transcripts(video_id)
    return YouTubeTranscriptApi().list(video_id)


def _pick_transcript(transcript_list, preferred_langs: list[str], video_id: str):
    """Choose the best transcript from *transcript_list*, or None if it is empty."""
    # Ordered fallback ladder: each preferred language (any kind, then
    # auto-generated), followed by the broader set of English variants.
    attempts = [(transcript_list.find_transcript, [code]) for code in preferred_langs]
    attempts += [(transcript_list.find_generated_transcript, [code]) for code in preferred_langs]
    attempts.append((transcript_list.find_transcript, _ENGLISH_FALLBACK_LANGS))
    attempts.append((transcript_list.find_generated_transcript, _ENGLISH_FALLBACK_LANGS))

    for finder, codes in attempts:
        try:
            return finder(codes)
        except NoTranscriptFound:
            continue

    # Last resort: whatever transcript the listing yields first.
    print(
        f"YouTubeTool: No English transcript found for {video_id}. Trying first available original language.")
    return next(iter(transcript_list), None)


def _segment_text(segment) -> str:
    """Return the caption text of one fetched transcript segment."""
    # Depending on the library version, ``fetch()`` yields plain dicts
    # ({'text': ...}) or snippet objects exposing a ``text`` attribute.
    if isinstance(segment, dict):
        return segment['text']
    return segment.text


@tool
def get_youtube_video_transcript(video_url_or_id: str, lang_preference: list[str] = ['en', 'en-US', 'en-GB']) -> str:
    """
    Retrieves the transcript for a given YouTube video URL or video ID.
    It tries to fetch the transcript in the preferred languages first (defaulting to English),
    then falls back to other English variants, and finally to the first transcript available.

    Args:
        video_url_or_id (str): The full YouTube video URL (e.g., "https://www.youtube.com/watch?v=VIDEO_ID")
                               or just the 11-character video ID.
        lang_preference (list[str]): A list of language codes to try for the transcript, in order of preference.
                                     Defaults to ['en', 'en-US', 'en-GB'].

    Returns:
        str: The concatenated transcript text if successful.
             An error message string starting with "Error:" if the transcript cannot be fetched
             (e.g., disabled, not found, invalid ID). This function does not raise.
    """
    video_id = _extract_video_id(video_url_or_id)

    if not video_id:
        return f"Error: Invalid YouTube video URL or ID provided: '{video_url_or_id}'. Could not extract a valid video ID."

    # Normalise the preference into a fresh list: a lone string must not be
    # iterated character by character, and None simply means "no preference".
    # The shared default list is only ever read, never mutated.
    if isinstance(lang_preference, str):
        preferred_langs = [lang_preference]
    else:
        preferred_langs = list(lang_preference or [])

    try:
        transcript_list = _list_transcripts(video_id)
        transcript = _pick_transcript(transcript_list, preferred_langs, video_id)

        if transcript is None:
            return f"Error: No transcripts at all seem to be available for video ID '{video_id}'."

        # Concatenate all text segments into a single string.
        return " ".join(_segment_text(segment) for segment in transcript.fetch())

    except TranscriptsDisabled:
        return f"Error: Transcripts are disabled for video ID '{video_id}'."
    except NoTranscriptFound:
        return f"Error: No transcripts whatsoever could be found for video ID '{video_id}'. The video might not have any captions or transcripts."
    except Exception as e:
        # Tool boundary: report any other failure as a string so the calling
        # agent receives a usable answer instead of a traceback.
        error_type = type(e).__name__
        message = str(e).lower()
        # Heuristic for a bad ID. The needles must be lowercase because the
        # message has been lowercased before the comparison.
        if "video id" in message or "parameter" in message:
            return f"Error: Could not retrieve transcript for video ID '{video_id}'. It might be an invalid ID or the video is private/deleted. (API Error: {error_type})"
        return f"Error: An unexpected error occurred while fetching transcript for video ID '{video_id}': {error_type} - {str(e)}"


if __name__ == '__main__':
    # Manual smoke test: runs the tool against a handful of representative
    # inputs and prints either a transcript preview or the error string.
    stargate_url = "https://www.youtube.com/watch?v=1htKBjuUWec"  # Stargate SG-1 Clip
    # Rick Astley (often has transcripts disabled or removed for fun)
    rick_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
    bare_video_id = "1htKBjuUWec"
    short_url = "https://youtu.be/1htKBjuUWec"
    bad_video_id = "invalidID123"
    # A video that might only have auto-generated transcripts or transcripts in other languages
    non_english_url = "https://www.youtube.com/watch?v=xU7rVbnefj0"

    # Each case: (banner to print, tool input, preview only the first 300 chars?)
    demo_cases = [
        (f"--- Testing with URL: {stargate_url} ---", stargate_url, True),
        (f"\n--- Testing with Rick Astley URL: {rick_url} ---", rick_url, False),
        (f"\n--- Testing with Video ID only: {bare_video_id} ---", bare_video_id, True),
        (f"\n--- Testing with Short URL: {short_url} ---", short_url, True),
        (f"\n--- Testing with Invalid Video ID: {bad_video_id} ---", bad_video_id, False),
        (f"\n--- Testing with non-English primary video: {non_english_url} ---", non_english_url, True),
    ]

    for banner, tool_input, preview_only in demo_cases:
        print(banner)
        outcome = get_youtube_video_transcript(tool_input)
        if preview_only and not outcome.startswith("Error:"):
            print(f"Transcript (first 300 chars): {outcome[:300]}...")
        else:
            # Error strings, and the cases expected to fail, are printed in full.
            print(outcome)