Spaces:
Sleeping
Sleeping
| import re | |
| from smolagents import tool | |
| from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound | |
| def _extract_video_id(youtube_url: str) -> str | None: | |
| """ | |
| Extracts the YouTube video ID from a URL. | |
| Handles standard, shortened, and embed URLs. | |
| """ | |
| # Standard URL: https://www.youtube.com/watch?v=VIDEO_ID | |
| match = re.search(r"watch\?v=([^&]+)", youtube_url) | |
| if match: | |
| return match.group(1) | |
| # Shortened URL: https://youtu.be/VIDEO_ID | |
| match = re.search(r"youtu\.be/([^?&]+)", youtube_url) | |
| if match: | |
| return match.group(1) | |
| # Embed URL: https://www.youtube.com/embed/VIDEO_ID | |
| match = re.search(r"youtube\.com/embed/([^?&]+)", youtube_url) | |
| if match: | |
| return match.group(1) | |
| # Video ID directly passed | |
| # Basic check for a valid video ID format | |
| if re.fullmatch(r"^[a-zA-Z0-9_-]{11}$", youtube_url): | |
| return youtube_url | |
| return None | |
def _list_transcripts(video_id: str):
    """Return the transcript listing for *video_id*, whichever API style the library offers."""
    # NOTE(review): per the youtube-transcript-api docs, releases >= 1.0 replace the
    # static `list_transcripts()` with an instance-level `list()`. The legacy call
    # is preferred whenever it still exists so behaviour on older installs is unchanged.
    legacy_lister = getattr(YouTubeTranscriptApi, "list_transcripts", None)
    if legacy_lister is not None:
        return legacy_lister(video_id)
    return YouTubeTranscriptApi().list(video_id)


def _select_transcript(transcript_list, lang_preference: list[str], video_id: str):
    """Pick the best transcript from *transcript_list*, or return None if it holds none at all."""
    english_variants = ['en', 'en-US', 'en-GB', 'en-AU', 'en-CA', 'en-IN']

    # Ordered by priority: each preferred language on its own, then each
    # preferred language as a generated transcript, then any English variant,
    # then any generated English variant.
    attempts = (
        [(transcript_list.find_transcript, [code]) for code in lang_preference]
        + [(transcript_list.find_generated_transcript, [code]) for code in lang_preference]
        + [
            (transcript_list.find_transcript, english_variants),
            (transcript_list.find_generated_transcript, english_variants),
        ]
    )
    for finder, languages in attempts:
        try:
            return finder(languages)
        except NoTranscriptFound:
            continue

    # Nothing preferred and nothing English: fall back to whatever comes first.
    print(
        f"YouTubeTool: No English transcript found for {video_id}. Trying first available original language.")
    return next(iter(transcript_list), None)


def _segment_text(segment) -> str:
    """Return the text of one fetched transcript segment (dict or snippet object)."""
    # NOTE(review): older library releases yield dicts from `fetch()`; newer ones
    # are documented to yield snippet objects with a `.text` attribute.
    if isinstance(segment, dict):
        return segment['text']
    return segment.text


def get_youtube_video_transcript(video_url_or_id: str, lang_preference: list[str] | None = None) -> str:
    """
    Retrieves the transcript for a given YouTube video URL or video ID.

    It tries to fetch the transcript in the preferred languages first (defaulting to
    English), then any English variant, and finally the first transcript the video
    offers in any language.

    Args:
        video_url_or_id (str): The full YouTube video URL (e.g., "https://www.youtube.com/watch?v=VIDEO_ID")
            or just the 11-character video ID.
        lang_preference (list[str] | None): Language codes to try for the transcript, in order of preference.
            Defaults to ['en', 'en-US', 'en-GB'] when omitted or None. A single
            language code given as a plain string is treated as a one-item list.

    Returns:
        str: The concatenated transcript text if successful.
            An error message string (starting with "Error:") if the transcript cannot be
            fetched (e.g., disabled, not found, invalid ID). This function does not raise
            for library errors; they are converted into error strings.
    """
    video_id = _extract_video_id(video_url_or_id)
    if not video_id:
        return f"Error: Invalid YouTube video URL or ID provided: '{video_url_or_id}'. Could not extract a valid video ID."

    # Build the default per call instead of sharing one mutable list in the signature.
    if lang_preference is None:
        lang_preference = ['en', 'en-US', 'en-GB']
    elif isinstance(lang_preference, str):
        # Iterating a bare string would try one character at a time.
        lang_preference = [lang_preference]
    else:
        lang_preference = list(lang_preference)

    try:
        transcript_list = _list_transcripts(video_id)
        transcript = _select_transcript(transcript_list, lang_preference, video_id)
        if transcript is None:
            return f"Error: No transcripts at all seem to be available for video ID '{video_id}'."

        # Concatenate all text segments into a single string.
        return " ".join(_segment_text(segment) for segment in transcript.fetch())
    except TranscriptsDisabled:
        return f"Error: Transcripts are disabled for video ID '{video_id}'."
    except NoTranscriptFound:
        # The selection helper handles this itself; this only fires if listing or fetching raises it.
        return f"Error: No transcripts whatsoever could be found for video ID '{video_id}'. The video might not have any captions or transcripts."
    except Exception as e:
        # Boundary handler: callers expect an error string, never an exception.
        error_type = type(e).__name__
        message = str(e)
        lowered = message.lower()
        # Heuristic for a bad ID. The needles must be lower-case because the
        # message is lower-cased before the comparison.
        if "video id" in lowered or "parameter" in lowered:
            return f"Error: Could not retrieve transcript for video ID '{video_id}'. It might be an invalid ID or the video is private/deleted. (API Error: {error_type})"
        return f"Error: An unexpected error occurred while fetching transcript for video ID '{video_id}': {error_type} - {message}"
if __name__ == '__main__':
    # Manual smoke test: every case below calls get_youtube_video_transcript
    # and prints either a 300-character preview or the raw result.
    test_video_url = "https://www.youtube.com/watch?v=1htKBjuUWec"  # Stargate SG-1 Clip
    # Rick Astley (often has transcripts disabled or removed for fun)
    test_video_url_no_transcript = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
    test_video_id_only = "1htKBjuUWec"
    test_video_short_url = "https://youtu.be/1htKBjuUWec"
    test_video_invalid_id = "invalidID123"
    # Example of a video that might only have auto-generated transcripts or transcripts in other languages
    test_video_non_english_primary = "https://www.youtube.com/watch?v=xU7rVbnefj0"

    # (banner, input, preview): with preview=True a successful transcript is
    # truncated to 300 characters; otherwise the result is printed in full.
    smoke_cases = [
        (f"--- Testing with URL: {test_video_url} ---",
         test_video_url, True),
        # Expected to often fail or have limited transcripts
        (f"\n--- Testing with Rick Astley URL: {test_video_url_no_transcript} ---",
         test_video_url_no_transcript, False),
        (f"\n--- Testing with Video ID only: {test_video_id_only} ---",
         test_video_id_only, True),
        (f"\n--- Testing with Short URL: {test_video_short_url} ---",
         test_video_short_url, True),
        (f"\n--- Testing with Invalid Video ID: {test_video_invalid_id} ---",
         test_video_invalid_id, False),
        (f"\n--- Testing with non-English primary video: {test_video_non_english_primary} ---",
         test_video_non_english_primary, True),
    ]

    for banner, target, preview in smoke_cases:
        print(banner)
        outcome = get_youtube_video_transcript(target)
        if preview and not outcome.startswith("Error:"):
            print(f"Transcript (first 300 chars): {outcome[:300]}...")
        else:
            print(outcome)