agent-course-gaia

Sleeping

App · Files · Community

agent-course-gaia / youtube_tool.py

kirbah

Add YouTube transcript retrieval functionality and update requirements

8ad6537 6 months ago

raw

history blame contribute delete

8.23 kB

	import re
	from smolagents import tool
	from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound


	def _extract_video_id(youtube_url: str) -> str \| None:
	"""
	Extracts the YouTube video ID from a URL.
	Handles standard, shortened, and embed URLs.
	"""
	# Standard URL: https://www.youtube.com/watch?v=VIDEO_ID
	match = re.search(r"watch\?v=([^&]+)", youtube_url)
	if match:
	return match.group(1)

	# Shortened URL: https://youtu.be/VIDEO_ID
	match = re.search(r"youtu\.be/([^?&]+)", youtube_url)
	if match:
	return match.group(1)

	# Embed URL: https://www.youtube.com/embed/VIDEO_ID
	match = re.search(r"youtube\.com/embed/([^?&]+)", youtube_url)
	if match:
	return match.group(1)

	# Video ID directly passed
	# Basic check for a valid video ID format
	if re.fullmatch(r"^[a-zA-Z0-9_-]{11}$", youtube_url):
	return youtube_url

	return None


	@tool
	def get_youtube_video_transcript(video_url_or_id: str, lang_preference: list[str] = ['en', 'en-US', 'en-GB']) -> str:
	    """
	    Retrieves the transcript for a given YouTube video URL or video ID.
	    It tries to fetch the transcript in the preferred languages first (defaulting to English).

	    Args:
	        video_url_or_id (str): The full YouTube video URL (e.g., "https://www.youtube.com/watch?v=VIDEO_ID")
	            or just the 11-character video ID.
	        lang_preference (list[str]): A list of language codes to try for the transcript, in order of preference.
	            Defaults to ['en', 'en-US', 'en-GB'].

	    Returns:
	        str: The concatenated transcript text if successful.
	            An error message string if the transcript cannot be fetched (e.g., disabled, not found, invalid ID).
	    """
	    # NOTE(review): `@tool` presumably builds the tool description from the
	    # docstring above, so its wording is deliberately left untouched — confirm
	    # before editing it.
	    # The default `lang_preference` list is only read, never mutated, so sharing
	    # it across calls is safe here.

	    # English variants tried when none of the preferred languages is available.
	    english_codes = ['en', 'en-US', 'en-GB', 'en-AU', 'en-CA', 'en-IN']

	    video_id = _extract_video_id(video_url_or_id)

	    if not video_id:
	        return f"Error: Invalid YouTube video URL or ID provided: '{video_url_or_id}'. Could not extract a valid video ID."

	    try:
	        # Fetch available transcripts
	        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

	        # Try to find transcript in preferred languages
	        transcript = None
	        for lang_code in lang_preference:
	            try:
	                transcript = transcript_list.find_transcript([lang_code])
	                break
	            except NoTranscriptFound:
	                continue

	        # If not found in preferred, try generated transcript in preferred languages
	        if not transcript:
	            for lang_code in lang_preference:
	                try:
	                    transcript = transcript_list.find_generated_transcript([lang_code])
	                    break
	                except NoTranscriptFound:
	                    continue

	        # If still not found, try any available English transcript
	        if not transcript:
	            try:
	                transcript = transcript_list.find_transcript(english_codes)
	            except NoTranscriptFound:
	                pass  # Continue to try any generated English transcript

	        if not transcript:
	            try:
	                transcript = transcript_list.find_generated_transcript(english_codes)
	            except NoTranscriptFound:
	                # If no English transcript, grab the first available original language transcript
	                try:
	                    print(
	                        f"YouTubeTool: No English transcript found for {video_id}. Trying first available original language.")
	                    transcript = next(iter(transcript_list))  # get the first one
	                except StopIteration:  # No transcripts at all
	                    return f"Error: No transcripts at all seem to be available for video ID '{video_id}'."
	                except NoTranscriptFound:  # Should be caught by StopIteration if list is empty
	                    return f"Error: No transcripts found for video ID '{video_id}' after trying preferred and English languages."

	        if transcript:
	            full_transcript_data = transcript.fetch()
	            # Concatenate all text segments into a single string.
	            # Segments are dicts ({'text': ...}) in older library releases and
	            # snippet objects exposing `.text` in newer ones; accept both so the
	            # tool does not fail with a TypeError after a library upgrade.
	            transcript_text = " ".join(
	                segment['text'] if isinstance(segment, dict) else segment.text
	                for segment in full_transcript_data
	            )
	            return transcript_text
	        else:
	            # This case should ideally be covered by the fallbacks above
	            return f"Error: Could not find a suitable transcript for video ID '{video_id}' in languages: {lang_preference} or English."

	    except TranscriptsDisabled:
	        return f"Error: Transcripts are disabled for video ID '{video_id}'."
	    # This might catch cases where video ID is valid but has zero transcripts at all.
	    except NoTranscriptFound:
	        return f"Error: No transcripts whatsoever could be found for video ID '{video_id}'. The video might not have any captions or transcripts."
	    except Exception as e:
	        # Catch any other unexpected errors from the API or video ID issues not caught by regex.
	        # The tool reports failures as strings rather than raising.
	        error_type = type(e).__name__
	        # Heuristic for bad ID. The message is lowercased, so the needles must
	        # be lowercase too (the previous "video ID" needle could never match).
	        error_text = str(e).lower()
	        if "video id" in error_text or "parameter" in error_text:
	            return f"Error: Could not retrieve transcript for video ID '{video_id}'. It might be an invalid ID or the video is private/deleted. (API Error: {error_type})"
	        return f"Error: An unexpected error occurred while fetching transcript for video ID '{video_id}': {error_type} - {str(e)}"


	if __name__ == '__main__':
	    # Manual smoke run: call the tool on a handful of inputs and print what
	    # comes back. Requires network access; nothing is asserted.
	    stargate_url = "https://www.youtube.com/watch?v=1htKBjuUWec"  # Stargate SG-1 Clip
	    # Rick Astley (often has transcripts disabled or removed for fun)
	    rick_astley_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
	    bare_video_id = "1htKBjuUWec"
	    short_url = "https://youtu.be/1htKBjuUWec"
	    invalid_id = "invalidID123"
	    # Example of a video that might only have auto-generated transcripts or transcripts in other languages
	    non_english_url = "https://www.youtube.com/watch?v=xU7rVbnefj0"

	    # (banner, tool input, preview?) — with preview=True a successful transcript
	    # is cut to its first 300 characters; otherwise the result is printed whole.
	    demo_cases = (
	        (f"--- Testing with URL: {stargate_url} ---", stargate_url, True),
	        # Expected to often fail or have limited transcripts
	        (f"\n--- Testing with Rick Astley URL: {rick_astley_url} ---", rick_astley_url, False),
	        (f"\n--- Testing with Video ID only: {bare_video_id} ---", bare_video_id, True),
	        (f"\n--- Testing with Short URL: {short_url} ---", short_url, True),
	        (f"\n--- Testing with Invalid Video ID: {invalid_id} ---", invalid_id, False),
	        (f"\n--- Testing with non-English primary video: {non_english_url} ---", non_english_url, True),
	    )

	    for banner, tool_input, preview in demo_cases:
	        print(banner)
	        outcome = get_youtube_video_transcript(tool_input)
	        if preview and not outcome.startswith("Error:"):
	            print(f"Transcript (first 300 chars): {outcome[:300]}...")
	        else:
	            print(outcome)