Update generate_slideshow.py
Browse files- generate_slideshow.py +23 -25
generate_slideshow.py
CHANGED
|
@@ -1,9 +1,8 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
Generates slide markdown plus TTS audio and images using Gemini models.
|
| 4 |
-
|
| 5 |
Functions exposed:
|
| 6 |
-
generate_slideshow_with_audio(topic) -> (list_of_slide_markdown, list_of_audio_paths, list_of_image_paths)
|
| 7 |
"""
|
| 8 |
|
| 9 |
import asyncio
|
|
@@ -38,7 +37,7 @@ except ImportError:
|
|
| 38 |
print("Deepgram SDK not available. Install with 'pip install deepgram-sdk'")
|
| 39 |
DEEPGRAM_AVAILABLE = False
|
| 40 |
|
| 41 |
-
|
| 42 |
DEEPGRAM_KEY = os.environ.get("DEEPGRAM_KEY")
|
| 43 |
|
| 44 |
# Dictionary to store temporary directories for cleanup
|
|
@@ -175,9 +174,9 @@ def _extract_markdown_slides(markdown: str) -> list[dict]:
|
|
| 175 |
|
| 176 |
|
| 177 |
# ──────────────────────────── Gemini Calls ───────────────────────────
|
| 178 |
-
async def _generate_image(prompt: str, output_path: Path) -> str:
|
| 179 |
"""Generate an image using Gemini Imagen model and save it to the specified path."""
|
| 180 |
-
client = genai.Client(api_key=
|
| 181 |
|
| 182 |
try:
|
| 183 |
# Make this call in a separate thread to not block the event loop
|
|
@@ -210,9 +209,9 @@ async def _generate_image(prompt: str, output_path: Path) -> str:
|
|
| 210 |
print(f"Error generating image: {e}")
|
| 211 |
return ""
|
| 212 |
|
| 213 |
-
def _generate_slideshow_markdown(topic: str) -> str:
|
| 214 |
"""Ask Gemini 2.5 Pro for a slide deck (returned as a JSON array) following strict rules."""
|
| 215 |
-
client = genai.Client(api_key=
|
| 216 |
#model = "gemini-2.5-flash-preview-05-20"
|
| 217 |
model = "gemini-2.5-pro-preview-06-05"
|
| 218 |
|
|
@@ -220,21 +219,17 @@ def _generate_slideshow_markdown(topic: str) -> str:
|
|
| 220 |
<role>
|
| 221 |
You are SlideGen, an AI that creates fun and engaging narrated slide decks with visual elements about various topics.
|
| 222 |
</role>
|
| 223 |
-
|
| 224 |
<instructions>
|
| 225 |
Create a presentation about '{topic}'.
|
| 226 |
Include:
|
| 227 |
- An introduction slide with bullet points about the overview of the presentation topic and the key areas that will be covered
|
| 228 |
- 3 content slides with bullet points
|
| 229 |
- A conclusion slide with bullet points summarizing the key points and insights.
|
| 230 |
-
|
| 231 |
For each slide provide:
|
| 232 |
1. Each title should be a single concise and coherent phrase accompanied by exactly one relevant emoji. (Do NOT use the colon ":" format for titles)
|
| 233 |
2. 3-4 concise bullet points, you will go into more detail in the speaker notes.
|
| 234 |
3. Clear prose speaker notes suitable for narration that is accessible to general audiences
|
| 235 |
4. A detailed and specific image prompt for an AI image generator that is relevent to the slide's content. Do not include any text in the image.
|
| 236 |
-
|
| 237 |
-
|
| 238 |
Respond with a JSON array where each element represents a slide in the following format:
|
| 239 |
```json
|
| 240 |
[
|
|
@@ -275,9 +270,9 @@ Respond with a JSON array where each element represents a slide in the following
|
|
| 275 |
return response.text.strip()
|
| 276 |
|
| 277 |
|
| 278 |
-
async def _generate_tts(narration: str, out_path: Path):
|
| 279 |
"""GenAI TTS → WAV - Async version with fallback model support"""
|
| 280 |
-
client = genai.Client(api_key=
|
| 281 |
|
| 282 |
# Try with flash model first, then fall back to pro model if needed
|
| 283 |
models_to_try = ["gemini-2.5-flash-preview-tts", "gemini-2.5-pro-preview-06-05"]
|
|
@@ -387,17 +382,22 @@ def _generate_tts_with_deepgram(narration: str, out_path: Path):
|
|
| 387 |
|
| 388 |
|
| 389 |
# ──────────────────────── Public Entry Point ───────────────────
|
| 390 |
-
async def generate_slideshow_with_audio_async(topic: str, **kwargs):
|
| 391 |
"""
|
| 392 |
Async version of generate_slideshow_with_audio that processes slides concurrently.
|
| 393 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 394 |
Returns:
|
| 395 |
slides_md : list[str] – markdown for each slide
|
| 396 |
audio : list[str] – file paths (one per slide, same order)
|
| 397 |
images : list[str|None] – file paths for slide images (one per slide, same order)
|
| 398 |
"""
|
| 399 |
# Get JSON response from Gemini
|
| 400 |
-
json_response = _generate_slideshow_markdown(topic)
|
| 401 |
|
| 402 |
# Parse JSON into slides data
|
| 403 |
slides_data = _parse_slides_json(json_response)
|
|
@@ -439,7 +439,7 @@ async def generate_slideshow_with_audio_async(topic: str, **kwargs):
|
|
| 439 |
# Schedule TTS task
|
| 440 |
if narration:
|
| 441 |
print(f"Scheduling TTS for slide {i} -> {wav_path}")
|
| 442 |
-
tts_tasks.append(_generate_tts(narration, wav_path))
|
| 443 |
else:
|
| 444 |
# Create empty placeholder WAV if no narration
|
| 445 |
with open(wav_path, "wb") as f:
|
|
@@ -457,7 +457,7 @@ async def generate_slideshow_with_audio_async(topic: str, **kwargs):
|
|
| 457 |
image_path = pres_dir / f"{safe_topic}_slide_{i:02d}_image.jpg"
|
| 458 |
print(f"Scheduling image for slide {i} -> {image_path}")
|
| 459 |
# Store task with index to track which slide it belongs to
|
| 460 |
-
image_tasks.append((i-1, _generate_image(image_prompt, image_path)))
|
| 461 |
else:
|
| 462 |
print(f"No image prompt for slide {i}, skipping image generation.")
|
| 463 |
|
|
@@ -491,13 +491,14 @@ async def generate_slideshow_with_audio_async(topic: str, **kwargs):
|
|
| 491 |
return slides_md, audio_files, slide_images
|
| 492 |
|
| 493 |
|
| 494 |
-
def generate_slideshow_with_audio(topic: str, **kwargs):
|
| 495 |
"""
|
| 496 |
Synchronous wrapper for the async slideshow generation function.
|
| 497 |
Maintains backward compatibility with existing code.
|
| 498 |
|
| 499 |
Args:
|
| 500 |
topic: The topic to generate a slideshow about
|
|
|
|
| 501 |
**kwargs: Optional parameters including:
|
| 502 |
- session_id: Unique identifier for the user session
|
| 503 |
|
|
@@ -506,25 +507,22 @@ def generate_slideshow_with_audio(topic: str, **kwargs):
|
|
| 506 |
audio : list[str] – file paths (one per slide, same order)
|
| 507 |
images : list[str|None] – file paths for slide images (one per slide, same order)
|
| 508 |
"""
|
| 509 |
-
return asyncio.run(generate_slideshow_with_audio_async(topic, **kwargs))
|
| 510 |
|
| 511 |
|
| 512 |
-
def validate_topic(topic: str) -> bool:
|
| 513 |
"""Use Gemini Flash Preview to determine if a topic is suitable for a slideshow."""
|
| 514 |
-
client = genai.Client(api_key=
|
| 515 |
system_prompt = f'''
|
| 516 |
<role>
|
| 517 |
You are SlideGenInputGuard, an AI assistant that determines if a user input is a suitable topic for a narrated slideshow presentation.
|
| 518 |
</role>
|
| 519 |
-
|
| 520 |
<instructions>
|
| 521 |
Evaluate if "{topic}" is a real-world topic, question, or concept suitable for an educational slideshow. It is fine to include topics that are silly and not real-world topics.
|
| 522 |
If it is a valid topic, respond with exactly: 1
|
| 523 |
If it is nonsense, gibberish, meaningless, empty, or not a valid topic, respond with exactly: 0
|
| 524 |
-
|
| 525 |
Only respond with a single digit: 1 or 0. No spaces, newlines or explanations. JUST THE NUMBER 1 OR 0.
|
| 526 |
</instructions>
|
| 527 |
-
|
| 528 |
<examples>
|
| 529 |
Input:How does lightning form?
|
| 530 |
Output:1
|
|
@@ -559,4 +557,4 @@ Output:0
|
|
| 559 |
config=types.GenerateContentConfig(response_mime_type="text/plain", temperature=0),
|
| 560 |
)
|
| 561 |
result = response.text.strip()
|
| 562 |
-
return result == "1"
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
Generates slide markdown plus TTS audio and images using Gemini models.
|
|
|
|
| 4 |
Functions exposed:
|
| 5 |
+
generate_slideshow_with_audio(topic, api_key) -> (list_of_slide_markdown, list_of_audio_paths, list_of_image_paths)
|
| 6 |
"""
|
| 7 |
|
| 8 |
import asyncio
|
|
|
|
| 37 |
print("Deepgram SDK not available. Install with 'pip install deepgram-sdk'")
|
| 38 |
DEEPGRAM_AVAILABLE = False
|
| 39 |
|
| 40 |
+
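# The Gemini API key is no longer a module-level global; it is passed to each function as `api_key`.
# Only the Deepgram key is still read from the environment here.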
|
| 41 |
DEEPGRAM_KEY = os.environ.get("DEEPGRAM_KEY")
|
| 42 |
|
| 43 |
# Dictionary to store temporary directories for cleanup
|
|
|
|
| 174 |
|
| 175 |
|
| 176 |
# ──────────────────────────── Gemini Calls ───────────────────────────
|
| 177 |
+
async def _generate_image(prompt: str, output_path: Path, api_key: str) -> str:
|
| 178 |
"""Generate an image using Gemini Imagen model and save it to the specified path."""
|
| 179 |
+
client = genai.Client(api_key=api_key)
|
| 180 |
|
| 181 |
try:
|
| 182 |
# Make this call in a separate thread to not block the event loop
|
|
|
|
| 209 |
print(f"Error generating image: {e}")
|
| 210 |
return ""
|
| 211 |
|
| 212 |
+
def _generate_slideshow_markdown(topic: str, api_key: str) -> str:
|
| 213 |
"""Ask Gemini 2.5 Pro for a slide deck (returned as a JSON array) following strict rules."""
|
| 214 |
+
client = genai.Client(api_key=api_key)
|
| 215 |
#model = "gemini-2.5-flash-preview-05-20"
|
| 216 |
model = "gemini-2.5-pro-preview-06-05"
|
| 217 |
|
|
|
|
| 219 |
<role>
|
| 220 |
You are SlideGen, an AI that creates fun and engaging narrated slide decks with visual elements about various topics.
|
| 221 |
</role>
|
|
|
|
| 222 |
<instructions>
|
| 223 |
Create a presentation about '{topic}'.
|
| 224 |
Include:
|
| 225 |
- An introduction slide with bullet points about the overview of the presentation topic and the key areas that will be covered
|
| 226 |
- 3 content slides with bullet points
|
| 227 |
- A conclusion slide with bullet points summarizing the key points and insights.
|
|
|
|
| 228 |
For each slide provide:
|
| 229 |
1. Each title should be a single concise and coherent phrase accompanied by exactly one relevant emoji. (Do NOT use the colon ":" format for titles)
|
| 230 |
2. 3-4 concise bullet points, you will go into more detail in the speaker notes.
|
| 231 |
3. Clear prose speaker notes suitable for narration that is accessible to general audiences
|
| 232 |
4. A detailed and specific image prompt for an AI image generator that is relevent to the slide's content. Do not include any text in the image.
|
|
|
|
|
|
|
| 233 |
Respond with a JSON array where each element represents a slide in the following format:
|
| 234 |
```json
|
| 235 |
[
|
|
|
|
| 270 |
return response.text.strip()
|
| 271 |
|
| 272 |
|
| 273 |
+
async def _generate_tts(narration: str, out_path: Path, api_key: str):
|
| 274 |
"""GenAI TTS → WAV - Async version with fallback model support"""
|
| 275 |
+
client = genai.Client(api_key=api_key)
|
| 276 |
|
| 277 |
# Try with flash model first, then fall back to pro model if needed
|
| 278 |
models_to_try = ["gemini-2.5-flash-preview-tts", "gemini-2.5-pro-preview-06-05"]
|
|
|
|
| 382 |
|
| 383 |
|
| 384 |
# ──────────────────────── Public Entry Point ───────────────────
|
| 385 |
+
async def generate_slideshow_with_audio_async(topic: str, api_key: str, **kwargs):
|
| 386 |
"""
|
| 387 |
Async version of generate_slideshow_with_audio that processes slides concurrently.
|
| 388 |
|
| 389 |
+
Args:
|
| 390 |
+
topic: The topic to generate a slideshow about
|
| 391 |
+
api_key: Gemini API key
|
| 392 |
+
**kwargs: Optional parameters including session_id
|
| 393 |
+
|
| 394 |
Returns:
|
| 395 |
slides_md : list[str] – markdown for each slide
|
| 396 |
audio : list[str] – file paths (one per slide, same order)
|
| 397 |
images : list[str|None] – file paths for slide images (one per slide, same order)
|
| 398 |
"""
|
| 399 |
# Get JSON response from Gemini
|
| 400 |
+
json_response = _generate_slideshow_markdown(topic, api_key)
|
| 401 |
|
| 402 |
# Parse JSON into slides data
|
| 403 |
slides_data = _parse_slides_json(json_response)
|
|
|
|
| 439 |
# Schedule TTS task
|
| 440 |
if narration:
|
| 441 |
print(f"Scheduling TTS for slide {i} -> {wav_path}")
|
| 442 |
+
tts_tasks.append(_generate_tts(narration, wav_path, api_key))
|
| 443 |
else:
|
| 444 |
# Create empty placeholder WAV if no narration
|
| 445 |
with open(wav_path, "wb") as f:
|
|
|
|
| 457 |
image_path = pres_dir / f"{safe_topic}_slide_{i:02d}_image.jpg"
|
| 458 |
print(f"Scheduling image for slide {i} -> {image_path}")
|
| 459 |
# Store task with index to track which slide it belongs to
|
| 460 |
+
image_tasks.append((i-1, _generate_image(image_prompt, image_path, api_key)))
|
| 461 |
else:
|
| 462 |
print(f"No image prompt for slide {i}, skipping image generation.")
|
| 463 |
|
|
|
|
| 491 |
return slides_md, audio_files, slide_images
|
| 492 |
|
| 493 |
|
| 494 |
+
def generate_slideshow_with_audio(topic: str, api_key: str, **kwargs):
|
| 495 |
"""
|
| 496 |
Synchronous wrapper for the async slideshow generation function.
|
| 497 |
Keeps the synchronous call style for existing callers; note that `api_key` is now a required argument.
|
| 498 |
|
| 499 |
Args:
|
| 500 |
topic: The topic to generate a slideshow about
|
| 501 |
+
api_key: Gemini API key
|
| 502 |
**kwargs: Optional parameters including:
|
| 503 |
- session_id: Unique identifier for the user session
|
| 504 |
|
|
|
|
| 507 |
audio : list[str] – file paths (one per slide, same order)
|
| 508 |
images : list[str|None] – file paths for slide images (one per slide, same order)
|
| 509 |
"""
|
| 510 |
+
return asyncio.run(generate_slideshow_with_audio_async(topic, api_key, **kwargs))
|
| 511 |
|
| 512 |
|
| 513 |
+
def validate_topic(topic: str, api_key: str) -> bool:
|
| 514 |
"""Use Gemini Flash Preview to determine if a topic is suitable for a slideshow."""
|
| 515 |
+
client = genai.Client(api_key=api_key)
|
| 516 |
system_prompt = f'''
|
| 517 |
<role>
|
| 518 |
You are SlideGenInputGuard, an AI assistant that determines if a user input is a suitable topic for a narrated slideshow presentation.
|
| 519 |
</role>
|
|
|
|
| 520 |
<instructions>
|
| 521 |
Evaluate if "{topic}" is a real-world topic, question, or concept suitable for an educational slideshow. It is fine to include topics that are silly and not real-world topics.
|
| 522 |
If it is a valid topic, respond with exactly: 1
|
| 523 |
If it is nonsense, gibberish, meaningless, empty, or not a valid topic, respond with exactly: 0
|
|
|
|
| 524 |
Only respond with a single digit: 1 or 0. No spaces, newlines or explanations. JUST THE NUMBER 1 OR 0.
|
| 525 |
</instructions>
|
|
|
|
| 526 |
<examples>
|
| 527 |
Input:How does lightning form?
|
| 528 |
Output:1
|
|
|
|
| 557 |
config=types.GenerateContentConfig(response_mime_type="text/plain", temperature=0),
|
| 558 |
)
|
| 559 |
result = response.text.strip()
|
| 560 |
+
return result == "1"
|