Spaces:

ignitariumcloud
/

NvidiaCanary-2.5

Sleeping

App Files Files Community

NvidiaCanary-2.5 / app.py

manueljohnson063

Update app.py

0e26843 verified 2 months ago

raw

history blame contribute delete

8.18 kB


	import os
	# Fix OpenMP environment variable issue
	os.environ['OMP_NUM_THREADS'] = '1'

	import gradio as gr
	from nemo.collections.speechlm2.models import SALM
	import torch
	import tempfile

	# Checkpoint identifier handed to SALM.from_pretrained
	model_id = "nvidia/canary-qwen-2.5b"
	print("Loading NVIDIA Canary-Qwen-2.5B model using NeMo...")

	# Run on the GPU when torch can see one, otherwise stay on the CPU
	device = "cuda" if torch.cuda.is_available() else "cpu"

	# Load the pretrained model and place it on the selected device
	model = SALM.from_pretrained(model_id).to(device)

	def generate_text(prompt, max_tokens=200, temperature=0.7, top_p=0.9):
	    """Generate a text-only reply with the model in LLM mode.

	    Args:
	        prompt: Text sent to the model as a single ``user`` turn.
	        max_tokens: Upper bound on newly generated tokens. Coerced to
	            ``int`` because UI sliders may deliver a float.
	        temperature: Sampling temperature forwarded to ``model.generate``.
	        top_p: Nucleus-sampling threshold forwarded to ``model.generate``.

	    Returns:
	        The decoded response text. If anything raises, a string starting
	        with ``"Error generating text: "`` is returned instead, so the UI
	        always receives something to display.
	    """
	    try:
	        # Text-only (LLM) mode: generation runs with the adapter disabled
	        with model.llm.disable_adapter():
	            answer_ids = model.generate(
	                prompts=[[{"role": "user", "content": prompt}]],
	                max_new_tokens=int(max_tokens),
	                temperature=temperature,
	                top_p=top_p,
	                do_sample=True,
	            )

	        # Decode on the host: move the generated IDs to CPU before handing
	        # them to the tokenizer, as the model card example does.
	        return model.tokenizer.ids_to_text(answer_ids[0].cpu())
	    except Exception as e:
	        # Deliberate catch-all at the UI boundary: report, don't crash
	        return f"Error generating text: {str(e)}"

	def transcribe_audio(audio_file, user_prompt="Transcribe the following:"):
	    """Transcribe an audio file with the model in ASR mode.

	    Args:
	        audio_file: Path to the audio file, or ``None`` when nothing was
	            uploaded.
	        user_prompt: Instruction placed before the audio locator tag. A
	            blank or ``None`` value falls back to the default prompt.

	    Returns:
	        The decoded transcript, ``"No audio file provided"`` when
	        ``audio_file`` is ``None``, or a string starting with
	        ``"Error transcribing audio: "`` if anything raises.
	    """
	    if audio_file is None:
	        return "No audio file provided"

	    # A cleared textbox yields an empty string; don't send the model an
	    # instruction-less prompt in that case.
	    if not user_prompt or not user_prompt.strip():
	        user_prompt = "Transcribe the following:"

	    try:
	        # Speech-to-text mode: one user turn carrying the instruction, the
	        # audio locator tag, and the audio file itself
	        answer_ids = model.generate(
	            prompts=[
	                [
	                    {
	                        "role": "user",
	                        "content": f"{user_prompt} {model.audio_locator_tag}",
	                        "audio": [audio_file],
	                    }
	                ]
	            ],
	            max_new_tokens=128,
	        )

	        # Decode on the host: move the generated IDs to CPU before handing
	        # them to the tokenizer, as the model card example does.
	        return model.tokenizer.ids_to_text(answer_ids[0].cpu())
	    except Exception as e:
	        # Deliberate catch-all at the UI boundary: report, don't crash
	        return f"Error transcribing audio: {str(e)}"

	def chat_interface(message, history, max_tokens, temperature, top_p):
	    """Handle one chat turn for the Gradio chatbot.

	    The prior turns and the new message are flattened into a single
	    ``User:/Assistant:`` transcript, which is passed to ``generate_text``.

	    Args:
	        message: Text the user just submitted.
	        history: Prior turns as ``(user_msg, bot_msg)`` pairs. ``None`` is
	            treated as an empty conversation. The list is not mutated.
	        max_tokens: Forwarded to ``generate_text``.
	        temperature: Forwarded to ``generate_text``.
	        top_p: Forwarded to ``generate_text``.

	    Returns:
	        A ``("", updated_history)`` tuple: the empty string clears the
	        input box, and ``updated_history`` is a new list with the latest
	        exchange appended. A blank message returns the history unchanged
	        without calling the model.
	    """
	    # Work on a copy so the caller's list is never modified in place
	    turns = list(history or [])

	    # Nothing to answer: skip the (expensive) model call entirely
	    if not message or not message.strip():
	        return "", turns

	    # Build conversation context
	    transcript = "".join(
	        f"User: {user_msg}\nAssistant: {bot_msg}\n" for user_msg, bot_msg in turns
	    )
	    transcript += f"User: {message}\nAssistant: "

	    response = generate_text(transcript, max_tokens, temperature, top_p)
	    turns.append((message, response))

	    return "", turns

	# Create Gradio interface
	# NOTE(review): the nesting of the layout containers below was reconstructed
	# from statement order; confirm it matches the intended page layout.
	with gr.Blocks(title="NVIDIA Canary-Qwen-2.5B Chat") as demo:

	    # Page banner
	    gr.HTML("""
	    <div style="text-align: center;">
	    <h1>🤖 NVIDIA Canary-Qwen-2.5B</h1>
	    <p>Official NeMo implementation - Speech-to-Text & Text Generation</p>
	    <p><strong>Capabilities:</strong> Audio Transcription + Text Chat</p>
	    </div>
	    """)

	    # Tab 1: upload audio, optionally edit the prompt, get a transcript
	    with gr.Tab("🎤 Audio Transcription (ASR)"):
	        with gr.Row():
	            with gr.Column():
	                # type="filepath" hands transcribe_audio a path on disk
	                audio_input = gr.Audio(
	                    label="Upload Audio File (.wav or .flac)",
	                    type="filepath",
	                    format="wav",
	                )

	                asr_prompt = gr.Textbox(
	                    label="Custom Prompt (optional)",
	                    value="Transcribe the following:",
	                    placeholder="Enter custom transcription prompt...",
	                )

	                transcribe_btn = gr.Button("🎤 Transcribe Audio", variant="primary")

	                transcript_output = gr.Textbox(
	                    label="Transcription Result",
	                    lines=8,
	                    max_lines=15,
	                )

	        # Clickable prompt presets that fill the custom prompt box
	        gr.Examples(
	            examples=[
	                ["Transcribe the following:"],
	                ["Please transcribe this audio in detail:"],
	                ["Convert this speech to text:"],
	            ],
	            inputs=[asr_prompt],
	        )

	    # Tab 2: multi-turn chat with sampling controls on the side
	    with gr.Tab("💬 Text Chat (LLM)"):
	        with gr.Row():
	            with gr.Column(scale=3):
	                chatbot = gr.Chatbot(height=400)
	                msg = gr.Textbox(label="Your message", placeholder="Type here...")

	                with gr.Row():
	                    submit_btn = gr.Button("Send", variant="primary")
	                    clear_btn = gr.Button("Clear Chat")

	            with gr.Column(scale=1):
	                gr.Markdown("### ⚙️ Settings")

	                max_tokens = gr.Slider(
	                    minimum=10, maximum=500, value=200, step=10,
	                    label="Max Tokens",
	                )

	                temperature = gr.Slider(
	                    minimum=0.1, maximum=2.0, value=0.7, step=0.1,
	                    label="Temperature",
	                )

	                top_p = gr.Slider(
	                    minimum=0.1, maximum=1.0, value=0.9, step=0.05,
	                    label="Top-p",
	                )

	    # Tab 3: one-shot prompt -> completion, with its own sampling controls
	    with gr.Tab("📝 Single Generation"):
	        with gr.Column():
	            prompt_input = gr.Textbox(
	                label="Prompt",
	                placeholder="Enter your prompt...",
	                lines=5,
	            )

	            generate_btn = gr.Button("Generate", variant="primary")

	            output_text = gr.Textbox(
	                label="Generated Text",
	                lines=10,
	                max_lines=20,
	            )

	            with gr.Row():
	                single_max_tokens = gr.Slider(10, 500, 200, label="Max Tokens")
	                single_temperature = gr.Slider(0.1, 2.0, 0.7, label="Temperature")
	                single_top_p = gr.Slider(0.1, 1.0, 0.9, label="Top-p")

	    # Tab 4: static model description
	    with gr.Tab("ℹ️ Model Info"):
	        gr.Markdown("""
	        ## NVIDIA Canary-Qwen-2.5B Model Information

	        ### Capabilities:
	        - 🎤 Audio Transcription (ASR): Convert speech to text
	        - 💬 Text Generation (LLM): Chat and text completion
	        - 🎯 Multimodal: Combines audio and text processing

	        ### Model Details:
	        - Size: 2.5 billion parameters
	        - Framework: NVIDIA NeMo
	        - Audio Input: 16kHz mono-channel .wav or .flac files
	        - Languages: English

	        ### Usage Tips:
	        1. For Audio: Upload .wav or .flac files (16kHz recommended)
	        2. For Text: Use natural language prompts
	        3. Custom Prompts: You can modify transcription prompts
	        4. Parameters: Adjust temperature and tokens for different outputs

	        ### Official Documentation:
	        - [Model Card](https://huggingface.co/nvidia/canary-qwen-2.5b)
	        - [NVIDIA NeMo](https://github.com/NVIDIA/NeMo)
	        """)

	    # Event handlers
	    transcribe_btn.click(
	        transcribe_audio,
	        inputs=[audio_input, asr_prompt],
	        outputs=[transcript_output],
	    )

	    # The Send button and pressing Enter in the message box run the same
	    # chat turn, so both listeners share one set of arguments.
	    chat_turn = dict(
	        fn=chat_interface,
	        inputs=[msg, chatbot, max_tokens, temperature, top_p],
	        outputs=[msg, chatbot],
	    )
	    submit_btn.click(**chat_turn)
	    msg.submit(**chat_turn)

	    # Reset both the conversation and the input box
	    clear_btn.click(lambda: ([], ""), outputs=[chatbot, msg])

	    generate_btn.click(
	        generate_text,
	        inputs=[prompt_input, single_max_tokens, single_temperature, single_top_p],
	        outputs=[output_text],
	    )

	    # Example prompts
	    gr.Examples(
	        examples=[
	            ["Explain quantum computing in simple terms"],
	            ["Write a short story about AI"],
	            ["What are the benefits of renewable energy?"],
	            ["How do neural networks work?"],
	            ["Summarize the key points about machine learning"],
	        ],
	        inputs=[prompt_input],
	    )

	if __name__ == "__main__":
	    # Hugging Face Spaces already serves the app publicly and does not
	    # support Gradio share links, so only request one when running
	    # elsewhere. SPACE_ID is an environment variable set by the Spaces
	    # runtime.
	    demo.launch(share="SPACE_ID" not in os.environ)