						|  | """ | 
					
						
						|  | Example script for audio transcription using the model. | 
					
						
						|  |  | 
					
						
						|  | This script demonstrates how to: | 
					
						
						|  | 1. Load the model and processor | 
					
						
						|  | 2. Configure audio processing parameters | 
					
						
						|  | 3. Process audio input | 
					
						
						|  | 4. Generate transcription output | 
					
						
						|  |  | 
					
						
						|  | Usage: | 
					
						
						|  | python example_mini_audio.py --model_path <path_to_model> --audio_path <path_to_audio> | 
					
						
						|  | """ | 
					
						

import argparse

import torch
from transformers import AutoConfig, AutoModel, AutoProcessor

# Command-line arguments.
parser = argparse.ArgumentParser(description="Audio transcription example")
parser.add_argument("--model_path", type=str, default="./", help="Path to the model")
parser.add_argument("--audio_path", type=str, required=True, help="Path to the audio file")
parser.add_argument("--max_new_tokens", type=int, default=1024, help="Maximum number of tokens to generate")
parser.add_argument("--num_video_frames", type=int, default=128, help="Number of video frames to process")
parser.add_argument("--audio_length", type=str, default="max_3600", help="Maximum audio length")

args = parser.parse_args()

model_path = args.model_path
audio_path = args.audio_path
# A very large max_length leaves max_new_tokens as the effective generation cap.
generation_kwargs = {"max_new_tokens": args.max_new_tokens, "max_length": 99999999}
load_audio_in_video = True
num_video_frames = args.num_video_frames
audio_length = args.audio_length

# Load the model configuration, weights, and processor. trust_remote_code=True
# is required because the model ships its own modeling/processing code.
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)

model = AutoModel.from_pretrained(
    model_path,
    trust_remote_code=True,
    torch_dtype=torch.float16,  # load weights in half precision
    device_map="auto",
)

processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

# Start from the model's default generation settings and apply the overrides.
generation_config = model.default_generation_config
generation_config.update(**generation_kwargs)

# Mirror the media settings on both the model and processor configs so that
# preprocessing and generation agree.
model.config.load_audio_in_video = load_audio_in_video
processor.config.load_audio_in_video = load_audio_in_video
if num_video_frames > 0:
    model.config.num_video_frames = num_video_frames
    processor.config.num_video_frames = num_video_frames
if audio_length != -1:  # -1 keeps the model's default audio chunk length
    model.config.audio_chunk_length = audio_length
    processor.config.audio_chunk_length = audio_length

# A single-turn conversation pairing the audio clip with a transcription
# instruction, rendered through the model's chat template.
conversation = [{
    "role": "user",
    "content": [
        {"type": "audio", "audio": audio_path},
        {"type": "text", "text": "Transcribe the whole speech."},
    ],
}]
text = processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)

inputs = processor([text])

# Generate and decode; batch_decode returns a list with one string per sequence.
output_ids = model.generate(
    input_ids=inputs.input_ids,
    media=getattr(inputs, "media", None),
    media_config=getattr(inputs, "media_config", None),
    generation_config=generation_config,
)
print(processor.tokenizer.batch_decode(output_ids, skip_special_tokens=True))
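
# Example invocation, assuming the model lives in the current directory (the
# argparse default) and "example.wav" is a placeholder for your own audio file:
#   python example_mini_audio.py --model_path ./ --audio_path example.wav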