transcribe_audio

Running

App Files Community

cstr committed on Oct 2, 2024

Commit

e922c51

verified ·

1 Parent(s): 4f55f4b

Update app.py

Browse files

Files changed (1) hide show

app.py +51 -18

app.py CHANGED Viewed

@@ -173,7 +173,23 @@ def wget_method(url):
 def trim_audio(audio_path, start_time, end_time):
     logging.info(f"Trimming audio from {start_time} to {end_time}")
     audio = AudioSegment.from_file(audio_path)
-    trimmed_audio = audio[start_time*1000:end_time*1000] if end_time else audio[start_time*1000:]
     trimmed_audio_path = tempfile.mktemp(suffix='.wav')
     trimmed_audio.export(trimmed_audio_path, format="wav")
     logging.info(f"Trimmed audio saved to: {trimmed_audio_path}")
@@ -197,6 +213,20 @@ def get_model_options(pipeline_type):
         return []
 def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, download_method, start_time=None, end_time=None, verbose=False):
     try:
         logging.info(f"Transcription parameters: pipeline_type={pipeline_type}, model_id={model_id}, dtype={dtype}, batch_size={batch_size}, download_method={download_method}")
         verbose_messages = f"Starting transcription with parameters:\nPipeline Type: {pipeline_type}\nModel ID: {model_id}\nData Type: {dtype}\nBatch Size: {batch_size}\nDownload Method: {download_method}\n"
@@ -243,8 +273,11 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
         else:
             audio_path = input_source
         if start_time is not None or end_time is not None:
-            trimmed_audio_path = trim_audio(audio_path, start_time or 0, end_time)
             audio_path = trimmed_audio_path
             verbose_messages += f"Audio trimmed from {start_time} to {end_time}\n"
             if verbose:
@@ -289,23 +322,23 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
         yield f"An error occurred: {str(e)}", "", None
     finally:
-        if isinstance(input_source, str) and (input_source.startswith('http://') or input_source.startswith('https://')):
-            try:
-                os.remove(audio_path)
-            except:
-                pass
-        if start_time is not None or end_time is not None:
-            try:
-                os.remove(trimmed_audio_path)
-            except:
-                pass
 with gr.Blocks() as iface:
     gr.Markdown("# Multi-Pipeline Transcription")
     gr.Markdown("Transcribe audio using multiple pipelines and models.")
     with gr.Row():
-        input_source = gr.Textbox(label="Audio Source (Upload, URL, or YouTube URL)")
         pipeline_type = gr.Dropdown(
             choices=["faster-batched", "faster-sequenced", "transformers"],
             label="Pipeline Type",
@@ -327,8 +360,8 @@ with gr.Blocks() as iface:
         )
     with gr.Row():
-        start_time = gr.Number(label="Start Time (seconds)", value=0)
-        end_time = gr.Number(label="End Time (seconds)", value=0)
         verbose = gr.Checkbox(label="Verbose Output", value=True)  # Set to True by default
     transcribe_button = gr.Button("Transcribe")
@@ -366,9 +399,9 @@ with gr.Blocks() as iface:
     gr.Examples(
         examples=[
-            ["https://www.youtube.com/watch?v=daQ_hqA6HDo", "faster-batched", "cstr/whisper-large-v3-turbo-int8_float32", "int8", 16, "yt-dlp", 0, None, True],
-            ["https://mcdn.podbean.com/mf/web/dir5wty678b6g4vg/HoP_453_-_The_Price_is_Right_-_Law_and_Economics_in_the_Second_Scholastic5yxzh.mp3", "faster-sequenced", "deepdml/faster-whisper-large-v3-turbo-ct2", "float16", 1, "ffmpeg", 0, 300, True],
-            ["path/to/local/audio.mp3", "transformers", "openai/whisper-large-v3", "float16", 16, "yt-dlp", 60, 180, True]
         ],
         inputs=[input_source, pipeline_type, model_id, dtype, batch_size, download_method, start_time, end_time, verbose],
     )

 def trim_audio(audio_path, start_time, end_time):
     logging.info(f"Trimming audio from {start_time} to {end_time}")
     audio = AudioSegment.from_file(audio_path)
+    audio_duration = len(audio) / 1000  # Duration in seconds
+    # Default start and end times if None
+    if start_time is None:
+        start_time = 0
+    if end_time is None or end_time > audio_duration:
+        end_time = audio_duration
+    # Validate times
+    if start_time < 0 or end_time < 0:
+        raise ValueError("Start time and end time must be non-negative.")
+    if start_time >= end_time:
+        raise gr.Error("End time must be greater than start time.")
+    if start_time > audio_duration:
+        raise ValueError("Start time exceeds audio duration.")
+    trimmed_audio = audio[start_time * 1000:end_time * 1000]
     trimmed_audio_path = tempfile.mktemp(suffix='.wav')
     trimmed_audio.export(trimmed_audio_path, format="wav")
     logging.info(f"Trimmed audio saved to: {trimmed_audio_path}")
         return []
 def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, download_method, start_time=None, end_time=None, verbose=False):
+    try:
+        # Determine if input_source is a URL or file
+        if isinstance(input_source, str):
+            if input_source.startswith('http://') or input_source.startswith('https://'):
+                audio_path = download_audio(input_source, download_method)
+                # Handle potential errors during download
+                if not audio_path or audio_path.startswith("Error"):
+                    yield f"Error: {audio_path}", "", None
+                    return
+        else:
+            # Assume input_source is an uploaded file object
+            audio_path = input_source.name
+            logging.info(f"Using uploaded audio file: {audio_path}")
     try:
         logging.info(f"Transcription parameters: pipeline_type={pipeline_type}, model_id={model_id}, dtype={dtype}, batch_size={batch_size}, download_method={download_method}")
         verbose_messages = f"Starting transcription with parameters:\nPipeline Type: {pipeline_type}\nModel ID: {model_id}\nData Type: {dtype}\nBatch Size: {batch_size}\nDownload Method: {download_method}\n"
         else:
             audio_path = input_source
+        start_time = float(start_time) if start_time else None
+        end_time = float(end_time) if end_time else None
         if start_time is not None or end_time is not None:
+            trimmed_audio_path = trim_audio(audio_path, start_time, end_time)
             audio_path = trimmed_audio_path
             verbose_messages += f"Audio trimmed from {start_time} to {end_time}\n"
             if verbose:
         yield f"An error occurred: {str(e)}", "", None
     finally:
+        # Remove downloaded audio file
+        if audio_path and os.path.exists(audio_path):
+            os.remove(audio_path)
+        # Remove trimmed audio file
+        if 'trimmed_audio_path' in locals() and os.path.exists(trimmed_audio_path):
+            os.remove(trimmed_audio_path)
+        # Remove transcription file if needed
+        if transcription_file and os.path.exists(transcription_file):
+            os.remove(transcription_file)
 with gr.Blocks() as iface:
     gr.Markdown("# Multi-Pipeline Transcription")
     gr.Markdown("Transcribe audio using multiple pipelines and models.")
     with gr.Row():
+        input_source = gr.File(label="Audio Source (Upload a file or enter a URL/YouTube URL)")
         pipeline_type = gr.Dropdown(
             choices=["faster-batched", "faster-sequenced", "transformers"],
             label="Pipeline Type",
         )
     with gr.Row():
+        start_time = gr.Number(label="Start Time (seconds)", value=None, minimum=0)
+        end_time = gr.Number(label="End Time (seconds)", value=None, minimum=0)
         verbose = gr.Checkbox(label="Verbose Output", value=True)  # Set to True by default
     transcribe_button = gr.Button("Transcribe")
     gr.Examples(
         examples=[
+        ["https://www.youtube.com/watch?v=daQ_hqA6HDo", "faster-batched", "cstr/whisper-large-v3-turbo-int8_float32", "int8", 16, "yt-dlp", None, None, True],
+        ["https://mcdn.podbean.com/mf/web/dir5wty678b6g4vg/HoP_453_-_The_Price_is_Right_-_Law_and_Economics_in_the_Second_Scholastic5yxzh.mp3", "faster-sequenced", "deepdml/faster-whisper-large-v3-turbo-ct2", "float16", 1, "ffmpeg", 0, 300, True],
+        [None, "transformers", "openai/whisper-large-v3", "float16", 16, "yt-dlp", 60, 180, True]
         ],
         inputs=[input_source, pipeline_type, model_id, dtype, batch_size, download_method, start_time, end_time, verbose],
     )