Spaces:

mikr
/

w2v-bert2-czech

Sleeping

App Files Community

mikr committed on Feb 4, 2024

Commit

1808ded

1 Parent(s): 7df6e8c

working demo

Browse files

Files changed (2) hide show

app.py +23 -37
requirements.txt +2 -0

app.py CHANGED Viewed

@@ -1,64 +1,50 @@
 import gradio as gr
 import soundfile as sf
 import torch
-from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline
 MODEL_NAME = "mikr/w2v-bert-2.0-czech-colab-cv16"
-lang = "cs"
 device = 0 if torch.cuda.is_available() else "cpu"
-model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME).to(device)
-processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
-pipe = pipeline(
-    model=MODEL_NAME,
-)
-def transcribe(file_upload):
-    warn_output = ""
-    if (file_upload is None):
-        return "ERROR: You have to either use the microphone or upload an audio file"
-    file = file_upload
-    text = pipe(file)["text"]
-    return warn_output + text
-def readwav(a_f):
-    wav, sr = sf.read(a_f, dtype=np.float32)
-    if len(wav.shape) == 2:
-        wav = wav.mean(1)
-    if sr != 16000:
-        wlen = int(wav.shape[0] / sr * 16000)
-        wav = signal.resample(wav, wlen)
-    return wav
-def transcribe2(file_upload):
-    wav = readwav(file_upload)
-    with torch.inference_mode():
-        input_values = processor(wav, sampling_rate=16000).input_values[0]
-        input_values = torch.tensor(input_values, device=device).unsqueeze(0)
-        logits = model(input_values).logits
-        pred_ids = torch.argmax(logits, dim=-1)
-        xcp = processor.batch_decode(pred_ids)
-        return xcp[0]
 iface = gr.Interface(
-    fn=transcribe2,
     inputs=[
-        gr.File(type="binary", label="Upload Audio File"),  # Audio file upload
     ],
     outputs="text",
     theme="huggingface",
-    title="Wav2Vec2-Bert demo - transcribe Czech Audio",
     description=(
-        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the fine-tuned"
-        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) from Whisper Fine Tuning Sprint Event 2022 "
         "and 🤗 Transformers to transcribe audio files of arbitrary length."
     ),
     allow_flagging="never",
 )
-iface.launch()

 import gradio as gr
 import soundfile as sf
 import torch
+import numpy as np
+import librosa
+from transformers import AutoProcessor, Wav2Vec2BertForCTC
 MODEL_NAME = "mikr/w2v-bert-2.0-czech-colab-cv16"
 device = 0 if torch.cuda.is_available() else "cpu"
+print("device:",device)
+processor = AutoProcessor.from_pretrained(MODEL_NAME)
+model = Wav2Vec2BertForCTC.from_pretrained(MODEL_NAME).to(device)
+def transcribe(audio_path):
+    a, s = librosa.load(audio_path, sr=16_000)
+    # inputs = processor(a, sampling_rate=s, return_tensors="pt")
+    input_values = processor(a, sampling_rate=s, return_tensors="pt").input_features
+    with torch.no_grad():
+        logits = model(input_values.to(device)).logits
+    predicted_ids = torch.argmax(logits, dim=-1)
+    # transcribe speech
+    transcription = processor.batch_decode(predicted_ids)
+    return transcription[0]
 iface = gr.Interface(
+    fn=transcribe,
     inputs=[
+        gr.File(type="filepath", label="Upload Audio File"),  # Audio file upload
     ],
     outputs="text",
     theme="huggingface",
+    title="Czech W2v-BERT 2.0 speech encoder demo - transcribe Czech Audio",
     description=(
+        "Transcribe audio inputs with the click of a button! Demo uses the fine-tuned"
+        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) from Facebook W2v-BERT 2.0 speech encoder "
         "and 🤗 Transformers to transcribe audio files of arbitrary length."
     ),
     allow_flagging="never",
 )
+iface.launch(server_name="0.0.0.0")

requirements.txt CHANGED Viewed

@@ -1,3 +1,5 @@
 git+https://github.com/huggingface/transformers
 torch
 soundfile

 git+https://github.com/huggingface/transformers
 torch
 soundfile
+librosa
+ffmpy