Upload folder using huggingface_hub
- README.md +61 -3
- example.wav +0 -0
- hyperparams.yaml +134 -0
- model.ckpt +3 -0
- normalize.ckpt +3 -0
- tokenizer.ckpt +3 -0
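The commit title indicates the files were pushed with `huggingface_hub`. As a hedged sketch (the author's actual call is not shown in this diff; the local folder path is hypothetical and the repo id is taken from the README's inference example), a commit like this one could be produced with:

```python
from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path="path/to/local/model_folder",        # hypothetical local path
    repo_id="whettenr/asr-fon-with-diacritics",       # from the README below
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)
```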
README.md
CHANGED
@@ -1,3 +1,61 @@
---
language:
- fon
thumbnail: null
tags:
- automatic-speech-recognition
- CTC
- Attention
- Transformer
- Conformer
- pytorch
- speechbrain
license: apache-2.0
datasets:
- beethogedeon/fongbe-speech
metrics:
- wer
- cer
---

# Fongbe ASR model with diacritics

### How to use for inference

```python
from speechbrain.inference.ASR import EncoderASR

asr_model = EncoderASR.from_hparams(
    source="whettenr/asr-fon-with-diacritics",
    savedir="pretrained_models/asr-fongbe-with-diacritics",
)

asr_model.transcribe_file("whettenr/asr-fon-with-diacritics/example.wav")

# expected output:
# huzuhuzu gɔngɔn ɖé ɖò dandan
```

### Details of the model
~100M parameters: a 12-layer Conformer encoder followed by a feed-forward (FFNN) decoder.
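If you want to sanity-check that figure after loading the model, something like the sketch below should work. It assumes the `asr_model` object from the inference snippet above and that the SpeechBrain `EncoderASR` interface exposes its modules under `.mods`, as in recent releases (older versions may differ).

```python
# Hedged sketch: rough parameter count of the loaded encoder pipeline.
# Assumes `asr_model` from the snippet above and a recent SpeechBrain
# release where Pretrained interfaces store their modules in `.mods`.
n_params = sum(p.numel() for p in asr_model.mods.parameters())
print(f"~{n_params / 1e6:.0f}M parameters")
```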

### Details of training
- pretrained with BEST-RQ on 140 hours of Fongbe speech:
  - FFSTC 2 + beethogedeon/fongbe-speech (~40 hours)
  - cappfm (~100 hours)
- fine-tuned with CTC loss (see the sketch below) on the training sets of:
  - FFSTC 2
  - beethogedeon/fongbe-speech
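The fine-tuning objective is standard CTC over the encoder's per-frame log-probabilities, with 60 output units and blank index 0 (see `hyperparams.yaml` below). The following is a minimal, purely illustrative sketch of that loss with dummy tensors, not the actual training recipe:

```python
import torch

# Illustrative only: shapes and blank index mirror hyperparams.yaml
# (output_neurons_ctc: 60, blank_index: 0); tensors are random stand-ins.
T, N, C = 120, 4, 60                                    # frames, batch, CTC units
log_probs = torch.randn(T, N, C).log_softmax(dim=-1)    # encoder log-probabilities
targets = torch.randint(3, C, (N, 20))                  # token ids (0/1/2 reserved)
input_lengths = torch.full((N,), T, dtype=torch.long)
target_lengths = torch.full((N,), 20, dtype=torch.long)

ctc = torch.nn.CTCLoss(blank=0)
loss = ctc(log_probs, targets, input_lengths, target_lengths)
print(loss.item())
```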

### Citation

```
@inproceedings{kponou25_interspeech,
  title     = {{Extending the Fongbe to French Speech Translation Corpus: resources, models and benchmark}},
  author    = {D. Fortuné Kponou and Salima Mdhaffar and Fréjus A. A. Laleye and Eugène C. Ezin and Yannick Estève},
  year      = {2025},
  booktitle = {{Interspeech 2025}},
  pages     = {4533--4537},
  doi       = {10.21437/Interspeech.2025-1801},
  issn      = {2958-1796},
}
```
example.wav
ADDED
Binary file (98.4 kB)
hyperparams.yaml
ADDED
@@ -0,0 +1,134 @@
# ################################
# Model: bestRQ + DNN + CTC
# Authors: Ryan Whetten 2025
# ################################


####################### Model Parameters ###############################

# Feature parameters
sample_rate: 16000
n_fft: 400
n_mels: 80

# Transformer
d_model: 640
nhead: 8
num_encoder_layers: 12
num_decoder_layers: 0
d_ffn: 2048
transformer_dropout: 0.1
activation: !name:torch.nn.GELU
output_neurons: 5000
attention_type: RoPEMHA
encoder_module: conformer
dnn_activation: !new:torch.nn.LeakyReLU

# FFNN + output
dnn_neurons: 1024
dnn_dropout: 0.15
output_neurons_ctc: 60
blank_index: 0
bos_index: 1
eos_index: 2

# normalizing
normalize: !new:speechbrain.processing.features.InputNormalization
    norm_type: sentence


# fbanks
compute_features: !new:speechbrain.lobes.features.Fbank
    sample_rate: !ref <sample_rate>
    n_fft: !ref <n_fft>
    n_mels: !ref <n_mels>

############################## models ##########################################

CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
    input_shape: (8, 10, 80)
    num_blocks: 2
    num_layers_per_block: 1
    out_channels: (128, 32)
    kernel_sizes: (5, 5)
    strides: (2, 2)
    residuals: (False, False)

Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
    input_size: 640
    tgt_vocab: !ref <output_neurons>
    d_model: !ref <d_model>
    nhead: !ref <nhead>
    num_encoder_layers: !ref <num_encoder_layers>
    num_decoder_layers: !ref <num_decoder_layers>
    d_ffn: !ref <d_ffn>
    dropout: !ref <transformer_dropout>
    activation: !ref <activation>
    conformer_activation: !ref <activation>
    encoder_module: !ref <encoder_module>
    attention_type: !ref <attention_type>
    normalize_before: True
    causal: False

# We must call an encoder wrapper so the decoder isn't run (we don't have any)
enc: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
    transformer: !ref <Transformer>

back_end_ffn: !new:speechbrain.nnet.containers.Sequential
    input_shape: [null, null, !ref <d_model>]
    linear1: !name:speechbrain.nnet.linear.Linear
        n_neurons: !ref <dnn_neurons>
        bias: True
    bn1: !name:speechbrain.nnet.normalization.BatchNorm1d
    activation: !new:torch.nn.LeakyReLU
    drop: !new:torch.nn.Dropout
        p: 0.15
    linear2: !name:speechbrain.nnet.linear.Linear
        n_neurons: !ref <dnn_neurons>
        bias: True
    bn2: !name:speechbrain.nnet.normalization.BatchNorm1d
    activation2: !new:torch.nn.LeakyReLU
    drop2: !new:torch.nn.Dropout
        p: 0.15
    linear3: !name:speechbrain.nnet.linear.Linear
        n_neurons: !ref <dnn_neurons>
        bias: True
    bn3: !name:speechbrain.nnet.normalization.BatchNorm1d
    activation3: !new:torch.nn.LeakyReLU

ctc_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dnn_neurons>
    n_neurons: !ref <output_neurons_ctc>

log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True


model: !new:torch.nn.ModuleList
    - [!ref <CNN>, !ref <enc>, !ref <back_end_ffn>, !ref <ctc_lin>]

####################### Encoding & Decoding ###################################

encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
    compute_features: !ref <compute_features>
    normalize: !ref <normalize>
    CNN: !ref <CNN>
    enc: !ref <enc>
    back_end_ffn: !ref <back_end_ffn>
    ctc_lin: !ref <ctc_lin>
    log_softmax: !ref <log_softmax>

modules:
    encoder: !ref <encoder>

decoding_function: !name:speechbrain.decoders.ctc_greedy_decode
    blank_id: !ref <blank_index>

tokenizer: !new:sentencepiece.SentencePieceProcessor

# Pretrainer class
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        model: !ref <model>
        normalize: !ref <normalize>
        tokenizer: !ref <tokenizer>
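For readers of the config: the `encoder` sequence, `decoding_function`, and `tokenizer` declared above are what the `EncoderASR` interface chains together at inference time. Below is a rough, hedged sketch of that path, assuming the `asr_model` object from the README snippet; method names such as `load_audio` and `encode_batch` come from recent SpeechBrain releases and may differ across versions.

```python
import torch
from speechbrain.decoders import ctc_greedy_decode

# Hedged sketch of the inference path defined by this hyperparams.yaml:
# Fbank -> InputNormalization -> CNN front-end -> Conformer encoder ->
# FFNN -> ctc_lin -> log_softmax, then greedy CTC decoding + SentencePiece.
wavs = asr_model.load_audio("example.wav").unsqueeze(0)   # [batch, time]
wav_lens = torch.tensor([1.0])                            # relative lengths

log_probs = asr_model.encode_batch(wavs, wav_lens)        # runs <encoder>
ids = ctc_greedy_decode(log_probs, wav_lens, blank_id=0)  # <decoding_function>
print(asr_model.tokenizer.decode(ids[0]))                 # <tokenizer>
```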
model.ckpt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:924e2c73e8e67ba96b2ab7259951b2f9be9821ed1124dbd1e19d485b51938a59
size 417375920
normalize.ckpt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bd7f3b9f13fc0393abd277dc2a53eed7acf15460c13091a632a607c73d641385
size 1572
tokenizer.ckpt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:01a2686f21a89bf8fe6db37fc8d0f1e1f62551eef3702c626ae019ef618bf86d
size 238364