Upload folder using huggingface_hub

Browse files

Files changed (6) hide show

4357.wav +0 -0
README.md +33 -0
hyperparams.yaml +191 -0
model.ckpt +3 -0
normalize.ckpt +3 -0
tokenizer.ckpt +3 -0

4357.wav ADDED Viewed

Binary file (98.4 kB). View file

README.md CHANGED Viewed

@@ -1,3 +1,36 @@
 ---
 license: apache-2.0
 ---

 ---
+language:
+- fon
+thumbnail: null
+tags:
+- automatic-speech-recognition
+- CTC
+- Attention
+- Transformer
+- Conformer
+- pytorch
+- speechbrain
 license: apache-2.0
+datasets:
+- largescaleasr
+metrics:
+- wer
+- cer
 ---
+# Fongbe ASR model without diacritics
+```python
+from speechbrain.inference.ASR import EncoderASR
+asr_model = EncoderASR.from_hparams(
+    source="whettenr/asr-fon-without-diacritics",
+    savedir="pretrained_models/asr-fongbe-without-diacritics"
+)
+# Transcribe the sample clip shipped in this repository (fetched from the Hub):
+asr_model.transcribe_file("whettenr/asr-fon-without-diacritics/4357.wav")
+# To transcribe your own recording, pass a local path instead:
+# asr_model.transcribe_file("path/to/your_audio.wav")
+# expected output:
+# huzuhuzu gɔngɔn ɖe ɖo dandan
+```

hyperparams.yaml ADDED Viewed

	@@ -0,0 +1,191 @@

+# ################################
+# Model: bestRQ + DNN + CTC
+# Authors: Ryan Whetten 2025
+# ################################
+####################### Model Parameters ###############################
+# Feature parameters
+sample_rate: 16000
+n_fft: 400
+n_mels: 80
+# Transformer
+d_model: 640
+nhead: 8
+num_encoder_layers: 12
+num_decoder_layers: 0
+d_ffn: 2048
+transformer_dropout: 0.1
+activation: !name:torch.nn.GELU
+output_neurons: 5000
+attention_type: RoPEMHA
+encoder_module: conformer
+dnn_activation: !new:torch.nn.LeakyReLU
+# FFNN + output
+dnn_neurons: 1024
+dnn_dropout: 0.15
+output_neurons_ctc: 36
+blank_index: 0
+bos_index: 1
+eos_index: 2
+# normalizing
+normalize: !new:speechbrain.processing.features.InputNormalization
+   norm_type: sentence
+# fbanks
+compute_features: !new:speechbrain.lobes.features.Fbank
+   sample_rate: !ref <sample_rate>
+   n_fft: !ref <n_fft>
+   n_mels: !ref <n_mels>
+############################## models ##########################################
+CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
+   input_shape: (8, 10, 80)
+   num_blocks: 2
+   num_layers_per_block: 1
+   out_channels: (128, 32)
+   kernel_sizes: (5, 5)
+   strides: (2, 2)
+   residuals: (False, False)
+Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
+   input_size: 640
+   tgt_vocab: !ref <output_neurons>
+   d_model: !ref <d_model>
+   nhead: !ref <nhead>
+   num_encoder_layers: !ref <num_encoder_layers>
+   num_decoder_layers: !ref <num_decoder_layers>
+   d_ffn: !ref <d_ffn>
+   dropout: !ref <transformer_dropout>
+   activation: !ref <activation>
+   conformer_activation: !ref <activation>
+   encoder_module: !ref <encoder_module>
+   attention_type: !ref <attention_type>
+   normalize_before: True
+   causal: False
+# We must call an encoder wrapper so the decoder isn't run (we don't have any)
+enc: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
+   transformer: !ref <Transformer>
+back_end_ffn: !new:speechbrain.nnet.containers.Sequential
+    input_shape: [null, null, !ref <d_model>]
+    linear1: !name:speechbrain.nnet.linear.Linear
+        n_neurons: !ref <dnn_neurons>
+        bias: True
+    bn1: !name:speechbrain.nnet.normalization.BatchNorm1d
+    activation: !new:torch.nn.LeakyReLU
+    drop: !new:torch.nn.Dropout
+        p: 0.15
+    linear2: !name:speechbrain.nnet.linear.Linear
+        n_neurons: !ref <dnn_neurons>
+        bias: True
+    bn2: !name:speechbrain.nnet.normalization.BatchNorm1d
+    activation2: !new:torch.nn.LeakyReLU
+    drop2: !new:torch.nn.Dropout
+        p: 0.15
+    linear3: !name:speechbrain.nnet.linear.Linear
+        n_neurons: !ref <dnn_neurons>
+        bias: True
+    bn3: !name:speechbrain.nnet.normalization.BatchNorm1d
+    activation3: !new:torch.nn.LeakyReLU
+ctc_lin: !new:speechbrain.nnet.linear.Linear
+   input_size: !ref <dnn_neurons>
+   n_neurons: !ref <output_neurons_ctc>
+log_softmax: !new:speechbrain.nnet.activations.Softmax
+   apply_log: True
+# modules:
+#    normalize: !ref <normalize>
+#    CNN: !ref <CNN>
+#    enc: !ref <enc>
+#    back_end_ffn: !ref <back_end_ffn>
+#    ctc_lin: !ref <ctc_lin>
+model: !new:torch.nn.ModuleList
+   - [!ref <CNN>, !ref <enc>, !ref <back_end_ffn>, !ref <ctc_lin>]
+####################### Encoding & Decoding ###################################
+encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
+    compute_features: !ref <compute_features>
+    normalize: !ref <normalize>
+    CNN: !ref <CNN>
+    enc: !ref <enc>
+    back_end_ffn: !ref <back_end_ffn>
+    ctc_lin: !ref <ctc_lin>
+    log_softmax: !ref <log_softmax>
+modules:
+   encoder: !ref <encoder>
+decoding_function: !name:speechbrain.decoders.ctc_greedy_decode
+    blank_id: !ref <blank_index>
+tokenizer: !new:sentencepiece.SentencePieceProcessor
+# beam_size: 100
+# beam_prune_logp: -12.0
+# token_prune_min_logp: -1.2
+# prune_history: False
+# test_beam_search:
+#     blank_index: !ref <blank_index>
+#     beam_size: !ref <beam_size>
+#     beam_prune_logp: !ref <beam_prune_logp>
+#     token_prune_min_logp: !ref <token_prune_min_logp>
+#     prune_history: !ref <prune_history>
+   # model_dir: '/Users/ryanwhetten/Projects/stream_asr/models/brq_ls_960/1000_bpe.model'
+   # text_file: '/Users/ryanwhetten/Projects/stream_asr/models/brq_ls_960/train.txt'
+   # vocab_size: !ref <output_neurons_ctc>
+   # model_type: 'bpe'
+   # bos_id: !ref <bos_index>
+   # eos_id: !ref <eos_index>
+      #   model_dir=hparams["save_folder"],
+      #   vocab_size=hparams["output_neurons_ctc"],
+      #   annotation_train=hparams["train_csv"],
+      #   annotation_read="wrd",
+      #   model_type=hparams["token_type"],
+      #   character_coverage=hparams["character_coverage"],
+      #   bos_id=hparams["bos_index"],
+      #   eos_id=hparams["eos_index"],
+# kenlm_model_path: null
+# # Decoding parameters
+# test_beam_search:
+#    beam_size: 200
+#    topk: 1
+#    blank_index: !ref <blank_index>
+#    space_token: ' ' # make sure this is the same as the one used in the tokenizer
+#    beam_prune_logp: -10.0
+#    token_prune_min_logp: -5.0
+#    prune_history: True
+#    alpha: 0.8
+#    beta: 1.2
+#    # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM
+#    # It can either be a .bin or .arpa ; note: .arpa is much slower at loading
+#    # If you don't want to use an LM, comment it out or set it to null
+#    kenlm_model_path: !ref <kenlm_model_path>
+# Pretrainer class
+pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+   loadables:
+      model: !ref <model>
+      normalize: !ref <normalize>
+      tokenizer: !ref <tokenizer>
+# make_tokenizer_streaming_context: !name:speechbrain.tokenizers.SentencePiece.SentencePieceDecoderStreamingContext

model.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c4b735565a7835818a722b76b456777923c97ee7dc41a1a2f3d5a31bb0e3c213
+size 417280073

normalize.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bd7f3b9f13fc0393abd277dc2a53eed7acf15460c13091a632a607c73d641385
+size 1572

tokenizer.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6c49d678d78932ead476676ce06f685f1086cded234121ec02c8c5c99c893c64
+size 238118