whettenr committed (verified)
Commit: a4a6dd1
Parent: 8926245

Upload folder using huggingface_hub

Files changed (6)
  1. README.md +61 -3
  2. example.wav +0 -0
  3. hyperparams.yaml +134 -0
  4. model.ckpt +3 -0
  5. normalize.ckpt +3 -0
  6. tokenizer.ckpt +3 -0
README.md CHANGED
@@ -1,3 +1,61 @@
- ---
- license: apache-2.0
- ---
+ ---
+ language:
+ - fon
+ thumbnail: null
+ tags:
+ - automatic-speech-recognition
+ - CTC
+ - Attention
+ - Transformer
+ - Conformer
+ - pytorch
+ - speechbrain
+ license: apache-2.0
+ datasets:
+ - beethogedeon/fongbe-speech
+ metrics:
+ - wer
+ - cer
+ ---
+
+ # Fongbe ASR model with diacritics
+
+ ### How to use for inference
+
+ ```python
+ from speechbrain.inference.ASR import EncoderASR
+
+ asr_model = EncoderASR.from_hparams(
+     source="whettenr/asr-fon-with-diacritics",
+     savedir="pretrained_models/asr-fongbe-with-diacritics",
+ )
+
+ asr_model.transcribe_file("whettenr/asr-fon-with-diacritics/example.wav")
+
+ # expected output:
+ # huzuhuzu gɔngɔn ɖé ɖò dandan
+ ```
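+
+ To run on GPU or to transcribe several utterances at once, `EncoderASR` also accepts `run_opts` and provides `transcribe_batch`. A minimal sketch (the `torchaudio` loading and resampling steps are assumptions, not part of this repo):
+
+ ```python
+ import torch
+ import torchaudio
+ from speechbrain.inference.ASR import EncoderASR
+
+ asr_model = EncoderASR.from_hparams(
+     source="whettenr/asr-fon-with-diacritics",
+     savedir="pretrained_models/asr-fongbe-with-diacritics",
+     run_opts={"device": "cuda"},  # drop this to stay on CPU
+ )
+
+ sig, sr = torchaudio.load("example.wav")
+ sig = torchaudio.functional.resample(sig, sr, 16000)  # model expects 16 kHz
+ wav_lens = torch.ones(sig.shape[0])  # relative lengths within the batch
+ words, tokens = asr_model.transcribe_batch(sig, wav_lens)
+ print(words)
+ ```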
+
+ ### Model details
+ ~100M parameters: a 12-layer Conformer encoder followed by an FFNN decoder and a CTC head.
+
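+ The `encoder` pipeline defined in `hyperparams.yaml` chains Fbank features, per-sentence input normalization, a two-block CNN front end, the Conformer encoder, a three-layer FFNN, and a linear CTC head with log-softmax. A minimal sketch for inspecting the loaded model (reusing `asr_model` from the snippet above; the dummy waveform is only for illustration):
+
+ ```python
+ import torch
+
+ # rough parameter count of the full encoder pipeline
+ n_params = sum(p.numel() for p in asr_model.mods.encoder.parameters())
+ print(f"{n_params / 1e6:.0f}M parameters")  # ~100M
+
+ # encode_batch runs the whole pipeline and returns CTC log-posteriors
+ wavs = torch.randn(1, 16000)  # 1 s of dummy 16 kHz audio
+ wav_lens = torch.ones(1)      # relative lengths within the batch
+ log_probs = asr_model.encode_batch(wavs, wav_lens)
+ print(log_probs.shape)        # (1, frames, 60): 60 CTC output symbols
+ ```
+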
+ ### Training details
+ - pretrained using BEST-RQ on ~140 hours of Fongbe speech
+   - FFSTC 2 + beethogedeon/fongbe-speech (~40 hours)
+   - cappfm (~100 hours)
+ - fine-tuned with CTC loss (a sketch follows below) on the training sets of
+   - FFSTC 2
+   - beethogedeon/fongbe-speech
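+
+ Fine-tuning uses plain CTC over the 60-symbol output vocabulary (`output_neurons_ctc: 60`, `blank_index: 0` in `hyperparams.yaml`). A minimal sketch of that objective with `torch.nn.CTCLoss`; the shapes and dummy tensors are illustrative assumptions, not the exact training recipe:
+
+ ```python
+ import torch
+
+ # log-posteriors as produced by the model: (batch, frames, 60)
+ log_probs = torch.randn(4, 200, 60).log_softmax(-1)
+ targets = torch.randint(1, 60, (4, 30))  # token ids; 0 is reserved for blank
+ input_lens = torch.full((4,), 200)
+ target_lens = torch.full((4,), 30)
+
+ # torch's CTC loss expects (frames, batch, vocab)
+ ctc = torch.nn.CTCLoss(blank=0)
+ loss = ctc(log_probs.transpose(0, 1), targets, input_lens, target_lens)
+ ```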
+
+ ### Citation
+
+ ```bibtex
+ @inproceedings{kponou25_interspeech,
+   title = {{Extending the Fongbe to French Speech Translation Corpus: resources, models and benchmark}},
+   author = {D. Fortuné Kponou and Salima Mdhaffar and Fréjus A. A. Laleye and Eugène C. Ezin and Yannick Estève},
+   year = {2025},
+   booktitle = {{Interspeech 2025}},
+   pages = {4533--4537},
+   doi = {10.21437/Interspeech.2025-1801},
+   issn = {2958-1796},
+ }
+ ```
example.wav ADDED
Binary file (98.4 kB).
hyperparams.yaml ADDED
@@ -0,0 +1,134 @@
+ # ################################
+ # Model: BEST-RQ + DNN + CTC
+ # Authors: Ryan Whetten 2025
+ # ################################
+
+
+ ####################### Model Parameters ###############################
+
+ # Feature parameters
+ sample_rate: 16000
+ n_fft: 400
+ n_mels: 80
+
+ # Transformer
+ d_model: 640
+ nhead: 8
+ num_encoder_layers: 12
+ num_decoder_layers: 0
+ d_ffn: 2048
+ transformer_dropout: 0.1
+ activation: !name:torch.nn.GELU
+ output_neurons: 5000
+ attention_type: RoPEMHA
+ encoder_module: conformer
+ dnn_activation: !new:torch.nn.LeakyReLU
+
+ # FFNN + output
+ dnn_neurons: 1024
+ dnn_dropout: 0.15
+ output_neurons_ctc: 60
+ blank_index: 0
+ bos_index: 1
+ eos_index: 2
+
+ # Normalization
+ normalize: !new:speechbrain.processing.features.InputNormalization
+     norm_type: sentence
+
+
+ # Filterbank features
+ compute_features: !new:speechbrain.lobes.features.Fbank
+     sample_rate: !ref <sample_rate>
+     n_fft: !ref <n_fft>
+     n_mels: !ref <n_mels>
+
+ ############################## models ##########################################
+
+ CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
+     input_shape: (8, 10, 80)
+     num_blocks: 2
+     num_layers_per_block: 1
+     out_channels: (128, 32)
+     kernel_sizes: (5, 5)
+     strides: (2, 2)
+     residuals: (False, False)
+
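+ # Note: two stride-2 blocks downsample time by 4, and the frequency axis
+ # shrinks 80 -> 40 -> 20; flattened with 32 channels this gives
+ # 20 * 32 = 640 features, matching the Transformer input_size below.
+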
+ Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
+     input_size: 640
+     tgt_vocab: !ref <output_neurons>
+     d_model: !ref <d_model>
+     nhead: !ref <nhead>
+     num_encoder_layers: !ref <num_encoder_layers>
+     num_decoder_layers: !ref <num_decoder_layers>
+     d_ffn: !ref <d_ffn>
+     dropout: !ref <transformer_dropout>
+     activation: !ref <activation>
+     conformer_activation: !ref <activation>
+     encoder_module: !ref <encoder_module>
+     attention_type: !ref <attention_type>
+     normalize_before: True
+     causal: False
+
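+ # num_decoder_layers is 0, so this is an encoder-only model; RoPE multi-head
+ # attention (RoPEMHA) is used inside the Conformer encoder.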
+ # We wrap the Transformer in an encoder-only wrapper so the (absent) decoder is never run
+ enc: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
+     transformer: !ref <Transformer>
+
+ back_end_ffn: !new:speechbrain.nnet.containers.Sequential
+     input_shape: [null, null, !ref <d_model>]
+     linear1: !name:speechbrain.nnet.linear.Linear
+         n_neurons: !ref <dnn_neurons>
+         bias: True
+     bn1: !name:speechbrain.nnet.normalization.BatchNorm1d
+     activation: !new:torch.nn.LeakyReLU
+     drop: !new:torch.nn.Dropout
+         p: 0.15
+     linear2: !name:speechbrain.nnet.linear.Linear
+         n_neurons: !ref <dnn_neurons>
+         bias: True
+     bn2: !name:speechbrain.nnet.normalization.BatchNorm1d
+     activation2: !new:torch.nn.LeakyReLU
+     drop2: !new:torch.nn.Dropout
+         p: 0.15
+     linear3: !name:speechbrain.nnet.linear.Linear
+         n_neurons: !ref <dnn_neurons>
+         bias: True
+     bn3: !name:speechbrain.nnet.normalization.BatchNorm1d
+     activation3: !new:torch.nn.LeakyReLU
+
+ ctc_lin: !new:speechbrain.nnet.linear.Linear
+     input_size: !ref <dnn_neurons>
+     n_neurons: !ref <output_neurons_ctc>
+
+ log_softmax: !new:speechbrain.nnet.activations.Softmax
+     apply_log: True
+
+
+ model: !new:torch.nn.ModuleList
+     - [!ref <CNN>, !ref <enc>, !ref <back_end_ffn>, !ref <ctc_lin>]
+
+ ####################### Encoding & Decoding ###################################
+
+ encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
+     compute_features: !ref <compute_features>
+     normalize: !ref <normalize>
+     CNN: !ref <CNN>
+     enc: !ref <enc>
+     back_end_ffn: !ref <back_end_ffn>
+     ctc_lin: !ref <ctc_lin>
+     log_softmax: !ref <log_softmax>
+
+ modules:
+     encoder: !ref <encoder>
+
+ decoding_function: !name:speechbrain.decoders.ctc_greedy_decode
+     blank_id: !ref <blank_index>
+
+ tokenizer: !new:sentencepiece.SentencePieceProcessor
+
+ # Pretrainer class
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+     loadables:
+         model: !ref <model>
+         normalize: !ref <normalize>
+         tokenizer: !ref <tokenizer>
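+
+ # At inference time, EncoderASR.from_hparams loads this file, the pretrainer
+ # restores <model>, <normalize> and <tokenizer> from the checkpoints in this
+ # repo, and transcription runs <encoder> followed by <decoding_function>.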
model.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:924e2c73e8e67ba96b2ab7259951b2f9be9821ed1124dbd1e19d485b51938a59
+ size 417375920
normalize.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bd7f3b9f13fc0393abd277dc2a53eed7acf15460c13091a632a607c73d641385
+ size 1572
tokenizer.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:01a2686f21a89bf8fe6db37fc8d0f1e1f62551eef3702c626ae019ef618bf86d
+ size 238364