whettenr committed on
Commit 93a0f4f · verified · 1 Parent(s): 9cd3827

Upload folder using huggingface_hub

Files changed (6)
  1. 4357.wav +0 -0
  2. README.md +33 -0
  3. hyperparams.yaml +191 -0
  4. model.ckpt +3 -0
  5. normalize.ckpt +3 -0
  6. tokenizer.ckpt +3 -0
4357.wav ADDED
Binary file (98.4 kB).
 
README.md CHANGED
@@ -1,3 +1,36 @@
  ---
+ language:
+ - fon
+ thumbnail: null
+ tags:
+ - automatic-speech-recognition
+ - CTC
+ - Attention
+ - Transformer
+ - Conformer
+ - pytorch
+ - speechbrain
  license: apache-2.0
+ datasets:
+ - largescaleasr
+ metrics:
+ - wer
+ - cer
  ---
+
+ # Fongbe ASR model without diacritics
+
+ ```python
+ from speechbrain.inference.ASR import EncoderASR
+
+ asr_model = EncoderASR.from_hparams(
+     source="whettenr/asr-fon-without-diacritics",
+     savedir="pretrained_models/asr-fongbe-without-diacritics"
+ )
+
+ # Transcribe the example audio hosted in this repository
+ asr_model.transcribe_file("whettenr/asr-fon-without-diacritics/4357.wav")
+ # Or transcribe a local recording
+ asr_model.transcribe_file("path/to/your_audio.wav")
+
+ # expected output for 4357.wav:
+ # huzuhuzu gɔngɔn ɖe ɖo dandan
+ ```
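For reference, here is a minimal sketch of batched, in-memory transcription with the same `EncoderASR` interface used in the README above. It is not part of the committed files; `torchaudio` and a local copy of the bundled `4357.wav` are assumed.

```python
# Sketch: in-memory (batched) transcription with SpeechBrain's EncoderASR.
# Assumes torchaudio is installed and 4357.wav has been downloaded locally.
import torch
import torchaudio
from speechbrain.inference.ASR import EncoderASR

asr_model = EncoderASR.from_hparams(
    source="whettenr/asr-fon-without-diacritics",
    savedir="pretrained_models/asr-fongbe-without-diacritics",
)

# The acoustic front-end expects 16 kHz audio (see sample_rate in hyperparams.yaml).
signal, sr = torchaudio.load("4357.wav")
if sr != 16000:
    signal = torchaudio.functional.resample(signal, sr, 16000)

# wav_lens gives the relative length (1.0 = full length) of each item in the batch.
wav_lens = torch.tensor([1.0])
predictions, tokens = asr_model.transcribe_batch(signal, wav_lens)
print(predictions[0])  # expected: huzuhuzu gɔngɔn ɖe ɖo dandan
```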
hyperparams.yaml ADDED
@@ -0,0 +1,191 @@
+ # ################################
+ # Model: bestRQ + DNN + CTC
+ # Authors: Ryan Whetten 2025
+ # ################################
+
+
+ ####################### Model Parameters ###############################
+
+ # Feature parameters
+ sample_rate: 16000
+ n_fft: 400
+ n_mels: 80
+
+ # Transformer
+ d_model: 640
+ nhead: 8
+ num_encoder_layers: 12
+ num_decoder_layers: 0
+ d_ffn: 2048
+ transformer_dropout: 0.1
+ activation: !name:torch.nn.GELU
+ output_neurons: 5000
+ attention_type: RoPEMHA
+ encoder_module: conformer
+ dnn_activation: !new:torch.nn.LeakyReLU
+
+ # FFNN + output
+ dnn_neurons: 1024
+ dnn_dropout: 0.15
+ output_neurons_ctc: 36
+ blank_index: 0
+ bos_index: 1
+ eos_index: 2
+
+ # normalizing
+ normalize: !new:speechbrain.processing.features.InputNormalization
+     norm_type: sentence
+
+
+ # fbanks
+ compute_features: !new:speechbrain.lobes.features.Fbank
+     sample_rate: !ref <sample_rate>
+     n_fft: !ref <n_fft>
+     n_mels: !ref <n_mels>
+
+ ############################## models ##########################################
+
+ CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
+     input_shape: (8, 10, 80)
+     num_blocks: 2
+     num_layers_per_block: 1
+     out_channels: (128, 32)
+     kernel_sizes: (5, 5)
+     strides: (2, 2)
+     residuals: (False, False)
+
+ Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
+     input_size: 640
+     tgt_vocab: !ref <output_neurons>
+     d_model: !ref <d_model>
+     nhead: !ref <nhead>
+     num_encoder_layers: !ref <num_encoder_layers>
+     num_decoder_layers: !ref <num_decoder_layers>
+     d_ffn: !ref <d_ffn>
+     dropout: !ref <transformer_dropout>
+     activation: !ref <activation>
+     conformer_activation: !ref <activation>
+     encoder_module: !ref <encoder_module>
+     attention_type: !ref <attention_type>
+     normalize_before: True
+     causal: False
+
+ # We must call an encoder wrapper so the decoder isn't run (we don't have any)
+ enc: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
+     transformer: !ref <Transformer>
+
+ back_end_ffn: !new:speechbrain.nnet.containers.Sequential
+     input_shape: [null, null, !ref <d_model>]
+     linear1: !name:speechbrain.nnet.linear.Linear
+         n_neurons: !ref <dnn_neurons>
+         bias: True
+     bn1: !name:speechbrain.nnet.normalization.BatchNorm1d
+     activation: !new:torch.nn.LeakyReLU
+     drop: !new:torch.nn.Dropout
+         p: 0.15
+     linear2: !name:speechbrain.nnet.linear.Linear
+         n_neurons: !ref <dnn_neurons>
+         bias: True
+     bn2: !name:speechbrain.nnet.normalization.BatchNorm1d
+     activation2: !new:torch.nn.LeakyReLU
+     drop2: !new:torch.nn.Dropout
+         p: 0.15
+     linear3: !name:speechbrain.nnet.linear.Linear
+         n_neurons: !ref <dnn_neurons>
+         bias: True
+     bn3: !name:speechbrain.nnet.normalization.BatchNorm1d
+     activation3: !new:torch.nn.LeakyReLU
+
+ ctc_lin: !new:speechbrain.nnet.linear.Linear
+     input_size: !ref <dnn_neurons>
+     n_neurons: !ref <output_neurons_ctc>
+
+ log_softmax: !new:speechbrain.nnet.activations.Softmax
+     apply_log: True
+
+
+ # modules:
+ #     normalize: !ref <normalize>
+ #     CNN: !ref <CNN>
+ #     enc: !ref <enc>
+ #     back_end_ffn: !ref <back_end_ffn>
+ #     ctc_lin: !ref <ctc_lin>
+
+ model: !new:torch.nn.ModuleList
+     - [!ref <CNN>, !ref <enc>, !ref <back_end_ffn>, !ref <ctc_lin>]
+
+ ####################### Encoding & Decoding ###################################
+
+ encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
+     compute_features: !ref <compute_features>
+     normalize: !ref <normalize>
+     CNN: !ref <CNN>
+     enc: !ref <enc>
+     back_end_ffn: !ref <back_end_ffn>
+     ctc_lin: !ref <ctc_lin>
+     log_softmax: !ref <log_softmax>
+
+ modules:
+     encoder: !ref <encoder>
+
+ decoding_function: !name:speechbrain.decoders.ctc_greedy_decode
+     blank_id: !ref <blank_index>
+
+ tokenizer: !new:sentencepiece.SentencePieceProcessor
+
+
+ # beam_size: 100
+ # beam_prune_logp: -12.0
+ # token_prune_min_logp: -1.2
+ # prune_history: False
+
+ # test_beam_search:
+ #     blank_index: !ref <blank_index>
+ #     beam_size: !ref <beam_size>
+ #     beam_prune_logp: !ref <beam_prune_logp>
+ #     token_prune_min_logp: !ref <token_prune_min_logp>
+ #     prune_history: !ref <prune_history>
+
+ # model_dir: '/Users/ryanwhetten/Projects/stream_asr/models/brq_ls_960/1000_bpe.model'
+ # text_file: '/Users/ryanwhetten/Projects/stream_asr/models/brq_ls_960/train.txt'
+ # vocab_size: !ref <output_neurons_ctc>
+ # model_type: 'bpe'
+ # bos_id: !ref <bos_index>
+ # eos_id: !ref <eos_index>
+
+
+ # model_dir=hparams["save_folder"],
+ # vocab_size=hparams["output_neurons_ctc"],
+ # annotation_train=hparams["train_csv"],
+ # annotation_read="wrd",
+ # model_type=hparams["token_type"],
+ # character_coverage=hparams["character_coverage"],
+ # bos_id=hparams["bos_index"],
+ # eos_id=hparams["eos_index"],
+
+ # kenlm_model_path: null
+
+ # # Decoding parameters
+ # test_beam_search:
+ #     beam_size: 200
+ #     topk: 1
+ #     blank_index: !ref <blank_index>
+ #     space_token: ' ' # make sure this is the same as the one used in the tokenizer
+ #     beam_prune_logp: -10.0
+ #     token_prune_min_logp: -5.0
+ #     prune_history: True
+ #     alpha: 0.8
+ #     beta: 1.2
+ #     # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM
+ #     # It can either be a .bin or .arpa ; note: .arpa is much slower at loading
+ #     # If you don't want to use an LM, comment it out or set it to null
+ #     kenlm_model_path: !ref <kenlm_model_path>
+
+ # Pretrainer class
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+     loadables:
+         model: !ref <model>
+         normalize: !ref <normalize>
+         tokenizer: !ref <tokenizer>
+
+ # make_tokenizer_streaming_context: !name:speechbrain.tokenizers.SentencePiece.SentencePieceDecoderStreamingContext
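For reference, a hedged sketch of what the `pretrainer` and `encoder` entries above do at load time. This mirrors what `EncoderASR.from_hparams` already performs internally; `local_dir` is an illustrative path pointing at a local copy of the files in this commit, not something defined by the repository.

```python
# Sketch: loading hyperparams.yaml with HyperPyYAML and running the Pretrainer.
# Assumes hyperparams.yaml, model.ckpt, normalize.ckpt and tokenizer.ckpt have
# already been downloaded into local_dir (illustrative; from_hparams does all of
# this for you).
from hyperpyyaml import load_hyperpyyaml

local_dir = "pretrained_models/asr-fongbe-without-diacritics"

with open(f"{local_dir}/hyperparams.yaml") as fin:
    hparams = load_hyperpyyaml(fin)

# The Pretrainer maps each loadable (model, normalize, tokenizer) onto its checkpoint.
pretrainer = hparams["pretrainer"]
pretrainer.set_collect_in(local_dir)
pretrainer.collect_files(default_source=local_dir)
pretrainer.load_collected()

# "encoder" chains Fbank -> InputNormalization -> CNN front-end -> Conformer
# encoder -> FFNN -> CTC head -> log-softmax; transcription then applies the
# greedy CTC decoding_function defined above.
encoder = hparams["encoder"]
tokenizer = hparams["tokenizer"]
```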
model.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c4b735565a7835818a722b76b456777923c97ee7dc41a1a2f3d5a31bb0e3c213
+ size 417280073
normalize.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bd7f3b9f13fc0393abd277dc2a53eed7acf15460c13091a632a607c73d641385
+ size 1572
tokenizer.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6c49d678d78932ead476676ce06f685f1086cded234121ec02c8c5c99c893c64
+ size 238118