# ################################
# Model: BEST-RQ + DNN + CTC
# Authors: Ryan Whetten 2025
# ################################
####################### Model Parameters ###############################
# Feature parameters
sample_rate: 16000
n_fft: 400
n_mels: 80
# Transformer
d_model: 640
nhead: 8
num_encoder_layers: 12
num_decoder_layers: 0
d_ffn: 2048
transformer_dropout: 0.1
activation: !name:torch.nn.GELU
output_neurons: 5000
attention_type: RoPEMHA
encoder_module: conformer
dnn_activation: !new:torch.nn.LeakyReLU
# FFNN + output
dnn_neurons: 1024
dnn_dropout: 0.15
output_neurons_ctc: 60
blank_index: 0
bos_index: 1
eos_index: 2
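# Note: with only 60 CTC output units, the targets are presumably a small
# character-level vocabulary (an assumption); blank_index 0 is the CTC blank
# used by the greedy decoder below.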
# normalizing
normalize: !new:speechbrain.processing.features.InputNormalization
    norm_type: sentence
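# With norm_type "sentence", normalization statistics are computed per
# utterance rather than over the whole corpus.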
# fbanks
compute_features: !new:speechbrain.lobes.features.Fbank
    sample_rate: !ref <sample_rate>
    n_fft: !ref <n_fft>
    n_mels: !ref <n_mels>
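# Fbank turns the 16 kHz waveform into 80-dimensional log-mel filterbank
# features using a 400-sample (25 ms) FFT window.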
############################## models ##########################################
CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
    input_shape: (8, 10, 80)
    num_blocks: 2
    num_layers_per_block: 1
    out_channels: (128, 32)
    kernel_sizes: (5, 5)
    strides: (2, 2)
    residuals: (False, False)
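# The convolutional front end applies two stride-2 blocks, so the time axis is
# downsampled by a factor of 4 before the Conformer. The input_shape above is
# only an example shape used to infer layer dimensions; with 32 output channels
# and the 80 mel bins reduced to 20, the flattened CNN output has
# 32 * 20 = 640 features, matching the Transformer's input_size below.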
Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
    input_size: 640
    tgt_vocab: !ref <output_neurons>
    d_model: !ref <d_model>
    nhead: !ref <nhead>
    num_encoder_layers: !ref <num_encoder_layers>
    num_decoder_layers: !ref <num_decoder_layers>
    d_ffn: !ref <d_ffn>
    dropout: !ref <transformer_dropout>
    activation: !ref <activation>
    conformer_activation: !ref <activation>
    encoder_module: !ref <encoder_module>
    attention_type: !ref <attention_type>
    normalize_before: True
    causal: False
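# Encoder-only setup: num_decoder_layers is 0, so this is a 12-layer Conformer
# encoder with rotary positional embeddings (RoPEMHA) in the attention blocks.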
# We must call an encoder wrapper so the decoder isn't run (we don't have any)
enc: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
    transformer: !ref <Transformer>
back_end_ffn: !new:speechbrain.nnet.containers.Sequential
    input_shape: [null, null, !ref <d_model>]
    linear1: !name:speechbrain.nnet.linear.Linear
        n_neurons: !ref <dnn_neurons>
        bias: True
    bn1: !name:speechbrain.nnet.normalization.BatchNorm1d
    activation: !new:torch.nn.LeakyReLU
    drop: !new:torch.nn.Dropout
        p: 0.15
    linear2: !name:speechbrain.nnet.linear.Linear
        n_neurons: !ref <dnn_neurons>
        bias: True
    bn2: !name:speechbrain.nnet.normalization.BatchNorm1d
    activation2: !new:torch.nn.LeakyReLU
    drop2: !new:torch.nn.Dropout
        p: 0.15
    linear3: !name:speechbrain.nnet.linear.Linear
        n_neurons: !ref <dnn_neurons>
        bias: True
    bn3: !name:speechbrain.nnet.normalization.BatchNorm1d
    activation3: !new:torch.nn.LeakyReLU
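# Back-end: three 1024-unit Linear layers, each followed by BatchNorm1d and
# LeakyReLU; the first two blocks also apply dropout (p = 0.15, mirroring
# <dnn_dropout>).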
ctc_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dnn_neurons>
    n_neurons: !ref <output_neurons_ctc>
log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True
model: !new:torch.nn.ModuleList
    - [!ref <CNN>, !ref <enc>, !ref <back_end_ffn>, !ref <ctc_lin>]
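# Grouping the trainable blocks in one ModuleList lets the Pretrainer below
# restore all of their parameters from a single "model" checkpoint.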
####################### Encoding & Decoding ###################################
encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
    compute_features: !ref <compute_features>
    normalize: !ref <normalize>
    CNN: !ref <CNN>
    enc: !ref <enc>
    back_end_ffn: !ref <back_end_ffn>
    ctc_lin: !ref <ctc_lin>
    log_softmax: !ref <log_softmax>
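# Full inference pipeline: waveform -> fbanks -> normalization -> CNN ->
# Conformer encoder -> FFNN back-end -> CTC linear -> log-probabilities.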
modules:
    encoder: !ref <encoder>
decoding_function: !name:speechbrain.decoders.ctc_greedy_decode
    blank_id: !ref <blank_index>
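# Greedy CTC decoding: take the most likely unit at every frame, then collapse
# repeated units and drop the blank token.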
tokenizer: !new:sentencepiece.SentencePieceProcessor
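# The tokenizer is instantiated empty here; the actual SentencePiece model file
# is loaded by the pretrainer below.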
# Pretrainer class
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        model: !ref <model>
        normalize: !ref <normalize>
        tokenizer: !ref <tokenizer>
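
# Minimal usage sketch (an assumption, not part of this file): an hparams file
# with this layout is typically consumed through SpeechBrain's EncoderASR
# interface; the source path below is a placeholder.
#
#   from speechbrain.inference.ASR import EncoderASR
#
#   asr = EncoderASR.from_hparams(
#       source="path/to/pretrained_model",  # folder or HF repo containing this yaml
#       savedir="pretrained_model",
#   )
#   print(asr.transcribe_file("example.wav"))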