# ################################
# Model: bestRQ + DNN + CTC
# Authors: Ryan Whetten 2025
# ################################
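# This file is loaded with HyperPyYAML; the blocks below define the full
# waveform-to-CTC-log-probabilities inference pipeline and the pretrainer
# that fetches its weights.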


####################### Model Parameters ###############################

# Feature parameters
sample_rate: 16000
n_fft: 400
n_mels: 80
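# n_fft of 400 at 16 kHz corresponds to a 25 ms analysis window; SpeechBrain's
# Fbank uses a 10 ms hop by default.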

# Transformer
d_model: 640
nhead: 8
num_encoder_layers: 12
num_decoder_layers: 0
d_ffn: 2048
transformer_dropout: 0.1
activation: !name:torch.nn.GELU
output_neurons: 5000
attention_type: RoPEMHA
encoder_module: conformer
dnn_activation: !new:torch.nn.LeakyReLU

# FFNN + output
dnn_neurons: 1024
dnn_dropout: 0.15
output_neurons_ctc: 60
blank_index: 0
bos_index: 1
eos_index: 2
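# blank_index is consumed by the CTC greedy decoder below; the bos/eos indices
# follow the usual SpeechBrain token conventions.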

# Input normalization (per-utterance mean/variance statistics)
normalize: !new:speechbrain.processing.features.InputNormalization
   norm_type: sentence


# Log-mel filterbank (fbank) feature extraction
compute_features: !new:speechbrain.lobes.features.Fbank
   sample_rate: !ref <sample_rate>
   n_fft: !ref <n_fft>
   n_mels: !ref <n_mels>
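# e.g. a batch of waveforms of shape (batch, samples) yields features of shape
# (batch, frames, 80), with roughly one frame per 10 ms of audio.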

############################## Models ##########################################

CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
   input_shape: (8, 10, 80)
   num_blocks: 2
   num_layers_per_block: 1
   out_channels: (128, 32)
   kernel_sizes: (5, 5)
   strides: (2, 2)
   residuals: (False, False)
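# The two stride-2 blocks downsample time by a factor of 4 and reduce the 80
# mel bins to 20; flattened with the 32 output channels this gives
# 32 * 20 = 640 features, matching the Transformer input_size below.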

Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
   input_size: 640
   tgt_vocab: !ref <output_neurons>
   d_model: !ref <d_model>
   nhead: !ref <nhead>
   num_encoder_layers: !ref <num_encoder_layers>
   num_decoder_layers: !ref <num_decoder_layers>
   d_ffn: !ref <d_ffn>
   dropout: !ref <transformer_dropout>
   activation: !ref <activation>
   conformer_activation: !ref <activation>
   encoder_module: !ref <encoder_module>
   attention_type: !ref <attention_type>
   normalize_before: True
   causal: False

# Wrap the transformer in an encoder wrapper so that only the encoder is run
# (num_decoder_layers is 0, so there is no decoder to call)
enc: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
   transformer: !ref <Transformer>

back_end_ffn: !new:speechbrain.nnet.containers.Sequential
   input_shape: [null, null, !ref <d_model>]
   linear1: !name:speechbrain.nnet.linear.Linear
      n_neurons: !ref <dnn_neurons>
      bias: True
   bn1: !name:speechbrain.nnet.normalization.BatchNorm1d
   activation: !new:torch.nn.LeakyReLU
   drop: !new:torch.nn.Dropout
      p: !ref <dnn_dropout>
   linear2: !name:speechbrain.nnet.linear.Linear
      n_neurons: !ref <dnn_neurons>
      bias: True
   bn2: !name:speechbrain.nnet.normalization.BatchNorm1d
   activation2: !new:torch.nn.LeakyReLU
   drop2: !new:torch.nn.Dropout
      p: !ref <dnn_dropout>
   linear3: !name:speechbrain.nnet.linear.Linear
      n_neurons: !ref <dnn_neurons>
      bias: True
   bn3: !name:speechbrain.nnet.normalization.BatchNorm1d
   activation3: !new:torch.nn.LeakyReLU
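# Three Linear -> BatchNorm -> LeakyReLU stages (dropout after the first two)
# project the 640-d encoder output to 1024 units for the CTC layer.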

ctc_lin: !new:speechbrain.nnet.linear.Linear
   input_size: !ref <dnn_neurons>
   n_neurons: !ref <output_neurons_ctc>

log_softmax: !new:speechbrain.nnet.activations.Softmax
   apply_log: True


model: !new:torch.nn.ModuleList
   - [!ref <CNN>, !ref <enc>, !ref <back_end_ffn>, !ref <ctc_lin>]
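# All parameterized components gathered in one container, so the pretrainer can
# restore their weights through a single "model" checkpoint entry.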

####################### Encoding & Decoding ###################################

encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
   compute_features: !ref <compute_features>
   normalize: !ref <normalize>
   CNN: !ref <CNN>
   enc: !ref <enc>
   back_end_ffn: !ref <back_end_ffn>
   ctc_lin: !ref <ctc_lin>
   log_softmax: !ref <log_softmax>
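# End-to-end inference pipeline: waveform -> log-mel features -> normalization
# -> CNN frontend -> Conformer encoder -> DNN head -> CTC log-probabilities.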

modules:
   encoder: !ref <encoder>

decoding_function: !name:speechbrain.decoders.ctc_greedy_decode
   blank_id: !ref <blank_index>
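# A minimal sketch of how this is applied at inference (log_probs and wav_lens
# come from the encoder pipeline above):
#   hyps = ctc_greedy_decode(log_probs, wav_lens, blank_id=0)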

tokenizer: !new:sentencepiece.SentencePieceProcessor

# Pretrainer: fetches and loads the pretrained model weights, the normalizer
# statistics, and the SentencePiece tokenizer
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
   loadables:
      model: !ref <model>
      normalize: !ref <normalize>
      tokenizer: !ref <tokenizer>
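

# Example usage (a minimal sketch; the "source" value is a placeholder for
# wherever this model is actually hosted, and speechbrain.inference.ASR is
# speechbrain.pretrained in versions before SpeechBrain 1.0):
#
#   from speechbrain.inference.ASR import EncoderASR
#
#   asr = EncoderASR.from_hparams(
#       source="<user>/<this-repo>",
#       savedir="pretrained_models/bestrq-ctc",
#   )
#   print(asr.transcribe_file("example.wav"))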