# ################################
# Model: bestRQ + DNN + CTC
# Authors: Ryan Whetten 2025
# ################################
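# This file is loaded with HyperPyYAML; the blocks below define the full
# waveform-to-CTC-log-probabilities inference pipeline and the pretrainer
# that fetches its weights.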


####################### Model Parameters ###############################

# Feature parameters
sample_rate: 16000
n_fft: 400
n_mels: 80
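# n_fft of 400 at 16 kHz corresponds to a 25 ms analysis window; SpeechBrain's
# Fbank uses a 10 ms hop by default.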

# Transformer
d_model: 640
nhead: 8
num_encoder_layers: 12
num_decoder_layers: 0
d_ffn: 2048
transformer_dropout: 0.1
activation: !name:torch.nn.GELU
output_neurons: 5000
attention_type: RoPEMHA
encoder_module: conformer
dnn_activation: !new:torch.nn.LeakyReLU

# FFNN + output
dnn_neurons: 1024
dnn_dropout: 0.15
output_neurons_ctc: 60
blank_index: 0
bos_index: 1
eos_index: 2
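# blank_index is consumed by the CTC greedy decoder below; the bos/eos indices
# follow the usual SpeechBrain token conventions.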

# Input normalization (per-utterance mean/variance statistics)
normalize: !new:speechbrain.processing.features.InputNormalization
   norm_type: sentence


# Log-mel filterbank (fbank) feature extraction
compute_features: !new:speechbrain.lobes.features.Fbank
   sample_rate: !ref <sample_rate>
   n_fft: !ref <n_fft>
   n_mels: !ref <n_mels>
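# e.g. a batch of waveforms of shape (batch, samples) yields features of shape
# (batch, frames, 80), with roughly one frame per 10 ms of audio.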

############################## Models ##########################################

CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
   input_shape: (8, 10, 80)
   num_blocks: 2
   num_layers_per_block: 1
   out_channels: (128, 32)
   kernel_sizes: (5, 5)
   strides: (2, 2)
   residuals: (False, False)
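# The two stride-2 blocks downsample time by a factor of 4 and reduce the 80
# mel bins to 20; flattened with the 32 output channels this gives
# 32 * 20 = 640 features, matching the Transformer input_size below.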

Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
   input_size: 640
   tgt_vocab: !ref <output_neurons>
   d_model: !ref <d_model>
   nhead: !ref <nhead>
   num_encoder_layers: !ref <num_encoder_layers>
   num_decoder_layers: !ref <num_decoder_layers>
   d_ffn: !ref <d_ffn>
   dropout: !ref <transformer_dropout>
   activation: !ref <activation>
   conformer_activation: !ref <activation>
   encoder_module: !ref <encoder_module>
   attention_type: !ref <attention_type>
   normalize_before: True
   causal: False

# Wrap the transformer in an encoder wrapper so that only the encoder is run
# (num_decoder_layers is 0, so there is no decoder to call)
enc: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
   transformer: !ref <Transformer>

back_end_ffn: !new:speechbrain.nnet.containers.Sequential
   input_shape: [null, null, !ref <d_model>]
   linear1: !name:speechbrain.nnet.linear.Linear
      n_neurons: !ref <dnn_neurons>
      bias: True
   bn1: !name:speechbrain.nnet.normalization.BatchNorm1d
   activation: !new:torch.nn.LeakyReLU
   drop: !new:torch.nn.Dropout
      p: !ref <dnn_dropout>
   linear2: !name:speechbrain.nnet.linear.Linear
      n_neurons: !ref <dnn_neurons>
      bias: True
   bn2: !name:speechbrain.nnet.normalization.BatchNorm1d
   activation2: !new:torch.nn.LeakyReLU
   drop2: !new:torch.nn.Dropout
      p: !ref <dnn_dropout>
   linear3: !name:speechbrain.nnet.linear.Linear
      n_neurons: !ref <dnn_neurons>
      bias: True
   bn3: !name:speechbrain.nnet.normalization.BatchNorm1d
   activation3: !new:torch.nn.LeakyReLU
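# Three Linear -> BatchNorm -> LeakyReLU stages (dropout after the first two)
# project the 640-d encoder output to 1024 units for the CTC layer.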

ctc_lin: !new:speechbrain.nnet.linear.Linear
   input_size: !ref <dnn_neurons>
   n_neurons: !ref <output_neurons_ctc>

log_softmax: !new:speechbrain.nnet.activations.Softmax
   apply_log: True


model: !new:torch.nn.ModuleList
   - [!ref <CNN>, !ref <enc>, !ref <back_end_ffn>, !ref <ctc_lin>]
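# All parameterized components gathered in one container, so the pretrainer can
# restore their weights through a single "model" checkpoint entry.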

####################### Encoding & Decoding ###################################

encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
   compute_features: !ref <compute_features>
   normalize: !ref <normalize>
   CNN: !ref <CNN>
   enc: !ref <enc>
   back_end_ffn: !ref <back_end_ffn>
   ctc_lin: !ref <ctc_lin>
   log_softmax: !ref <log_softmax>
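# End-to-end inference pipeline: waveform -> log-mel features -> normalization
# -> CNN frontend -> Conformer encoder -> DNN head -> CTC log-probabilities.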

modules:
   encoder: !ref <encoder>

decoding_function: !name:speechbrain.decoders.ctc_greedy_decode
   blank_id: !ref <blank_index>
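# A minimal sketch of how this is applied at inference (log_probs and wav_lens
# come from the encoder pipeline above):
#   hyps = ctc_greedy_decode(log_probs, wav_lens, blank_id=0)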

tokenizer: !new:sentencepiece.SentencePieceProcessor

# Pretrainer: fetches and loads the pretrained model weights, the normalizer
# statistics, and the SentencePiece tokenizer
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
   loadables:
      model: !ref <model>
      normalize: !ref <normalize>
      tokenizer: !ref <tokenizer>
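

# Example usage (a minimal sketch; the "source" value is a placeholder for
# wherever this model is actually hosted, and speechbrain.inference.ASR is
# speechbrain.pretrained in versions before SpeechBrain 1.0):
#
#   from speechbrain.inference.ASR import EncoderASR
#
#   asr = EncoderASR.from_hparams(
#       source="<user>/<this-repo>",
#       savedir="pretrained_models/bestrq-ctc",
#   )
#   print(asr.transcribe_file("example.wav"))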