Update models

- asr.ckpt (+2 -2)
- hyperparams.yaml (+16 -19)
- lm.ckpt (+2 -2)
- normalizer.ckpt (+2 -2)
- tokenizer.ckpt (+2 -2)
asr.ckpt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:8e718dc29b403dfaa8d2604c43c3666be3fa99e958b77e3c6ff387e94d4a174c
+size 184546287
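Each checkpoint in this repo is stored as a git-lfs pointer, so the diff above only swaps the blob's oid and size (the previous values are truncated in this view). A minimal stdlib-only Python sketch of checking a fetched blob against such a pointer; the local file paths are hypothetical, not part of this commit:

import hashlib
from pathlib import Path

def parse_lfs_pointer(pointer_path):
    # An LFS pointer is three "key value" lines: version, oid, size.
    fields = {}
    for line in Path(pointer_path).read_text().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

def blob_matches_pointer(pointer_path, blob_path):
    # True when the blob's byte length and sha256 digest both match the pointer.
    fields = parse_lfs_pointer(pointer_path)
    blob = Path(blob_path).read_bytes()
    expected_oid = fields["oid"].split(":", 1)[1]  # drop the "sha256:" prefix
    return (len(blob) == int(fields["size"])
            and hashlib.sha256(blob).hexdigest() == expected_oid)

# Hypothetical usage: the pointer as checked into git vs. the blob git-lfs fetched.
# blob_matches_pointer("asr.ckpt.pointer", "asr.ckpt")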
hyperparams.yaml CHANGED
@@ -29,7 +29,6 @@ vocab_size: 5000
 
 # Outputs
 blank_index: 0
-label_smoothing: 0.1
 pad_index: 0
 bos_index: 1
 eos_index: 2
@@ -38,10 +37,8 @@ unk_index: 0
 # Decoding parameters
 min_decode_ratio: 0.0
 max_decode_ratio: 1.0
-
-
-test_beam_size: 60
-lm_weight: 0.20
+test_beam_size: 10
+lm_weight: 0.0
 ctc_weight_decode: 0.40
 
 ############################## models ################################
@@ -51,15 +48,15 @@ normalizer: !new:speechbrain.processing.features.InputNormalization
 
 CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
     input_shape: (8, 10, 80)
-    num_blocks:
+    num_blocks: 3
     num_layers_per_block: 1
-    out_channels: (64,
-    kernel_sizes: (
-    strides: (2, 2)
-    residuals: (False, False)
-
+    out_channels: (64, 64, 64)
+    kernel_sizes: (5, 5, 1)
+    strides: (2, 2, 1)
+    residuals: (False, False, True)
+
 Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
-    input_size:
+    input_size: 1280
     tgt_vocab: !ref <output_neurons>
     d_model: !ref <d_model>
     nhead: !ref <nhead>
@@ -106,11 +103,14 @@ decoder: !new:speechbrain.decoders.S2STransformerBeamSearch
     ctc_weight: !ref <ctc_weight_decode>
     lm_weight: !ref <lm_weight>
     lm_modules: !ref <lm_model>
-    temperature: 1.
-    temperature_lm: 1.
+    temperature: 1.30
+    temperature_lm: 1.30
     using_eos_threshold: False
     length_normalization: True
 
+log_softmax: !new:torch.nn.LogSoftmax
+    dim: -1
+
 Tencoder: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
     transformer: !ref <Transformer>
 
@@ -122,11 +122,7 @@ encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
     transformer_encoder: !ref <Tencoder>
 
 asr_model: !new:torch.nn.ModuleList
-    - [!ref <
-
-log_softmax: !new:torch.nn.LogSoftmax
-    dim: -1
-
+    - [!ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
 
 compute_features: !new:speechbrain.lobes.features.Fbank
     sample_rate: !ref <sample_rate>
@@ -142,6 +138,7 @@ modules:
     lm_model: !ref <lm_model>
     encoder: !ref <encoder>
     decoder: !ref <decoder>
+
 # The pretrainer allows a mapping between pretrained files and instances that
 # are declared in the yaml.
 pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
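Net effect of the hyperparams.yaml changes: decoding gets lighter (test_beam_size drops from 60 to 10, lm_weight from 0.20 to 0.0, i.e. no LM shallow fusion by default), the ConvolutionFrontEnd is fully specified as three blocks whose flattened output feeds the Transformer with input_size 1280 (64 channels x 20 feature bins, after the two stride-2 blocks downsample the 80 mel bins by 4), the log_softmax block moves up beside the decoder definition, and the asr_model ModuleList now lists CNN, Transformer, seq_lin and ctc_lin. A minimal sketch of loading the updated checkpoints through SpeechBrain's pretrained interface; the repo ID and audio path are placeholders, not taken from this commit:

from speechbrain.pretrained import EncoderDecoderASR

# The Pretrainer at the end of hyperparams.yaml maps asr.ckpt, lm.ckpt,
# normalizer.ckpt and tokenizer.ckpt onto the instances declared above;
# from_hparams drives that fetch-and-load step.
asr_model = EncoderDecoderASR.from_hparams(
    source="<namespace>/<this-repo>",  # hypothetical Hugging Face repo ID
    savedir="pretrained_models/asr-transformer",
)

# Beam search now runs with test_beam_size 10 and lm_weight 0.0.
print(asr_model.transcribe_file("example.wav"))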
lm.ckpt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:7f0b49d5e1f9894c0c9f2ec21c8658da8e1a07f509b807e8624450ba19ea667c
+size 381072461
normalizer.ckpt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:1da2ced935d955c014177591249e5db497d0c5dc7143e64378da0cb5590fe77a
+size 1703
tokenizer.ckpt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:d419e55734c26df6c5690671be2b887a7db389c1a7f63286111ce737508c6569
+size 313900
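In standard SpeechBrain recipes, tokenizer.ckpt is a SentencePiece model (hyperparams.yaml declares vocab_size: 5000); the diff itself only shows an LFS pointer, so that layout is an assumption here. A minimal sketch of inspecting the updated tokenizer directly:

import sentencepiece as spm

# Assumes tokenizer.ckpt is a SentencePiece model file, as in typical
# SpeechBrain ASR recipes.
sp = spm.SentencePieceProcessor(model_file="tokenizer.ckpt")
print(sp.get_piece_size())                     # expect 5000 per hyperparams.yaml
print(sp.encode("hello world", out_type=str))  # subword pieces for a test string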