# ################################
# Model: BEST-RQ + DNN + CTC
# Authors: Ryan Whetten 2025
# ################################

####################### Model Parameters ###############################

# Feature parameters
sample_rate: 16000
n_fft: 400
n_mels: 80

# Transformer
d_model: 640
nhead: 8
num_encoder_layers: 12
num_decoder_layers: 0
d_ffn: 2048
transformer_dropout: 0.1
activation: !name:torch.nn.GELU
output_neurons: 5000
attention_type: RoPEMHA
encoder_module: conformer
dnn_activation: !new:torch.nn.LeakyReLU

# FFNN + output
dnn_neurons: 1024
dnn_dropout: 0.15
output_neurons_ctc: 36
blank_index: 0
bos_index: 1
eos_index: 2

# normalizing
normalize: !new:speechbrain.processing.features.InputNormalization
    norm_type: sentence

# fbanks
compute_features: !new:speechbrain.lobes.features.Fbank
    sample_rate: !ref <sample_rate>
    n_fft: !ref <n_fft>
    n_mels: !ref <n_mels>

############################## models ##########################################

CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
    input_shape: (8, 10, 80)
    num_blocks: 2
    num_layers_per_block: 1
    out_channels: (128, 32)
    kernel_sizes: (5, 5)
    strides: (2, 2)
    residuals: (False, False)

Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
    input_size: 640
    tgt_vocab: !ref <output_neurons>
    d_model: !ref <d_model>
    nhead: !ref <nhead>
    num_encoder_layers: !ref <num_encoder_layers>
    num_decoder_layers: !ref <num_decoder_layers>
    d_ffn: !ref <d_ffn>
    dropout: !ref <transformer_dropout>
    activation: !ref <activation>
    conformer_activation: !ref <activation>
    encoder_module: !ref <encoder_module>
    attention_type: !ref <attention_type>
    normalize_before: True
    causal: False

# We wrap the transformer in an encoder wrapper so the decoder is never run
# (num_decoder_layers is 0, so there is no decoder to call).
enc: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
    transformer: !ref <Transformer>

back_end_ffn: !new:speechbrain.nnet.containers.Sequential
    input_shape: [null, null, !ref <d_model>]
    linear1: !name:speechbrain.nnet.linear.Linear
        n_neurons: !ref <dnn_neurons>
        bias: True
    bn1: !name:speechbrain.nnet.normalization.BatchNorm1d
    activation: !new:torch.nn.LeakyReLU
    drop: !new:torch.nn.Dropout
        p: !ref <dnn_dropout>
    linear2: !name:speechbrain.nnet.linear.Linear
        n_neurons: !ref <dnn_neurons>
        bias: True
    bn2: !name:speechbrain.nnet.normalization.BatchNorm1d
    activation2: !new:torch.nn.LeakyReLU
    drop2: !new:torch.nn.Dropout
        p: !ref <dnn_dropout>
    linear3: !name:speechbrain.nnet.linear.Linear
        n_neurons: !ref <dnn_neurons>
        bias: True
    bn3: !name:speechbrain.nnet.normalization.BatchNorm1d
    activation3: !new:torch.nn.LeakyReLU

ctc_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dnn_neurons>
    n_neurons: !ref <output_neurons_ctc>

log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

model: !new:torch.nn.ModuleList
    - [!ref <CNN>, !ref <enc>, !ref <back_end_ffn>, !ref <ctc_lin>]

####################### Encoding & Decoding ###################################

encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
    compute_features: !ref <compute_features>
    normalize: !ref <normalize>
    CNN: !ref <CNN>
    enc: !ref <enc>
    back_end_ffn: !ref <back_end_ffn>
    ctc_lin: !ref <ctc_lin>
    log_softmax: !ref <log_softmax>

modules:
    encoder: !ref <encoder>

decoding_function: !name:speechbrain.decoders.ctc_greedy_decode
    blank_id: !ref <blank_index>

tokenizer: !new:sentencepiece.SentencePieceProcessor
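
# A minimal usage sketch of the inference graph above, kept as comments so the
# yaml stays valid. Hedged: the file name "hparams.yaml" and the random
# waveform are illustrative assumptions, and real transcriptions also require
# loading the checkpoint via the pretrainer defined below.
#
#   import torch
#   from hyperpyyaml import load_hyperpyyaml
#   from speechbrain.decoders import ctc_greedy_decode
#
#   with open("hparams.yaml") as fin:
#       hparams = load_hyperpyyaml(fin)
#
#   wavs = torch.rand(1, 16000)   # 1 second of (fake) audio at sample_rate
#   wav_lens = torch.ones(1)      # relative lengths in [0, 1]
#   log_probs = hparams["encoder"](wavs, wav_lens)  # CTC log-posteriors
#   hyps = ctc_greedy_decode(log_probs, wav_lens, blank_id=hparams["blank_index"])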

# Decoding parameters (optional beam search with an n-gram LM, disabled by
# default in favor of the greedy decoder above). The LM can be downloaded from
# https://www.openslr.org/11/ or trained with KenLM. It can be either a .bin or
# a .arpa file; note that .arpa is much slower to load. If you don't want to
# use an LM, leave kenlm_model_path as null.
# kenlm_model_path: null
# test_beam_search:
#     beam_size: 200
#     topk: 1
#     blank_index: !ref <blank_index>
#     space_token: ' ' # make sure this is the same as the one used in the tokenizer
#     beam_prune_logp: -10.0
#     token_prune_min_logp: -5.0
#     prune_history: True
#     alpha: 0.8
#     beta: 1.2
#     kenlm_model_path: !ref <kenlm_model_path>

# Pretrainer class: fetches and loads the pretrained model, the normalizer
# statistics, and the SentencePiece tokenizer.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        model: !ref <model>
        normalize: !ref <normalize>
        tokenizer: !ref <tokenizer>

# make_tokenizer_streaming_context: !name:speechbrain.tokenizers.SentencePiece.SentencePieceDecoderStreamingContext
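
# Hedged sketch: since this file defines `modules`, `tokenizer`,
# `decoding_function`, and `pretrainer`, it matches the layout expected by
# SpeechBrain's EncoderASR inference interface. The source folder, yaml file
# name, savedir, and audio file below are assumptions, not part of this recipe.
#
#   from speechbrain.inference.ASR import EncoderASR
#
#   asr = EncoderASR.from_hparams(
#       source="path/to/model_folder",        # hypothetical local dir or HF repo
#       hparams_file="inference.yaml",        # this file's assumed name
#       savedir="pretrained_models/brq-ctc",  # hypothetical cache location
#   )
#   print(asr.transcribe_file("example.wav"))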