asr-fon-with-diacritics / hyperparams.yaml

Upload folder using huggingface_hub

a4a6dd1 verified about 1 month ago

3.87 kB

	# ################################
	# Model: bestRQ + DNN + CTC
	# Authors: Ryan Whetten 2025
	# ################################


	####################### Model Parameters ###############################

	# Acoustic front-end settings. The Fbank feature extractor (compute_features)
	# reads all three of these through !ref.
	sample_rate: 16000  # audio sampling rate in Hz
	n_fft: 400          # FFT size in samples (400 / 16000 = 25 ms)
	n_mels: 80          # number of Mel filterbank channels

	# Transformer hyperparameters. Except where noted, each value is consumed
	# by the Transformer block further down via !ref.
	encoder_module: conformer
	attention_type: RoPEMHA
	d_model: 640
	nhead: 8
	d_ffn: 2048
	num_encoder_layers: 12
	num_decoder_layers: 0  # no decoder layers: the model is encoder-only
	transformer_dropout: 0.1
	# !name passes the class itself (not an instance); it is used for both the
	# `activation` and `conformer_activation` arguments of the Transformer.
	activation: !name:torch.nn.GELU
	# Passed to the Transformer as tgt_vocab.
	output_neurons: 5000
	# NOTE(review): no !ref in the visible file points at dnn_activation;
	# back_end_ffn instantiates torch.nn.LeakyReLU directly.
	dnn_activation: !new:torch.nn.LeakyReLU

	# Feed-forward back-end and CTC output layer
	# Width of every Linear layer in back_end_ffn and the input size of ctc_lin.
	dnn_neurons: 1024
	# Same value as the p of the Dropout layers in back_end_ffn.
	dnn_dropout: 0.15
	# Output size of ctc_lin.
	output_neurons_ctc: 60
	# Passed to the greedy CTC decoder as blank_id.
	blank_index: 0
	# NOTE(review): bos_index / eos_index are not referenced by any !ref in the
	# visible file — presumably kept for parity with the training recipe.
	bos_index: 1
	eos_index: 2

	# Input normalization module.
	# norm_type is a constructor argument of InputNormalization, so it has to be
	# nested under the !new tag rather than sit beside it as a top-level key.
	normalize: !new:speechbrain.processing.features.InputNormalization
	    norm_type: sentence


	# Filterbank feature extractor.
	# The three arguments are nested so that they are passed to Fbank; left at
	# the top level they would redefine sample_rate / n_fft / n_mels as
	# references to themselves.
	compute_features: !new:speechbrain.lobes.features.Fbank
	    sample_rate: !ref <sample_rate>
	    n_fft: !ref <n_fft>
	    n_mels: !ref <n_mels>

	############################## models ##########################################

	# Convolutional front-end: two blocks of one layer each.
	# All keys below are constructor arguments of ConvolutionFrontEnd and are
	# therefore nested under the !new tag. The last entry of input_shape (80)
	# equals n_mels.
	CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
	    input_shape: (8, 10, 80)
	    num_blocks: 2
	    num_layers_per_block: 1
	    out_channels: (128, 32)
	    kernel_sizes: (5, 5)
	    strides: (2, 2)
	    residuals: (False, False)

	# Conformer encoder built through TransformerASR.
	# Every key below is a constructor argument and must be nested under the
	# !new tag; flattened, they would overwrite the top-level hyperparameters
	# of the same name (d_model, nhead, activation, ...).
	Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
	    # NOTE(review): 640 presumably is the flattened CNN output size
	    # (32 channels x 20 frequency bins after two stride-2 blocks) and only
	    # coincidentally equals d_model — confirm before changing either.
	    input_size: 640
	    tgt_vocab: !ref <output_neurons>
	    d_model: !ref <d_model>
	    nhead: !ref <nhead>
	    num_encoder_layers: !ref <num_encoder_layers>
	    num_decoder_layers: !ref <num_decoder_layers>
	    d_ffn: !ref <d_ffn>
	    dropout: !ref <transformer_dropout>
	    activation: !ref <activation>
	    conformer_activation: !ref <activation>
	    encoder_module: !ref <encoder_module>
	    attention_type: !ref <attention_type>
	    normalize_before: True
	    causal: False

	# Wrap the Transformer in an EncoderWrapper so that only the encoder is run
	# (num_decoder_layers is 0, so there is no decoder to call).
	# `transformer` is the wrapper's constructor argument and is nested under it.
	enc: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
	    transformer: !ref <Transformer>

	# Feed-forward back-end applied to the encoder output:
	# three Linear -> BatchNorm1d -> LeakyReLU stages, with Dropout after the
	# first two.
	# Nesting has two levels: layer names are arguments of Sequential, and
	# n_neurons / bias / p are arguments of the layer directly above them.
	# Layer names are kept unchanged because they are the keys the Sequential
	# container registers its sub-modules under.
	back_end_ffn: !new:speechbrain.nnet.containers.Sequential
	    input_shape: [null, null, !ref <d_model>]
	    linear1: !name:speechbrain.nnet.linear.Linear
	        n_neurons: !ref <dnn_neurons>
	        bias: True
	    bn1: !name:speechbrain.nnet.normalization.BatchNorm1d
	    activation: !new:torch.nn.LeakyReLU
	    drop: !new:torch.nn.Dropout
	        # dnn_dropout is 0.15, the value previously hard-coded here.
	        p: !ref <dnn_dropout>
	    linear2: !name:speechbrain.nnet.linear.Linear
	        n_neurons: !ref <dnn_neurons>
	        bias: True
	    bn2: !name:speechbrain.nnet.normalization.BatchNorm1d
	    activation2: !new:torch.nn.LeakyReLU
	    drop2: !new:torch.nn.Dropout
	        p: !ref <dnn_dropout>
	    linear3: !name:speechbrain.nnet.linear.Linear
	        n_neurons: !ref <dnn_neurons>
	        bias: True
	    bn3: !name:speechbrain.nnet.normalization.BatchNorm1d
	    activation3: !new:torch.nn.LeakyReLU

	# Linear projection from the back-end width (dnn_neurons) to the CTC output
	# size (output_neurons_ctc). Both arguments are nested under the !new tag.
	ctc_lin: !new:speechbrain.nnet.linear.Linear
	    input_size: !ref <dnn_neurons>
	    n_neurons: !ref <output_neurons_ctc>

	# Softmax with apply_log enabled, i.e. the layer outputs log-probabilities.
	# apply_log is a constructor argument and is nested under the !new tag.
	log_softmax: !new:speechbrain.nnet.activations.Softmax
	    apply_log: True


	# Single container holding the four modules listed below; it is the object
	# the pretrainer loads under the name `model`.
	# The list is the positional argument of ModuleList, so it is nested under
	# the !new tag.
	model: !new:torch.nn.ModuleList
	    - [!ref <CNN>, !ref <enc>, !ref <back_end_ffn>, !ref <ctc_lin>]

	####################### Encoding & Decoding ###################################

	# Full inference pipeline, listed in the order the stages are chained:
	# features -> normalization -> CNN -> encoder -> FFNN -> CTC projection ->
	# log-softmax.
	# Each entry is a named argument of LengthsCapableSequential and must be
	# nested; flattened, these keys would redefine the top-level objects
	# (normalize, CNN, enc, ...) as references to themselves.
	encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
	    compute_features: !ref <compute_features>
	    normalize: !ref <normalize>
	    CNN: !ref <CNN>
	    enc: !ref <enc>
	    back_end_ffn: !ref <back_end_ffn>
	    ctc_lin: !ref <ctc_lin>
	    log_softmax: !ref <log_softmax>

	# Mapping of module names exposed by this config; it contains the single
	# entry `encoder`, nested under `modules` so that it does not overwrite the
	# top-level `encoder` definition.
	modules:
	    encoder: !ref <encoder>

	# Greedy CTC decoding. !name yields a callable rather than an instance;
	# blank_id is nested so that it is pre-bound to that callable with the
	# value of blank_index.
	decoding_function: !name:speechbrain.decoders.ctc_greedy_decode
	    blank_id: !ref <blank_index>

	# SentencePiece tokenizer, constructed here with no arguments. It is listed
	# in the pretrainer's loadables (presumably its model file is loaded there
	# — confirm against the pretrainer's behaviour).
	tokenizer: !new:sentencepiece.SentencePieceProcessor

	# Pretrainer: declares which objects get pretrained parameters.
	# `loadables` is the constructor argument, and the three name -> object
	# pairs belong inside it; flattened, they would overwrite the top-level
	# model / normalize / tokenizer definitions with references to themselves.
	pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
	    loadables:
	        model: !ref <model>
	        normalize: !ref <normalize>
	        tokenizer: !ref <tokenizer>