asr-fon-without-diacritics / hyperparams.yaml

Upload folder using huggingface_hub

93a0f4f verified about 1 month ago

5.77 kB

	# ################################
	# Model: bestRQ + DNN + CTC
	# Authors: Ryan Whetten 2025
	# ################################


	####################### Model Parameters ###############################

	# Feature parameters
	# These three scalars are consumed via !ref by the `compute_features` (Fbank) entry.
	sample_rate: 16000
	n_fft: 400
	n_mels: 80

	# Transformer
	# Scalars consumed via !ref by the `Transformer` (TransformerASR) entry further down.
	d_model: 640  # also used as the last dimension of back_end_ffn's input_shape
	nhead: 8
	num_encoder_layers: 12
	num_decoder_layers: 0  # no decoder layers; the model is run through EncoderWrapper only
	d_ffn: 2048
	transformer_dropout: 0.1
	activation: !name:torch.nn.GELU  # referenced for both `activation` and `conformer_activation`
	output_neurons: 5000  # passed to the Transformer as `tgt_vocab`
	attention_type: RoPEMHA
	encoder_module: conformer
	dnn_activation: !new:torch.nn.LeakyReLU  # NOTE(review): not referenced anywhere in the visible file

	# FFNN + output
	dnn_neurons: 1024  # n_neurons of each Linear in back_end_ffn and input_size of ctc_lin
	dnn_dropout: 0.15  # presumably the dropout probability for back_end_ffn (matches its Dropout values)
	output_neurons_ctc: 36  # n_neurons of the ctc_lin output layer
	blank_index: 0  # passed to the greedy CTC decoder as blank_id
	bos_index: 1  # only referenced from commented-out tokenizer settings in this file
	eos_index: 2  # only referenced from commented-out tokenizer settings in this file

	# Input normalization.
	# `norm_type` is a constructor argument, so it is nested under the `!new:`
	# entry; at top level it would be a stray key and never reach the class.
	normalize: !new:speechbrain.processing.features.InputNormalization
	    norm_type: sentence


	# Filterbank (Fbank) feature extraction.
	# The arguments are nested so they are passed to Fbank and resolve to the
	# top-level scalars of the same name. Left at top level they would be
	# self-referencing duplicates of those scalars.
	compute_features: !new:speechbrain.lobes.features.Fbank
	    sample_rate: !ref <sample_rate>
	    n_fft: !ref <n_fft>
	    n_mels: !ref <n_mels>

	############################## models ##########################################

	# Convolutional front end: two blocks with one layer each, stride 2 per block.
	# All keys below are constructor arguments and must be nested under `CNN`.
	# NOTE(review): the last input_shape dimension (80) matches n_mels — confirm
	# it should track that value.
	CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
	    input_shape: (8, 10, 80)
	    num_blocks: 2
	    num_layers_per_block: 1
	    out_channels: (128, 32)
	    kernel_sizes: (5, 5)
	    strides: (2, 2)
	    residuals: (False, False)

	# Conformer encoder built through TransformerASR.
	# Arguments are nested under the entry so they are passed to the constructor
	# instead of colliding with the top-level scalars of the same name
	# (d_model, nhead, activation, ...).
	Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
	    # NOTE(review): presumably the flattened CNN output size
	    # (32 channels x 80/4 mel bins = 640); it only coincides with d_model — confirm.
	    input_size: 640
	    tgt_vocab: !ref <output_neurons>
	    d_model: !ref <d_model>
	    nhead: !ref <nhead>
	    num_encoder_layers: !ref <num_encoder_layers>
	    num_decoder_layers: !ref <num_decoder_layers>
	    d_ffn: !ref <d_ffn>
	    dropout: !ref <transformer_dropout>
	    activation: !ref <activation>
	    conformer_activation: !ref <activation>
	    encoder_module: !ref <encoder_module>
	    attention_type: !ref <attention_type>
	    normalize_before: True
	    causal: False

	# We must call an encoder wrapper so the decoder isn't run (we don't have any:
	# num_decoder_layers is 0). `transformer` is the wrapper's constructor argument
	# and is therefore nested under the entry.
	enc: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
	    transformer: !ref <Transformer>

	# Feed-forward back end applied to the encoder output: three
	# Linear -> BatchNorm1d -> LeakyReLU stages with Dropout after the first two.
	# Each layer's arguments are nested under that layer. Left flat, the repeated
	# `n_neurons` / `bias` / `p` keys would collide and never reach the layers.
	# The dropout probability references `dnn_dropout` (0.15) instead of
	# repeating the literal, so the value is defined in one place.
	back_end_ffn: !new:speechbrain.nnet.containers.Sequential
	    input_shape: [null, null, !ref <d_model>]
	    linear1: !name:speechbrain.nnet.linear.Linear
	        n_neurons: !ref <dnn_neurons>
	        bias: True
	    bn1: !name:speechbrain.nnet.normalization.BatchNorm1d
	    activation: !new:torch.nn.LeakyReLU
	    drop: !new:torch.nn.Dropout
	        p: !ref <dnn_dropout>
	    linear2: !name:speechbrain.nnet.linear.Linear
	        n_neurons: !ref <dnn_neurons>
	        bias: True
	    bn2: !name:speechbrain.nnet.normalization.BatchNorm1d
	    activation2: !new:torch.nn.LeakyReLU
	    drop2: !new:torch.nn.Dropout
	        p: !ref <dnn_dropout>
	    linear3: !name:speechbrain.nnet.linear.Linear
	        n_neurons: !ref <dnn_neurons>
	        bias: True
	    bn3: !name:speechbrain.nnet.normalization.BatchNorm1d
	    activation3: !new:torch.nn.LeakyReLU

	# Output projection from the back-end width (dnn_neurons) to the CTC
	# output size (output_neurons_ctc). Arguments are nested so they are
	# passed to the Linear constructor.
	ctc_lin: !new:speechbrain.nnet.linear.Linear
	    input_size: !ref <dnn_neurons>
	    n_neurons: !ref <output_neurons_ctc>

	# Softmax with apply_log enabled; `apply_log` is nested so it is passed
	# to the Softmax constructor.
	log_softmax: !new:speechbrain.nnet.activations.Softmax
	    apply_log: True


	# modules:
	# normalize: !ref <normalize>
	# CNN: !ref <CNN>
	# enc: !ref <enc>
	# back_end_ffn: !ref <back_end_ffn>
	# ctc_lin: !ref <ctc_lin>

	# Trainable modules grouped in one ModuleList; this is the object the
	# pretrainer lists under the `model` loadable.
	model: !new:torch.nn.ModuleList
	    - [!ref <CNN>, !ref <enc>, !ref <back_end_ffn>, !ref <ctc_lin>]

	####################### Encoding & Decoding ###################################

	# Inference pipeline, in the order listed: features -> normalization -> CNN ->
	# encoder -> feed-forward back end -> CTC projection -> log-softmax.
	# The stages are nested as keyword arguments. Left flat, each line would be a
	# top-level key referencing itself.
	encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
	    compute_features: !ref <compute_features>
	    normalize: !ref <normalize>
	    CNN: !ref <CNN>
	    enc: !ref <enc>
	    back_end_ffn: !ref <back_end_ffn>
	    ctc_lin: !ref <ctc_lin>
	    log_softmax: !ref <log_softmax>

	# Modules mapping; it holds the full `encoder` pipeline as its only entry.
	# The entry is nested so `modules` is a mapping rather than an empty key
	# followed by a self-referencing top-level `encoder`.
	modules:
	    encoder: !ref <encoder>

	# Greedy CTC decoding with the blank id bound to `blank_index`.
	# `blank_id` is nested so it is bound to the function by `!name:`.
	decoding_function: !name:speechbrain.decoders.ctc_greedy_decode
	    blank_id: !ref <blank_index>

	# SentencePiece tokenizer created with no constructor arguments; it is
	# listed among the pretrainer's loadables below.
	tokenizer: !new:sentencepiece.SentencePieceProcessor


	# beam_size: 100
	# beam_prune_logp: -12.0
	# token_prune_min_logp: -1.2
	# prune_history: False

	# test_beam_search:
	# blank_index: !ref <blank_index>
	# beam_size: !ref <beam_size>
	# beam_prune_logp: !ref <beam_prune_logp>
	# token_prune_min_logp: !ref <token_prune_min_logp>
	# prune_history: !ref <prune_history>

	# model_dir: '/Users/ryanwhetten/Projects/stream_asr/models/brq_ls_960/1000_bpe.model'
	# text_file: '/Users/ryanwhetten/Projects/stream_asr/models/brq_ls_960/train.txt'
	# vocab_size: !ref <output_neurons_ctc>
	# model_type: 'bpe'
	# bos_id: !ref <bos_index>
	# eos_id: !ref <eos_index>


	# model_dir=hparams["save_folder"],
	# vocab_size=hparams["output_neurons_ctc"],
	# annotation_train=hparams["train_csv"],
	# annotation_read="wrd",
	# model_type=hparams["token_type"],
	# character_coverage=hparams["character_coverage"],
	# bos_id=hparams["bos_index"],
	# eos_id=hparams["eos_index"],

	# kenlm_model_path: null

	# # Decoding parameters
	# test_beam_search:
	# beam_size: 200
	# topk: 1
	# blank_index: !ref <blank_index>
	# space_token: ' ' # make sure this is the same as the one used in the tokenizer
	# beam_prune_logp: -10.0
	# token_prune_min_logp: -5.0
	# prune_history: True
	# alpha: 0.8
	# beta: 1.2
	# # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM
	# # It can either be a .bin or .arpa ; note: .arpa is much slower at loading
	# # If you don't want to use an LM, comment it out or set it to null
	# kenlm_model_path: !ref <kenlm_model_path>

	# Pretrainer class
	# Declares the objects whose parameters are loaded: the model ModuleList,
	# the input normalizer and the tokenizer. `loadables` is an argument of the
	# Pretrainer and its entries form a mapping, so both levels are nested.
	pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
	    loadables:
	        model: !ref <model>
	        normalize: !ref <normalize>
	        tokenizer: !ref <tokenizer>

	# make_tokenizer_streaming_context: !name:speechbrain.tokenizers.SentencePiece.SentencePieceDecoderStreamingContext