nvidia
/

canary-qwen-2.5b

Automatic Speech Recognition

hf-asr-leaderboard

Model card Files Files and versions

canary-qwen-2.5b / config.json

piotrzelasko's picture

Upload folder using huggingface_hub

a35ed36 verified 4 months ago

history blame contribute delete

2.38 kB

	{
	"audio_locator_tag": "<\|audioplaceholder\|>",
	"freeze_params": [
	"^llm\\..+$",
	"^embed_tokens\\..+$"
	],
	"lora": {
	"lora_alpha": 256,
	"lora_dropout": 0.01,
	"r": 128,
	"target_modules": [
	"q_proj",
	"v_proj"
	],
	"task_type": "CAUSAL_LM"
	},
	"lr_scheduler": {
	"_target_": "nemo.core.optim.lr_scheduler.CosineAnnealing",
	"max_steps": 100000,
	"min_lr": 1e-06,
	"warmup_steps": 1000
	},
	"optimizer": {
	"_target_": "torch.optim.AdamW",
	"betas": [
	0.9,
	0.98
	],
	"foreach": true,
	"lr": 0.0005,
	"weight_decay": 0.001
	},
	"perception": {
	"encoder": {
	"_target_": "nemo.collections.asr.modules.ConformerEncoder",
	"att_context_size": [
	-1,
	-1
	],
	"causal_downsampling": false,
	"conv_context_size": null,
	"conv_kernel_size": 9,
	"conv_norm_type": "batch_norm",
	"d_model": 1024,
	"dropout": 0.1,
	"dropout_att": 0.1,
	"dropout_emb": 0.0,
	"dropout_pre_encoder": 0.1,
	"feat_in": 128,
	"feat_out": -1,
	"ff_expansion_factor": 4,
	"n_heads": 8,
	"n_layers": 32,
	"pos_emb_max_len": 5000,
	"reduction": null,
	"reduction_factor": 1,
	"reduction_position": null,
	"self_attention_model": "rel_pos",
	"subsampling": "dw_striding",
	"subsampling_conv_channels": 256,
	"subsampling_factor": 8,
	"untie_biases": true,
	"xscaling": false
	},
	"modality_adapter": {
	"_target_": "nemo.collections.speechlm2.modules.perception.IdentityConnector",
	"d_model": 1024
	},
	"output_dim": 2048,
	"preprocessor": {
	"_target_": "nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor",
	"dither": 1e-05,
	"features": 128,
	"frame_splicing": 1,
	"log": true,
	"n_fft": 512,
	"normalize": "per_feature",
	"pad_to": 0,
	"pad_value": 0.0,
	"sample_rate": 16000,
	"window": "hann",
	"window_size": 0.025,
	"window_stride": 0.01
	},
	"target": "nemo.collections.speechlm2.modules.perception.AudioPerceptionModule"
	},
	"pretrained_asr": "nvidia/canary-1b-flash",
	"pretrained_llm": "Qwen/Qwen3-1.7B",
	"pretrained_weights": false,
	"prevent_freeze_params": [
	"^.+\\.lora_.+$"
	],
	"prompt_format": "qwen",
	"torch_dtype": "bfloat16"
	}