| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						import torch | 
					
					
						
						| 
							 | 
						import torch.nn as nn | 
					
					
						
						| 
							 | 
						from transformers import AutoConfig, AutoModel, PretrainedConfig, PreTrainedModel | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						class SpeechMultimodalProjectorConfig(PretrainedConfig): | 
					
					
						
						| 
							 | 
						    """Configuration for speech multimodal projector.""" | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    model_type = "speech_mm_projector" | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    def __init__(self, speech_mm_projector_type: str = None, **kwargs): | 
					
					
						
						| 
							 | 
						        super().__init__() | 
					
					
						
						| 
							 | 
						        self.speech_mm_projector_type = speech_mm_projector_type | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						class SpeechMultimodalProjector(PreTrainedModel): | 
					
					
						
						| 
							 | 
						    """Speech multimodal projector for mapping speech features to LLM space.""" | 
					
					
						
						| 
							 | 
						    config_class = SpeechMultimodalProjectorConfig | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    def __init__(self, speech_mm_projector_cfg: SpeechMultimodalProjectorConfig, config: PretrainedConfig): | 
					
					
						
						| 
							 | 
						        super().__init__(speech_mm_projector_cfg) | 
					
					
						
						| 
							 | 
						        if hasattr(config, "speech_mm_projector"): | 
					
					
						
						| 
							 | 
						            speech_mm_projector_type = config.speech_mm_projector | 
					
					
						
						| 
							 | 
						        else: | 
					
					
						
						| 
							 | 
						            speech_mm_projector_type = speech_mm_projector_cfg.speech_mm_projector_type | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        if speech_mm_projector_type == "mlp": | 
					
					
						
						| 
							 | 
						            self.layers = nn.Sequential( | 
					
					
						
						| 
							 | 
						                nn.Linear(config.speech_hidden_size, config.hidden_size), | 
					
					
						
						| 
							 | 
						                nn.GELU(), | 
					
					
						
						| 
							 | 
						                nn.Linear(config.hidden_size, config.hidden_size), | 
					
					
						
						| 
							 | 
						            ) | 
					
					
						
						| 
							 | 
						        elif speech_mm_projector_type == "mlp_downsample": | 
					
					
						
						| 
							 | 
						            self.downsample_block = AudioDownSampleBlock(config.speech_hidden_size) | 
					
					
						
						| 
							 | 
						            self.layers = nn.Sequential( | 
					
					
						
						| 
							 | 
						                nn.Linear(config.speech_hidden_size, config.hidden_size), | 
					
					
						
						| 
							 | 
						                nn.GELU(), | 
					
					
						
						| 
							 | 
						                nn.Linear(config.hidden_size, config.hidden_size), | 
					
					
						
						| 
							 | 
						            ) | 
					
					
						
						| 
							 | 
						        else: | 
					
					
						
						| 
							 | 
						            raise ValueError(f"Unknown projector type: {speech_mm_projector_type}") | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    def forward(self, x, *args, **kwargs): | 
					
					
						
						| 
							 | 
						        if self.speech_mm_projector_type == "mlp_downsample": | 
					
					
						
						| 
							 | 
						            x = self.downsample_block(x) | 
					
					
						
						| 
							 | 
						        return self.layers(x) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
# Register with the transformers Auto* factories so that
# AutoConfig.from_pretrained / AutoModel.from_pretrained can resolve the
# "speech_mm_projector" model_type to these classes.
AutoConfig.register("speech_mm_projector", SpeechMultimodalProjectorConfig)
AutoModel.register(SpeechMultimodalProjectorConfig, SpeechMultimodalProjector)
					
					
						
						| 
							 | 
						
 |