from dataclasses import dataclass, field

from transformers import PretrainedConfig


@dataclass
class OrcaleSeekConfig:
    """Configuration for the OrcaleSeek causal LM (defaults mirror GPT-2-small hyperparameters)."""

    model_type: str = "orcaleseek"
    vocab_size: int = 50257
    n_embd: int = 768
    n_layer: int = 12
    n_head: int = 12
    n_inner: int = 3072
    activation_function: str = "gelu_new"
    resid_pdrop: float = 0.1
    embd_pdrop: float = 0.1
    attn_pdrop: float = 0.1
    layer_norm_epsilon: float = 1e-5
    initializer_range: float = 0.02
    scale_attn_weights: bool = True
    use_cache: bool = True
    bos_token_id: int = 50256
    eos_token_id: int = 50256
    # A mutable default must go through default_factory; this also makes
    # `architectures` a real dataclass field so it lands in self.__dict__
    # and is carried over into the Hugging Face config below.
    architectures: list = field(default_factory=lambda: ["OrcaleSeekForCausalLM"])

    def to_hf_config(self) -> PretrainedConfig:
        """Convert this dataclass into a transformers PretrainedConfig."""
        return PretrainedConfig(**self.__dict__)
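

# Illustrative usage sketch (not part of the original file, added as an
# assumption about intended use): construct the config, optionally override
# a field, and convert it to a transformers PretrainedConfig before
# instantiating a model. The n_layer override shown here is hypothetical.
if __name__ == "__main__":
    config = OrcaleSeekConfig(n_layer=6)
    hf_config = config.to_hf_config()
    print(hf_config.to_json_string())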