File size: 336 Bytes
d25b671
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# Run configuration with three sections: dataset, model, and transcoding.
# NOTE(review): the code that consumes this file is not visible here; the
# per-key notes below are inferred from key names and must be confirmed.
dataset:
  # presumably the maximum sequence length per example (tokens?) — TODO confirm
  max_length: 128
  # looks like a Hugging Face Hub dataset id — TODO confirm
  name: monology/pile-uncopyrighted
  split: train
model:
  # NOTE(review): hard-coded to cuda; confirm the consumer's behavior on
  # machines without a GPU
  device: cuda
  # looks like a Hugging Face Hub model id — TODO confirm
  name: EleutherAI/pythia-410m
transcoding:
  batch_size: 512
  bias: true
  debug: false
  # presumably scales the hidden width relative to the model width — TODO confirm
  hidden_multiplier: 4
  # NOTE(review): confirm whether the consumer treats layer_idx as 0- or
  # 1-based, and that 16 is in range for the model named above
  layer_idx: 16
  learning_rate: 0.02
  model_type: Bilinear
  # NOTE(review): n_batches (20) and n_batches_full (3000) sit next to
  # `debug`; which count applies in which mode is decided by the consumer
  n_batches: 20
  n_batches_full: 3000
  optimizer_type: Muon