Commit
·
be0c125
1
Parent(s):
2598166
Add config
Browse files
- config.json +4 -4
- operative_config.gin +6 -8
config.json
CHANGED
|
@@ -3,9 +3,9 @@
|
|
| 3 |
"architectures": [
|
| 4 |
"T5ForConditionalGeneration"
|
| 5 |
],
|
| 6 |
-
"d_ff":
|
| 7 |
-
"d_kv":
|
| 8 |
-
"d_model":
|
| 9 |
"decoder_start_token_id": 0,
|
| 10 |
"dropout_rate": 0.1,
|
| 11 |
"eos_token_id": 1,
|
|
@@ -16,7 +16,7 @@
|
|
| 16 |
"model_type": "t5",
|
| 17 |
"n_positions": 512,
|
| 18 |
"num_decoder_layers": 12,
|
| 19 |
-
"num_heads":
|
| 20 |
"num_layers": 12,
|
| 21 |
"pad_token_id": 0,
|
| 22 |
"relative_attention_num_buckets": 32,
|
|
|
|
| 3 |
"architectures": [
|
| 4 |
"T5ForConditionalGeneration"
|
| 5 |
],
|
| 6 |
+
"d_ff": 3072,
|
| 7 |
+
"d_kv": 64,
|
| 8 |
+
"d_model": 768,
|
| 9 |
"decoder_start_token_id": 0,
|
| 10 |
"dropout_rate": 0.1,
|
| 11 |
"eos_token_id": 1,
|
|
|
|
| 16 |
"model_type": "t5",
|
| 17 |
"n_positions": 512,
|
| 18 |
"num_decoder_layers": 12,
|
| 19 |
+
"num_heads": 12,
|
| 20 |
"num_layers": 12,
|
| 21 |
"pad_token_id": 0,
|
| 22 |
"relative_attention_num_buckets": 32,
|
operative_config.gin
CHANGED
|
@@ -9,15 +9,15 @@ import t5.models.mesh_transformer
|
|
| 9 |
|
| 10 |
# Macros:
|
| 11 |
# ==============================================================================
|
| 12 |
-
d_ff =
|
| 13 |
-
d_kv =
|
| 14 |
-
d_model =
|
| 15 |
dropout_rate = 0.0
|
| 16 |
inputs_length = 512
|
| 17 |
mean_noise_span_length = 3.0
|
| 18 |
MIXTURE_NAME = 'c4_v220_unsupervised'
|
| 19 |
noise_density = 0.15
|
| 20 |
-
num_heads =
|
| 21 |
num_layers = 12
|
| 22 |
|
| 23 |
# Parameters for adafactor_decay_rate_pow:
|
|
@@ -146,7 +146,6 @@ encoder/make_layer_stack.num_layers = %num_layers
|
|
| 146 |
mesh_train_dataset_fn.mixture_or_task_name = %MIXTURE_NAME
|
| 147 |
mesh_train_dataset_fn.pack = True
|
| 148 |
mesh_train_dataset_fn.seed = None
|
| 149 |
-
mesh_train_dataset_fn.shuffle = True
|
| 150 |
mesh_train_dataset_fn.use_cached = 1
|
| 151 |
|
| 152 |
# Parameters for noise_span_to_unique_sentinel:
|
|
@@ -195,7 +194,6 @@ rewrite_stack_variables.max_combined_variable_size = 536870912
|
|
| 195 |
# ==============================================================================
|
| 196 |
run.autostack = True
|
| 197 |
run.batch_size = ('tokens_per_batch', 65536)
|
| 198 |
-
run.checkpoint_input_pipeline = False
|
| 199 |
run.dataset_split = 'train'
|
| 200 |
run.ensemble_inputs = None
|
| 201 |
run.eval_checkpoint_step = None
|
|
@@ -217,7 +215,7 @@ run.optimizer = @optimize.AdafactorOptimizer
|
|
| 217 |
run.output_eval_examples = True
|
| 218 |
run.perplexity_eval_steps = 100
|
| 219 |
run.predict_fn = None
|
| 220 |
-
run.save_checkpoints_steps =
|
| 221 |
run.seen_data_init_step = 0
|
| 222 |
run.sequence_length = {'inputs': 512, 'targets': 128}
|
| 223 |
run.skip_seen_data = False
|
|
@@ -312,7 +310,7 @@ tpu_estimator_model_fn.tpu_summaries = False
|
|
| 312 |
# Parameters for tpu_mesh_shape:
|
| 313 |
# ==============================================================================
|
| 314 |
tpu_mesh_shape.ensemble_parallelism = None
|
| 315 |
-
tpu_mesh_shape.model_parallelism =
|
| 316 |
tpu_mesh_shape.tpu_topology = '4x4'
|
| 317 |
|
| 318 |
# Parameters for unit_scaling_convention:
|
|
|
|
| 9 |
|
| 10 |
# Macros:
|
| 11 |
# ==============================================================================
|
| 12 |
+
d_ff = 3072
|
| 13 |
+
d_kv = 64
|
| 14 |
+
d_model = 768
|
| 15 |
dropout_rate = 0.0
|
| 16 |
inputs_length = 512
|
| 17 |
mean_noise_span_length = 3.0
|
| 18 |
MIXTURE_NAME = 'c4_v220_unsupervised'
|
| 19 |
noise_density = 0.15
|
| 20 |
+
num_heads = 12
|
| 21 |
num_layers = 12
|
| 22 |
|
| 23 |
# Parameters for adafactor_decay_rate_pow:
|
|
|
|
| 146 |
mesh_train_dataset_fn.mixture_or_task_name = %MIXTURE_NAME
|
| 147 |
mesh_train_dataset_fn.pack = True
|
| 148 |
mesh_train_dataset_fn.seed = None
|
|
|
|
| 149 |
mesh_train_dataset_fn.use_cached = 1
|
| 150 |
|
| 151 |
# Parameters for noise_span_to_unique_sentinel:
|
|
|
|
| 194 |
# ==============================================================================
|
| 195 |
run.autostack = True
|
| 196 |
run.batch_size = ('tokens_per_batch', 65536)
|
|
|
|
| 197 |
run.dataset_split = 'train'
|
| 198 |
run.ensemble_inputs = None
|
| 199 |
run.eval_checkpoint_step = None
|
|
|
|
| 215 |
run.output_eval_examples = True
|
| 216 |
run.perplexity_eval_steps = 100
|
| 217 |
run.predict_fn = None
|
| 218 |
+
run.save_checkpoints_steps = 10000
|
| 219 |
run.seen_data_init_step = 0
|
| 220 |
run.sequence_length = {'inputs': 512, 'targets': 128}
|
| 221 |
run.skip_seen_data = False
|
|
|
|
| 310 |
# Parameters for tpu_mesh_shape:
|
| 311 |
# ==============================================================================
|
| 312 |
tpu_mesh_shape.ensemble_parallelism = None
|
| 313 |
+
tpu_mesh_shape.model_parallelism = 1
|
| 314 |
tpu_mesh_shape.tpu_topology = '4x4'
|
| 315 |
|
| 316 |
# Parameters for unit_scaling_convention:
|