# ONE EPOCH = one forward pass and one backward pass of all the training examples.
#
# BATCH SIZE = the number of training examples in one forward/backward pass. The
# higher the batch size, the more memory space you'll need.
#
# NUMBER OF ITERATIONS = number of passes, each pass using [batch size] number of
# examples. To be clear, one pass = one forward pass + one backward pass.
#
# Example: if you have 1000 training examples, and your batch size is 500, then
# it will take 2 iterations to complete 1 epoch.
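#
# A quick way to compute that iteration count (illustrative sketch only; n_examples and
# batch_size are placeholder names, not variables used anywhere in this script):
#   iterations_per_epoch = -(-n_examples // batch_size)   # ceiling division
#   e.g. -(-1000 // 500) == 2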
import os
import time
import math
import torch
import torch.distributed as dist
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data import DataLoader
from numpy import finfo

from Tacotron2 import tacotron_2
from fp16_optimizer import FP16_Optimizer
# from distributed import apply_gradient_allreduce
from loss_function import Tacotron2Loss
from logger import Tacotron2Logger

def batchnorm_to_float(module):
    """Converts batch norm modules to FP32"""
    if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
        module.float()
    for child in module.children():
        batchnorm_to_float(child)
    return module
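
# Usage sketch: after converting a model to half precision, switch its BatchNorm layers back to
# FP32 for numerical stability, mirroring what load_model() below does:
#   model = batchnorm_to_float(model.half())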

def reduce_tensor(tensor, n_gpus):
    # The clone is recorded in the computation graph, so gradients propagating to the cloned
    # tensor will also propagate to the original tensor.
    rt = tensor.clone()
    # Each rank holds its own tensor; all_reduce sums the tensors from every rank onto all ranks.
    # Dividing by n_gpus then gives the average over ranks (one rank corresponds to one GPU here):
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= n_gpus
    return rt
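
# Usage sketch (assumes torch.distributed has already been initialised on every rank, e.g. via
# init_distributed() below, and that `loss` is a scalar tensor computed on the current rank):
#   averaged_loss = reduce_tensor(loss.data, n_gpus)  # identical averaged value on every rank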

def prepare_directories_and_logger(output_directory, log_directory, rank):
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        logger = Tacotron2Logger(os.path.join(output_directory, log_directory))
        # logger = None
    else:
        logger = None
    return logger

def warm_start_model(checkpoint_path, model):
    assert os.path.isfile(checkpoint_path)
    print("Warm starting model from checkpoint '{}'".format(checkpoint_path))
    checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(checkpoint_dict['state_dict'])
    return model

def load_checkpoint(checkpoint_path, model, optimizer):
    assert os.path.isfile(checkpoint_path)
    print("Loading checkpoint '{}'".format(checkpoint_path))
    checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(checkpoint_dict['state_dict'])
    optimizer.load_state_dict(checkpoint_dict['optimizer'])
    learning_rate = checkpoint_dict['learning_rate']
    iteration = checkpoint_dict['iteration']
    print("Loaded checkpoint '{}' from iteration {}".format(checkpoint_path, iteration))
    return model, optimizer, learning_rate, iteration

def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
    print("Saving model and optimizer state at iteration {} to {}".format(iteration, filepath))
    torch.save({'iteration': iteration,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'learning_rate': learning_rate}, filepath)
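
# Round-trip sketch (the directory, iteration number and the `model`/`optimizer` objects here are
# placeholders for illustration):
#   save_checkpoint(model, optimizer, 1e-3, 5000, 'outdir/checkpoint_5000')
#   model, optimizer, lr, it = load_checkpoint('outdir/checkpoint_5000', model, optimizer)
#   assert it == 5000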

def init_distributed(hyper_params, n_gpus, rank, group_name):
    assert torch.cuda.is_available(), "Distributed mode requires CUDA"
    print("Initializing distributed")
    # Set the CUDA device so everything is done on the right GPU
    torch.cuda.set_device(rank % torch.cuda.device_count())
    # Initialize distributed communication
    torch.distributed.init_process_group(backend=hyper_params['dist_backend'], rank=rank, world_size=n_gpus,
                                         init_method=hyper_params['dist_url'], group_name=group_name)
    print("Initializing distributed: Done")

def load_model(hyper_params):
    # According to the documentation, it is recommended to move a model to GPU before constructing the optimizer
    # model = tacotron_2(hyper_params).cuda()
    model = tacotron_2(hyper_params)
    if hyper_params['fp16_run']:  # converts everything into half type (16 bits)
        model = batchnorm_to_float(model.half())
        model.decoder.attention_layer.score_mask_value = float(finfo('float16').min)
    # if hyper_params['distributed_run']:
    #     model = apply_gradient_allreduce(model)
    return model

def validate(model, criterion, valset, iteration, batch_size, n_gpus, collate_fn, logger, distributed_run, rank):
    """Handles all the validation scoring and printing"""
    # Switch to eval() because this is an evaluation stage and not training
    model.eval()
    # Temporarily disable gradient tracking
    with torch.no_grad():
        # Sampler that restricts data loading to a subset of the dataset, so in a distributed run
        # each rank sees a different slice of the validation set
        val_sampler = DistributedSampler(valset) if distributed_run else None
        # Data loader wrapper around the validation data (same as for the training data)
        val_loader = DataLoader(valset, sampler=val_sampler, num_workers=1, shuffle=False, batch_size=batch_size,
                                pin_memory=False, collate_fn=collate_fn)
        val_loss = 0.0
        for i, batch in enumerate(val_loader):
            x, y = model.parse_batch(batch)
            y_pred = model(x)
            _, _, _, _, gst_scores = y_pred
            if i == 0:
                validation_gst_scores = gst_scores
            else:
                validation_gst_scores = torch.cat((validation_gst_scores, gst_scores), 0)
            loss = criterion(y_pred, y)
            if distributed_run:
                reduced_val_loss = reduce_tensor(loss.data, n_gpus).item()  # gets the plain float value with item()
            else:
                reduced_val_loss = loss.item()
            val_loss += reduced_val_loss
        val_loss = val_loss / (i + 1)  # average val_loss over all batches

    model.train()
    if rank == 0:
        print("Validation loss {}: {:9f}".format(iteration, val_loss))
        # print("GST scores of the validation set: {}".format(validation_gst_scores.shape))
        logger.log_validation(val_loss, model, y, y_pred, validation_gst_scores, iteration)

# ------------------------------------------- MAIN TRAINING METHOD -------------------------------------------------- #
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, rank, group_name,
          hyper_params, train_loader, valset, collate_fn):
    """Training and validation method that logs results to TensorBoard and stdout
    :param output_directory (string): directory to save checkpoints
    :param log_directory (string): directory to save tensorboard logs
    :param checkpoint_path (string): checkpoint path
    :param n_gpus (int): number of gpus
    :param rank (int): rank of current gpu
    :param hyper_params (object dictionary): dictionary with all hyper parameters
    """
    # Check whether this is a distributed run
    if hyper_params['distributed_run']:
        init_distributed(hyper_params, n_gpus, rank, group_name)

    # Set the same fixed seed to reproduce the same results every time we train
    torch.manual_seed(hyper_params['seed'])
    torch.cuda.manual_seed(hyper_params['seed'])

    model = load_model(hyper_params)
    learning_rate = hyper_params['learning_rate']
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=hyper_params['weight_decay'])
    if hyper_params['fp16_run']:
        optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=hyper_params['dynamic_loss_scaling'])

    # Define the criterion of the loss function, i.e. the training objective
    criterion = Tacotron2Loss()

    logger = prepare_directories_and_logger(output_directory, log_directory, rank)
    # logger = ''

    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            # Restart the model from the last checkpoint when we saved the parameters and don't want to start from 0
            model = warm_start_model(checkpoint_path, model)
        else:
            model, optimizer, _learning_rate, iteration = load_checkpoint(checkpoint_path, model, optimizer)
            if hyper_params['use_saved_learning_rate']:
                learning_rate = _learning_rate
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))

    # Make all modules and regularization aware that this is the training stage:
    model.train()

    # MAIN LOOP
    for epoch in range(epoch_offset, hyper_params['epochs']):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            start = time.perf_counter()
            # Apply the current (possibly updated) learning rate to every parameter group
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

            model.zero_grad()
            input_data, output_target = model.parse_batch(batch)
            output_predicted = model(input_data)
            loss = criterion(output_predicted, output_target)
            if hyper_params['distributed_run']:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()

            if hyper_params['fp16_run']:
                optimizer.backward(loss)  # the fp16 optimizer wraps the backward pass
                grad_norm = optimizer.clip_fp32_grads(hyper_params['grad_clip_thresh'])
            else:
                loss.backward()
                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), hyper_params['grad_clip_thresh'])

            # Perform a single optimization step (parameter update)
            optimizer.step()

            # This boolean flags numerical overflow when running with the fp16 optimizer
            overflow = optimizer.overflow if hyper_params['fp16_run'] else False

            # Only log when the loss neither overflowed nor became NaN
            if not overflow and not math.isnan(reduced_loss) and rank == 0:
                duration = time.perf_counter() - start
                print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(iteration, reduced_loss,
                                                                                grad_norm, duration))
                # Log training information for the current iteration
                logger.log_training(reduced_loss, grad_norm, learning_rate, duration, iteration)

            # Every iters_per_checkpoint steps, validate the model with its updated parameters and save a checkpoint
            if not overflow and (iteration % hyper_params['iters_per_checkpoint'] == 0):
                validate(model, criterion, valset, iteration, hyper_params['batch_size'], n_gpus, collate_fn,
                         logger, hyper_params['distributed_run'], rank)
                if rank == 0:
                    checkpoint_path = os.path.join(output_directory, "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path)

            iteration += 1
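
# Minimal single-GPU invocation sketch. The hyper-parameter dictionary, training loader, validation
# set and collate function must be built by the surrounding project; `build_hyper_params` and
# `build_loaders` below are hypothetical helpers, not functions defined in this repository:
#   if __name__ == '__main__':
#       hyper_params = build_hyper_params()
#       train_loader, valset, collate_fn = build_loaders(hyper_params)
#       train('outdir', 'logdir', checkpoint_path=None, warm_start=False, n_gpus=1, rank=0,
#             group_name='group_name', hyper_params=hyper_params, train_loader=train_loader,
#             valset=valset, collate_fn=collate_fn)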