# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import logging
import re
from operator import attrgetter, itemgetter

import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn

from .modules import PQConv2d, PQEmbedding, PQLinear
from .pq import PQ


def quantize_model_(
    model,
    size_tracker,
    layers_to_quantize,
    block_sizes_config,
    n_centroids_config,
    step=0,
    n_iter=15,
    eps=1e-6,
    max_tentatives=100,
    remove_weights=False,
    verbose=True,
    state_dict=None,
):
    """
    Quantize a model in-place by stages. All the targeted
    layers are replaced by their quantized counterpart,
    and the model is ready for the finetuning of the
    centroids in a standard training loop (no modifications
    required). Note that we do not quantize biases.

    Args:
        - model: a nn.Module
        - size_tracker: useful for tracking quantization statistics
        - layers_to_quantize: a list containing regexps for
          filtering the layers to quantize at each stage according
          to their name (as in model.named_parameters())
        - block_sizes_config: dict like
          {
              'Conv2d': ('kernel_size', {'(3, 3)': 9, '(1, 1)': 4}),
              'Linear': ('in_features', {'*': 8})
          }
          For instance, all conv2d layers with kernel size 3x3 have
          a block size of 9 and all Linear layers are quantized with
          a block size of 8, irrespective of their size.
        - n_centroids_config: dict like
          {
              'Conv2d': ('kernel_size', {'*': 256}),
              'Linear': ('in_features', {'*': 256})
          }
          For instance, all conv2d layers are quantized with 256 centroids
        - step: the index into layers_to_quantize of the stage whose
          layers are quantized in-place by this call
    """
    quantized_layers = get_layers(
        model, layers_to_quantize[step], remove_weights=remove_weights
    )

    for layer in quantized_layers:
        # book-keeping
        is_master_process = (not dist.is_initialized()) or (
            dist.is_initialized() and dist.get_rank() == 0
        )
        verbose = verbose and is_master_process

        # get block size and centroids
        module = attrgetter(layer)(model)
        block_size = get_param(module, layer, block_sizes_config)
        n_centroids = get_param(module, layer, n_centroids_config)
        if verbose:
            logging.info(
                f"Quantizing layer {layer} with block size {block_size} and {n_centroids} centroids"
            )

        # quantize layer
        weight = module.weight.data.clone()
        is_bias = "bias" in [x[0] for x in module.named_parameters()]
        bias = module.bias.data.clone() if is_bias else None
        quantizer = PQ(
            weight,
            block_size,
            n_centroids=n_centroids,
            n_iter=n_iter,
            eps=eps,
            max_tentatives=max_tentatives,
            verbose=verbose,
        )

        # quantization performed on all GPUs with same seed
        quantizer.encode()
        centroids = quantizer.centroids.contiguous()
        assignments = quantizer.assignments.contiguous()
        # If n_iter = 0 and state_dict is provided, we initialize the
        # centroids and assignments to random values of the appropriate
        # dimensions, because the quantized model parameters will be
        # overwritten by the state_dict later on.
        if n_iter == 0 and state_dict:
            # initialize random centroids of the correct size
            centroids = torch.rand(centroids.size()).cuda()

            # get counts and assignment keys from layer in loaded checkpoint
            counts_key = layer + "." + "counts"
            assignment_key = layer + "." + "assignments"

            # get number of different bins to include
            counts = state_dict[counts_key].shape[0]

            # initialize random assignments of the correct size with an
            # appropriate number of bins: make sure every bin appears at
            # least once, then fill the remaining slots at random
            # (note that randint's upper bound is exclusive)
            num_assignments = state_dict[assignment_key].shape[0]
            num_extra = num_assignments - counts
            assignments_bins = torch.arange(counts)
            assignments_rand = torch.randint(0, counts, (num_extra,))
            assignments = torch.cat((assignments_bins, assignments_rand), 0).cuda()
        # broadcast results to make sure weights are up-to-date
        if dist.is_initialized():
            dist.broadcast(centroids, 0)
            dist.broadcast(assignments, 0)

        # instantiate the quantized counterpart
        if isinstance(module, nn.Linear):
            out_features, in_features = map(
                lambda k: module.__dict__[k], ["out_features", "in_features"]
            )
            quantized_module = PQLinear(
                centroids, assignments, bias, in_features, out_features
            )
        elif isinstance(module, nn.Embedding):
            num_embeddings, embedding_dim = map(
                lambda k: module.__dict__[k], ["num_embeddings", "embedding_dim"]
            )
            quantized_module = PQEmbedding(
                centroids, assignments, num_embeddings, embedding_dim
            )
        elif isinstance(module, nn.Conv2d):
            out_channels, in_channels, kernel_size = map(
                lambda k: module.__dict__[k],
                ["out_channels", "in_channels", "kernel_size"],
            )
            stride, padding, dilation, groups, padding_mode = map(
                lambda k: module.__dict__[k],
                ["stride", "padding", "dilation", "groups", "padding_mode"],
            )
            quantized_module = PQConv2d(
                centroids,
                assignments,
                bias,
                in_channels,
                out_channels,
                kernel_size,
                stride=stride,
                padding=padding,
                dilation=dilation,
                groups=groups,
                padding_mode=padding_mode,
            )
        else:
            raise ValueError(f"Module {module} not yet supported for quantization")

        # replace layer by its quantized counterpart
        attrsetter(layer)(model, quantized_module)

        # update statistics
        size_tracker.update(weight, block_size, n_centroids)

    # return names of quantized layers
    return quantized_layers
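

# Illustrative usage sketch (not part of the original module; the model and
# the layer regexps below are hypothetical). Layers are quantized stage by
# stage, with centroid finetuning expected between stages:
#
#   model = torchvision.models.resnet18()
#   size_tracker = SizeTracker(model)
#   layers_to_quantize = [r"layer1\.\d+\.conv[12]", r"fc"]
#   block_sizes_config = {
#       "Conv2d": ("kernel_size", {"(3, 3)": 9, "(1, 1)": 4}),
#       "Linear": ("in_features", {"*": 8}),
#   }
#   n_centroids_config = {
#       "Conv2d": ("kernel_size", {"*": 256}),
#       "Linear": ("in_features", {"*": 256}),
#   }
#   for step in range(len(layers_to_quantize)):
#       quantize_model_(
#           model, size_tracker, layers_to_quantize,
#           block_sizes_config, n_centroids_config, step=step,
#       )
#       # ... finetune the centroids in a standard training loop ...
#   print(size_tracker)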


def get_layers(model, filter_regexp, remove_weights=False):
    """
    Filters out the layers according to a regexp. Note that
    we omit biases.

    Args:
        - model: a nn.Module
        - filter_regexp: a regexp to filter the layers to keep
          according to their name in model.named_parameters().
          For instance, the regexp:

             down_layers\\.[123456]\\.(conv[12]|identity\\.conv)

          keeps blocks down_layers 1 to 6, and inside each block
          keeps conv1, conv2 and identity.conv.

    Remarks:
        - We prepend (module\\.)? to the regexp to account for
          the possible use of nn.parallel.DataParallel
    """
    # get all parameter names
    all_layers = map(itemgetter(0), model.named_parameters())

    # remove biases
    all_layers = filter(lambda x: "bias" not in x, all_layers)

    # remove the .weight extension from all remaining names
    # (or .weight_orig if spectral norm is used)
    all_layers = map(lambda x: x.replace(".weight_orig", ""), all_layers)
    # remove_weights indicates whether the .weights extension should also be
    # removed, in addition to the .weight_orig and .weight extensions
    if remove_weights:
        all_layers = map(lambda x: x.replace(".weights", ""), all_layers)
    all_layers = map(lambda x: x.replace(".weight", ""), all_layers)

    # return filtered layers
    filter_regexp = "(module\\.)?" + "(" + filter_regexp + ")"
    r = re.compile(filter_regexp)

    return list(filter(r.match, all_layers))
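

# For illustration (hypothetical parameter names): on a model exposing
# "encoder.layers.0.fc1.weight" and "encoder.layers.0.fc1.bias",
#
#   get_layers(model, r"encoder\.layers\.\d+\.fc[12]")
#
# returns ["encoder.layers.0.fc1"]: the bias is dropped, the .weight
# extension is stripped, and the result is an attribute path that works
# with attrgetter/attrsetter below.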


def get_param(module, layer_name, param_config):
    """
    Given a quantization configuration, get the right parameter
    for the module to be quantized.

    Args:
        - module: a nn.Module
        - layer_name: the name of the layer
        - param_config: a dict like
          {
              'Conv2d': ('kernel_size', {'(3, 3)': 9, '(1, 1)': 4}),
              'Linear': ('in_features', {'*': 8})
          }
          For instance, all conv2d layers with kernel size 3x3 have
          a block size of 9 and all Linear layers are quantized with
          a block size of 8, irrespective of their size.

    Remarks:
        - if 'fuzzy_name' is passed as a parameter, layers whose layer_name
          includes 'fuzzy_name' will be assigned the given parameter.
          In the following example, conv.expand layers will have a block
          size of 9 while conv.reduce will have a block size of 4 and all
          other layers will have a block size of 2.
          {
              'Conv2d': ('fuzzy_name', {'expand': 9, 'reduce': 4, '*': 2}),
              'Linear': ('fuzzy_name', {'classifier': 8, 'projection': 4})
          }
    """
    layer_type = module.__class__.__name__

    if layer_type not in param_config:
        raise KeyError(f"Layer type {layer_type} not in config for layer {module}")

    feature, params = param_config[layer_type]

    if feature != "fuzzy_name":
        feature_value = str(getattr(module, feature))
        if feature_value not in params:
            if "*" in params:
                feature_value = "*"
            else:
                raise KeyError(
                    f"{feature}={feature_value} not in config for layer {module}"
                )
    else:
        feature_values = [name for name in params if name in layer_name]
        if len(feature_values) == 0:
            if "*" in params:
                feature_value = "*"
            else:
                raise KeyError(f"name={layer_name} not in config for {module}")
        else:
            feature_value = feature_values[0]

    return params[feature_value]
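

# For illustration (hypothetical layer names): with a fuzzy_name config such as
#
#   config = {"Conv2d": ("fuzzy_name", {"expand": 9, "reduce": 4, "*": 2})}
#
# a Conv2d whose layer_name is "features.expand3.conv" gets block size 9,
# "features.reduce1.conv" gets 4, and any other Conv2d falls back to the "*"
# entry and gets 2:
#
#   get_param(conv_module, "features.expand3.conv", config)  # -> 9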


class SizeTracker(object):
    """
    Class to keep track of the compressed network size with iPQ.

    Args:
        - model: a nn.Module

    Remarks:
        - The compressed size is the sum of three components
          for each layer in the network:
              (1) Storing the centroids given by iPQ in fp16
              (2) Storing the assignments of the blocks in int8
              (3) Storing all non-compressed elements such as biases
        - This cost is only valid if we use 256 centroids (then
          indexing can indeed be done with int8).
    """

    def __init__(self, model):
        self.model = model
        self.size_non_compressed_model = self.compute_size()
        self.size_non_quantized = self.size_non_compressed_model
        self.size_index = 0
        self.size_centroids = 0
        self.n_quantized_layers = 0

    def compute_size(self):
        """
        Computes the size of the model (in MB), assuming fp32 storage
        (4 bytes per parameter).
        """
        res = 0
        for _, p in self.model.named_parameters():
            res += p.numel()
        return res * 4 / 1024 / 1024

    def update(self, W, block_size, n_centroids):
        """
        Updates the running statistics when quantizing a new layer.
        """
        # bits per weight
        bits_per_weight = np.log2(n_centroids) / block_size
        self.n_quantized_layers += 1

        # size of indexing the subvectors of size block_size (in MB)
        size_index_layer = bits_per_weight * W.numel() / 8 / 1024 / 1024
        self.size_index += size_index_layer

        # size of the centroids stored in float16 (in MB)
        size_centroids_layer = n_centroids * block_size * 2 / 1024 / 1024
        self.size_centroids += size_centroids_layer

        # remove this layer's fp32 size from the non-compressed pool, which
        # then only counts layers such as LayerNorms or biases (in MB)
        size_uncompressed_layer = W.numel() * 4 / 1024 / 1024
        self.size_non_quantized -= size_uncompressed_layer
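
    # Worked example (illustration, not from the original source): a Linear
    # layer with a 1024 x 1024 fp32 weight matrix (4 MB), quantized with
    # block_size=8 and n_centroids=256, stores log2(256) / 8 = 1 bit per
    # weight, i.e. 1024 * 1024 / 8 bytes = 0.125 MB of assignments, plus
    # 256 * 8 fp16 centroids = 4 KB, roughly a 31x reduction on that layer.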

    def __repr__(self):
        size_compressed = (
            self.size_index + self.size_centroids + self.size_non_quantized
        )
        compression_ratio = self.size_non_compressed_model / size_compressed

        return (
            f"Non-compressed model size: {self.size_non_compressed_model:.2f} MB. "
            f"After quantizing {self.n_quantized_layers} layers, size "
            f"(indexing + centroids + other): {self.size_index:.2f} MB + "
            f"{self.size_centroids:.2f} MB + {self.size_non_quantized:.2f} MB = "
            f"{size_compressed:.2f} MB, compression ratio: {compression_ratio:.2f}x"
        )
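

# Typical use (sketch, hypothetical model): instantiate the tracker before
# quantization so the non-compressed size is recorded, then print it at the
# end to report the running compression statistics:
#
#   tracker = SizeTracker(model)
#   quantize_model_(model, tracker, layers_to_quantize, block_sizes_config,
#                   n_centroids_config)
#   print(tracker)  # "Non-compressed model size: ... compression ratio: ...x"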


def attrsetter(*items):
    """
    Counterpart of operator.attrgetter: returns a callable that sets the
    given (possibly dotted) attribute paths on an object to a value.
    """

    def resolve_attr(obj, attr):
        # walk down the dotted path, stopping before the last component
        attrs = attr.split(".")
        head = attrs[:-1]
        tail = attrs[-1]
        for name in head:
            obj = getattr(obj, name)
        return obj, tail

    def g(obj, val):
        for attr in items:
            resolved_obj, resolved_attr = resolve_attr(obj, attr)
            setattr(resolved_obj, resolved_attr, val)

    return g
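

# For illustration: attrsetter mirrors operator.attrgetter on dotted paths,
# which is exactly what the replacement step in quantize_model_ relies on
# (module names here are hypothetical):
#
#   attrsetter("decoder.fc")(model, quantized_module)
#   # equivalent to: model.decoder.fc = quantized_module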