Spaces:
Runtime error
Runtime error
| # Copyright (c) Facebook, Inc. and its affiliates. | |
| # | |
| # This source code is licensed under the MIT license found in the | |
| # LICENSE file in the root directory of this source tree. | |
| from fairseq import utils | |
| from fairseq.models import ( | |
| FairseqLanguageModel, | |
| register_model, | |
| register_model_architecture, | |
| ) | |
| from fairseq.models.lightconv import Embedding, LightConvDecoder | |
| from fairseq.modules import AdaptiveInput, CharacterTokenEmbedder | |
class LightConvLanguageModel(FairseqLanguageModel):
    """Decoder-only language model wrapping a ``LightConvDecoder``.

    ``add_args`` declares the model's command-line options and
    ``build_model`` assembles the token embedder and the decoder from the
    parsed options.

    NOTE(review): this file imports ``register_model`` but no registration
    decorator is visible on this class -- it may have been lost when the
    file was copied; confirm against the upstream source.
    """

    def __init__(self, decoder):
        super().__init__(decoder)

    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        parser.add_argument(
            "--dropout",
            default=0.1,
            type=float,
            metavar="D",
            help="dropout probability",
        )
        parser.add_argument(
            "--attention-dropout",
            default=0.0,
            type=float,
            metavar="D",
            help="dropout probability for attention weights",
        )
        parser.add_argument(
            "--relu-dropout",
            default=0.0,
            type=float,
            metavar="D",
            help="dropout probability after ReLU in FFN",
        )
        parser.add_argument(
            "--input-dropout",
            type=float,
            metavar="D",
            help="dropout probability of the inputs",
        )
        parser.add_argument(
            "--decoder-embed-dim",
            type=int,
            metavar="N",
            help="decoder embedding dimension",
        )
        parser.add_argument(
            "--decoder-output-dim",
            type=int,
            metavar="N",
            help="decoder output dimension",
        )
        parser.add_argument(
            "--decoder-input-dim", type=int, metavar="N", help="decoder input dimension"
        )
        parser.add_argument(
            "--decoder-ffn-embed-dim",
            type=int,
            metavar="N",
            help="decoder embedding dimension for FFN",
        )
        parser.add_argument(
            "--decoder-layers", type=int, metavar="N", help="num decoder layers"
        )
        parser.add_argument(
            "--decoder-attention-heads",
            type=int,
            metavar="N",
            help="num decoder attention heads or LightConv/DynamicConv heads",
        )
        parser.add_argument(
            "--decoder-normalize-before",
            default=False,
            action="store_true",
            help="apply layernorm before each decoder block",
        )
        parser.add_argument(
            "--adaptive-softmax-cutoff",
            metavar="EXPR",
            help="comma separated list of adaptive softmax cutoff points. "
            "Must be used with adaptive_loss criterion",
        )
        parser.add_argument(
            "--adaptive-softmax-dropout",
            type=float,
            metavar="D",
            help="sets adaptive softmax dropout for the tail projections",
        )
        parser.add_argument(
            "--adaptive-softmax-factor",
            type=float,
            metavar="N",
            # The help text used to read "adaptive input factor", a
            # copy-paste of the --adaptive-input-factor description.
            help="adaptive softmax factor",
        )
        parser.add_argument(
            "--no-token-positional-embeddings",
            default=False,
            action="store_true",
            help="if set, disables positional embeddings (outside self attention)",
        )
        parser.add_argument(
            "--share-decoder-input-output-embed",
            default=False,
            action="store_true",
            help="share decoder input and output embeddings",
        )
        parser.add_argument(
            "--character-embeddings",
            default=False,
            action="store_true",
            help="if set, uses character embedding convolutions to produce token embeddings",
        )
        parser.add_argument(
            "--character-filters",
            type=str,
            metavar="LIST",
            default="[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]",
            help="size of character embeddings",
        )
        parser.add_argument(
            "--character-embedding-dim",
            type=int,
            metavar="N",
            default=4,
            help="size of character embeddings",
        )
        parser.add_argument(
            "--char-embedder-highway-layers",
            type=int,
            metavar="N",
            default=2,
            help="number of highway layers for character token embeddder",
        )
        parser.add_argument(
            "--adaptive-input",
            default=False,
            action="store_true",
            help="if set, uses adaptive input",
        )
        parser.add_argument(
            "--adaptive-input-factor",
            type=float,
            metavar="N",
            help="adaptive input factor",
        )
        parser.add_argument(
            "--adaptive-input-cutoff",
            metavar="EXPR",
            help="comma separated list of adaptive input cutoff points.",
        )
        parser.add_argument(
            "--tie-adaptive-weights",
            action="store_true",
            help="if set, ties the weights of adaptive softmax and adaptive input",
        )
        parser.add_argument(
            "--tie-adaptive-proj",
            action="store_true",
            help="if set, ties the projection weights of adaptive softmax and adaptive input",
        )
        parser.add_argument(
            "--decoder-learned-pos",
            action="store_true",
            help="use learned positional embeddings in the decoder",
        )

        # LightConv and DynamicConv arguments
        parser.add_argument(
            "--decoder-kernel-size-list",
            type=lambda x: utils.eval_str_list(x, int),
            help='list of kernel size (default: "[3,7,15,31,31,31]")',
        )
        parser.add_argument(
            "--decoder-glu", type=utils.eval_bool, help="glu after in proj"
        )
        parser.add_argument(
            "--decoder-conv-type",
            default="dynamic",
            type=str,
            choices=["dynamic", "lightweight"],
            help="type of convolution",
        )
        parser.add_argument("--weight-softmax", default=True, type=utils.eval_bool)
        parser.add_argument(
            "--weight-dropout",
            type=float,
            metavar="D",
            help="dropout probability for conv weights",
        )

    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance.

        The token embedder is chosen from ``args``: a
        ``CharacterTokenEmbedder`` when ``character_embeddings`` is set, an
        ``AdaptiveInput`` when ``adaptive_input`` is set, and a plain
        ``Embedding`` otherwise. The decoder is created without encoder
        attention and without a final norm.

        Args:
            args: parsed options; missing entries are filled in place by
                ``base_lm_architecture``.
            task: object providing ``dictionary`` and ``output_dictionary``.

        Returns:
            A ``LightConvLanguageModel`` wrapping the new decoder.

        Raises:
            AssertionError: if ``tie_adaptive_weights`` is set but the
                adaptive input/softmax settings or the decoder input/output
                dimensions are not compatible.
        """
        # make sure all arguments are present in older models
        base_lm_architecture(args)

        # Position limits default to the per-sample token budget.
        if getattr(args, "max_source_positions", None) is None:
            args.max_source_positions = args.tokens_per_sample
        if getattr(args, "max_target_positions", None) is None:
            args.max_target_positions = args.tokens_per_sample

        if args.character_embeddings:
            embed_tokens = CharacterTokenEmbedder(
                task.dictionary,
                # NOTE(security): eval() executes whatever expression was
                # given to --character-filters; only pass trusted values
                # (ast.literal_eval would be a safer parser).
                eval(args.character_filters),
                args.character_embedding_dim,
                args.decoder_embed_dim,
                args.char_embedder_highway_layers,
            )
        elif args.adaptive_input:
            embed_tokens = AdaptiveInput(
                len(task.dictionary),
                task.dictionary.pad(),
                args.decoder_input_dim,
                args.adaptive_input_factor,
                args.decoder_embed_dim,
                utils.eval_str_list(args.adaptive_input_cutoff, type=int),
            )
        else:
            embed_tokens = Embedding(
                len(task.dictionary), args.decoder_input_dim, task.dictionary.pad()
            )

        # Tying requires the adaptive input and adaptive softmax settings
        # to be identical.
        if args.tie_adaptive_weights:
            assert args.adaptive_input
            assert args.adaptive_input_factor == args.adaptive_softmax_factor
            assert (
                args.adaptive_softmax_cutoff == args.adaptive_input_cutoff
            ), "{} != {}".format(
                args.adaptive_softmax_cutoff, args.adaptive_input_cutoff
            )
            assert args.decoder_input_dim == args.decoder_output_dim

        decoder = LightConvDecoder(
            args,
            task.output_dictionary,
            embed_tokens,
            no_encoder_attn=True,
            final_norm=False,
        )
        return LightConvLanguageModel(decoder)
def base_lm_architecture(args):
    """Fill in the base LightConv language-model defaults on ``args``.

    Attributes already present on ``args`` are kept; missing ones receive
    the defaults below. ``decoder_normalize_before`` is the exception and is
    always forced to ``True``.

    Args:
        args: namespace-like object holding the model options; it is
            mutated in place.

    Raises:
        AssertionError: if ``decoder_kernel_size_list`` (after a
            single-element list has been repeated) does not have exactly
            ``decoder_layers`` entries.
    """
    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512)
    args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 2048)
    args.decoder_layers = getattr(args, "decoder_layers", 6)
    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8)
    args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
    args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
    args.adaptive_softmax_factor = getattr(args, "adaptive_softmax_factor", 4)
    args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)

    args.character_embeddings = getattr(args, "character_embeddings", False)

    # Input, output and conv dimensions all default to the embedding size.
    args.decoder_output_dim = getattr(
        args, "decoder_output_dim", args.decoder_embed_dim
    )
    args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim)
    args.decoder_conv_dim = getattr(args, "decoder_conv_dim", args.decoder_embed_dim)

    # The model training is not stable without this
    args.decoder_normalize_before = True

    args.adaptive_input = getattr(args, "adaptive_input", False)
    args.adaptive_input_factor = getattr(args, "adaptive_input_factor", 4)
    args.adaptive_input_cutoff = getattr(args, "adaptive_input_cutoff", None)

    args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False)
    args.tie_adaptive_proj = getattr(args, "tie_adaptive_proj", False)

    args.decoder_kernel_size_list = getattr(
        args, "decoder_kernel_size_list", [3, 7, 15, 31, 31, 31]
    )
    # A single kernel size is shared by every decoder layer.
    if len(args.decoder_kernel_size_list) == 1:
        args.decoder_kernel_size_list = (
            args.decoder_kernel_size_list * args.decoder_layers
        )
    assert (
        len(args.decoder_kernel_size_list) == args.decoder_layers
    ), "decoder_kernel_size_list doesn't match decoder_layers"
    args.decoder_glu = getattr(args, "decoder_glu", True)
    args.input_dropout = getattr(args, "input_dropout", 0.1)
    # weight_dropout follows attention_dropout. The inner getattr avoids an
    # AttributeError when attention_dropout is missing as well; 0.0 is the
    # default the --attention-dropout option declares.
    args.weight_dropout = getattr(
        args, "weight_dropout", getattr(args, "attention_dropout", 0.0)
    )
def lightconv_lm_gbw(args):
    """Apply the ``lightconv_lm_gbw`` defaults, then the base defaults.

    Each option below is set only when ``args`` does not already carry it;
    ``base_lm_architecture`` then fills in everything that is still missing.

    Args:
        args: namespace-like object holding the model options; it is
            mutated in place.
    """
    variant_defaults = (
        ("decoder_embed_dim", 512),
        ("dropout", 0.1),
        ("attention_dropout", 0.1),
        ("decoder_ffn_embed_dim", 4096),
        ("decoder_attention_heads", 16),
    )
    for option, fallback in variant_defaults:
        setattr(args, option, getattr(args, option, fallback))
    base_lm_architecture(args)