| """ | |
| Faster Alzantot Genetic Algorithm | |
| =================================== | |
| (Certified Robustness to Adversarial Word Substitutions) | |
| """ | |
| from textattack import Attack | |
| from textattack.constraints.grammaticality.language_models import ( | |
| LearningToWriteLanguageModel, | |
| ) | |
| from textattack.constraints.overlap import MaxWordsPerturbed | |
| from textattack.constraints.pre_transformation import ( | |
| RepeatModification, | |
| StopwordModification, | |
| ) | |
| from textattack.constraints.semantics import WordEmbeddingDistance | |
| from textattack.goal_functions import UntargetedClassification | |
| from textattack.search_methods import AlzantotGeneticAlgorithm | |
| from textattack.transformations import WordSwapEmbedding | |
| from .attack_recipe import AttackRecipe | |


class FasterGeneticAlgorithmJia2019(AttackRecipe):
    """Certified Robustness to Adversarial Word Substitutions.

    Robin Jia, Aditi Raghunathan, Kerem Göksel, Percy Liang (2019).

    https://arxiv.org/pdf/1909.00986.pdf
    """

    @staticmethod
    def build(model_wrapper):
        #
        # Section 5: Experiments
        #
        # We base our sets of allowed word substitutions S(x, i) on the
        # substitutions allowed by Alzantot et al. (2018). They demonstrated that
        # their substitutions lead to adversarial examples that are qualitatively
        # similar to the original input and retain the original label, as judged
        # by humans. Alzantot et al. (2018) define the neighbors N(w) of a word w
        # as the n = 8 nearest neighbors of w in a "counter-fitted" word vector
        # space where antonyms are far apart (Mrkšić et al., 2016). The
        # neighbors must also lie within some Euclidean distance threshold. They
        # also use a language model constraint to avoid nonsensical perturbations:
        # they allow substituting x_i with x̃_i ∈ N(x_i) if and only if it does not
        # decrease the log-likelihood of the text under a pre-trained language
        # model by more than some threshold.
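        #
        # A minimal sketch (not part of the recipe) of what the neighbor set
        # N(w) looks like in TextAttack terms; `AttackedText` lives in
        # textattack.shared and the sample sentence is purely illustrative:
        #
        #     from textattack.shared import AttackedText
        #
        #     swap = WordSwapEmbedding(max_candidates=8)
        #     # Each candidate differs from the original in a single word,
        #     # swapped for one of its 8 nearest counter-fitted neighbors.
        #     candidates = swap(AttackedText("this movie was wonderful"))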
        #
        # We make three modifications to this approach:
        #
        # First, in Alzantot et al. (2018), the adversary
        # applies substitutions one at a time, and the
        # neighborhoods and language model scores are computed
        # relative to the current altered version of the input.
        # This results in a hard-to-define attack surface, as
        # changing one word can allow or disallow changes
        # to other words. It also requires recomputing
        # language model scores at each iteration of the genetic
        # attack, which is inefficient. Moreover, the same
        # word can be substituted multiple times, leading
        # to semantic drift. We define allowed substitutions
        # relative to the original sentence x, and disallow
        # repeated substitutions. [Note that the model itself classifies using
        # a different set of pre-trained word vectors; the counter-fitted
        # vectors are only used to define the set of allowed substitution
        # words.]
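        # [In this recipe, "relative to the original sentence x" corresponds to
        # compare_against_original=True on the language model constraint below,
        # and "disallow repeated substitutions" to the RepeatModification
        # pre-transformation constraint.]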
        #
        # Second, we use a faster language model that allows us to query
        # longer contexts; Alzantot et al. (2018) use a slower language
        # model and could only query it with short contexts.
        #
        # Finally, we use the language model constraint only at test time;
        # the model is trained against all perturbations in N(w). This
        # encourages the model to be robust to a larger space of perturbations,
        # instead of specializing for the particular choice of language model.
        # See Appendix A.3 for further details. [This is a model-specific
        # adjustment, so does not affect the attack recipe.]
        #
        # Appendix A.3:
        #
        # In Alzantot et al. (2018), the adversary applies replacements one at a
        # time, and the neighborhoods and language model scores are computed
        # relative to the current altered version of the input. This results in a
        # hard-to-define attack surface, as the same word can be replaced many
        # times, leading to semantic drift. We instead pre-compute the allowed
        # substitutions S(x, i) at index i based on the original x. We define
        # S(x, i) as the set of x̃_i ∈ N(x_i) such that
        #
        #     log P(x_{i-W}, ..., x_{i-1}, x̃_i, x_{i+1}, ..., x_{i+W})
        #         >= log P(x_{i-W}, ..., x_{i+W}) - δ,
        #
        # where probabilities are assigned by a pre-trained language model, and
        # the window radius W and threshold δ are hyperparameters. We use W = 6
        # and δ = 5.
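        #
        # A self-contained sketch of the window check above (a hypothetical
        # helper, not TextAttack API): a substitution x̃_i is kept iff the
        # log-probability of the window of radius W around index i drops by
        # at most δ.
        #
        #     def window_log_prob_ok(log_prob, tokens, i, sub, W=6, delta=5.0):
        #         """`log_prob` scores a list of tokens under a pre-trained LM."""
        #         lo, hi = max(0, i - W), min(len(tokens), i + W + 1)
        #         original = log_prob(tokens[lo:hi])
        #         perturbed = log_prob(tokens[lo:i] + [sub] + tokens[i + 1 : hi])
        #         return perturbed >= original - delta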
        #
        #
        # Swap words with their embedding nearest-neighbors.
        #
        # Embedding: Counter-fitted Paragram Embeddings.
        #
        # "[We] fix the hyperparameter values to S = 60, N = 8, K = 4, and δ = 0.5"
        #
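        # [Mapping these to the recipe: S corresponds to pop_size=60 and N to
        # max_candidates=8 below, while δ = 0.5 is the embedding distance bound
        # max_mse_dist=0.5; K = 4 has no direct counterpart here, since the
        # LearningToWriteLanguageModel constraint replaces the top-K language
        # model filter of Alzantot et al. (2018).]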
        transformation = WordSwapEmbedding(max_candidates=8)
        #
        # Don't modify the same word twice or stopwords
        #
        constraints = [RepeatModification(), StopwordModification()]
        #
        # Maximum words perturbed percentage of 20%
        #
        constraints.append(MaxWordsPerturbed(max_percent=0.2))
        #
        # Maximum word embedding euclidean distance of 0.5.
        #
        constraints.append(WordEmbeddingDistance(max_mse_dist=0.5))
        #
        # Language Model
        #
        constraints.append(
            LearningToWriteLanguageModel(
                window_size=6, max_log_prob_diff=5.0, compare_against_original=True
            )
        )
        # constraints.append(LearningToWriteLanguageModel(window_size=5))
        #
        # Goal is untargeted classification
        #
        goal_function = UntargetedClassification(model_wrapper)
        #
        # Perform word substitution with a genetic algorithm.
        #
        search_method = AlzantotGeneticAlgorithm(
            pop_size=60, max_iters=40, post_crossover_check=False
        )

        return Attack(goal_function, constraints, transformation, search_method)
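

# A minimal usage sketch (not part of the recipe). The model, tokenizer, and
# dataset names below are illustrative assumptions; any TextAttack-compatible
# model wrapper and dataset work the same way:
#
#     import transformers
#
#     from textattack import Attacker
#     from textattack.datasets import HuggingFaceDataset
#     from textattack.models.wrappers import HuggingFaceModelWrapper
#
#     model = transformers.AutoModelForSequenceClassification.from_pretrained(
#         "textattack/bert-base-uncased-imdb"
#     )
#     tokenizer = transformers.AutoTokenizer.from_pretrained(
#         "textattack/bert-base-uncased-imdb"
#     )
#     model_wrapper = HuggingFaceModelWrapper(model, tokenizer)
#
#     attack = FasterGeneticAlgorithmJia2019.build(model_wrapper)
#     dataset = HuggingFaceDataset("imdb", split="test")
#     Attacker(attack, dataset).attack_dataset()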