Spaces:

PFEemp2024
/

DCWIR-Demo

Runtime error

App Files Files Community

DCWIR-Demo / textattack /augmentation /recipes.py

PFEemp2024

add necessary file

63775f2 over 1 year ago

raw

history blame

9.26 kB

	"""
	Augmenter Recipes:
	===================

	Transformations and constraints can be used for simple NLP data augmentations. Here is a list of recipes for NLP data augmentations

	"""
	import random

	from textattack.constraints.pre_transformation import (
	RepeatModification,
	StopwordModification,
	)
	from textattack.constraints.semantics.sentence_encoders import UniversalSentenceEncoder

	from . import Augmenter

	DEFAULT_CONSTRAINTS = [RepeatModification(), StopwordModification()]


	class EasyDataAugmenter(Augmenter):
	"""An implementation of Easy Data Augmentation, which combines:

	- WordNet synonym replacement
	- Randomly replace words with their synonyms.
	- Word deletion
	- Randomly remove words from the sentence.
	- Word order swaps
	- Randomly swap the position of words in the sentence.
	- Random synonym insertion
	- Insert a random synonym of a random word at a random location.

	in one augmentation method.

	"EDA: Easy Data Augmentation Techniques for Boosting Performance on Text Classification Tasks" (Wei and Zou, 2019)
	https://arxiv.org/abs/1901.11196
	"""

	def __init__(self, pct_words_to_swap=0.1, transformations_per_example=4):
	assert 0.0 <= pct_words_to_swap <= 1.0, "pct_words_to_swap must be in [0., 1.]"
	assert (
	transformations_per_example > 0
	), "transformations_per_example must be a positive integer"
	self.pct_words_to_swap = pct_words_to_swap
	self.transformations_per_example = transformations_per_example
	n_aug_each = max(transformations_per_example // 4, 1)

	self.synonym_replacement = WordNetAugmenter(
	pct_words_to_swap=pct_words_to_swap,
	transformations_per_example=n_aug_each,
	)
	self.random_deletion = DeletionAugmenter(
	pct_words_to_swap=pct_words_to_swap,
	transformations_per_example=n_aug_each,
	)
	self.random_swap = SwapAugmenter(
	pct_words_to_swap=pct_words_to_swap,
	transformations_per_example=n_aug_each,
	)
	self.random_insertion = SynonymInsertionAugmenter(
	pct_words_to_swap=pct_words_to_swap, transformations_per_example=n_aug_each
	)

	def augment(self, text):
	augmented_text = []
	augmented_text += self.synonym_replacement.augment(text)
	augmented_text += self.random_deletion.augment(text)
	augmented_text += self.random_swap.augment(text)
	augmented_text += self.random_insertion.augment(text)
	augmented_text = list(set(augmented_text))
	random.shuffle(augmented_text)
	return augmented_text[: self.transformations_per_example]

	def __repr__(self):
	return "EasyDataAugmenter"


	class SwapAugmenter(Augmenter):
	def __init__(self, **kwargs):
	from textattack.transformations import WordInnerSwapRandom

	transformation = WordInnerSwapRandom()
	super().__init__(transformation, constraints=DEFAULT_CONSTRAINTS, **kwargs)


	class SynonymInsertionAugmenter(Augmenter):
	def __init__(self, **kwargs):
	from textattack.transformations import WordInsertionRandomSynonym

	transformation = WordInsertionRandomSynonym()
	super().__init__(transformation, constraints=DEFAULT_CONSTRAINTS, **kwargs)


	class WordNetAugmenter(Augmenter):
	"""Augments text by replacing with synonyms from the WordNet thesaurus."""

	def __init__(self, **kwargs):
	from textattack.transformations import WordSwapWordNet

	transformation = WordSwapWordNet()
	super().__init__(transformation, constraints=DEFAULT_CONSTRAINTS, **kwargs)


	class DeletionAugmenter(Augmenter):
	def __init__(self, **kwargs):
	from textattack.transformations import WordDeletion

	transformation = WordDeletion()
	super().__init__(transformation, constraints=DEFAULT_CONSTRAINTS, **kwargs)


	class EmbeddingAugmenter(Augmenter):
	"""Augments text by transforming words with their embeddings."""

	def __init__(self, **kwargs):
	from textattack.transformations import WordSwapEmbedding

	transformation = WordSwapEmbedding(max_candidates=50)
	from textattack.constraints.semantics import WordEmbeddingDistance

	constraints = DEFAULT_CONSTRAINTS + [WordEmbeddingDistance(min_cos_sim=0.8)]
	super().__init__(transformation, constraints=constraints, **kwargs)


	class CharSwapAugmenter(Augmenter):
	"""Augments words by swapping characters out for other characters."""

	def __init__(self, **kwargs):
	from textattack.transformations import (
	CompositeTransformation,
	WordSwapNeighboringCharacterSwap,
	WordSwapRandomCharacterDeletion,
	WordSwapRandomCharacterInsertion,
	WordSwapRandomCharacterSubstitution,
	)

	transformation = CompositeTransformation(
	[
	# (1) Swap: Swap two adjacent letters in the word.
	WordSwapNeighboringCharacterSwap(),
	# (2) Substitution: Substitute a letter in the word with a random letter.
	WordSwapRandomCharacterSubstitution(),
	# (3) Deletion: Delete a random letter from the word.
	WordSwapRandomCharacterDeletion(),
	# (4) Insertion: Insert a random letter in the word.
	WordSwapRandomCharacterInsertion(),
	]
	)
	super().__init__(transformation, constraints=DEFAULT_CONSTRAINTS, **kwargs)


	class CheckListAugmenter(Augmenter):
	"""Augments words by using the transformation methods provided by CheckList
	INV testing, which combines:

	- Name Replacement
	- Location Replacement
	- Number Alteration
	- Contraction/Extension

	"Beyond Accuracy: Behavioral Testing of NLP models with CheckList" (Ribeiro et al., 2020)
	https://arxiv.org/abs/2005.04118
	"""

	def __init__(self, **kwargs):
	from textattack.transformations import (
	CompositeTransformation,
	WordSwapChangeLocation,
	WordSwapChangeName,
	WordSwapChangeNumber,
	WordSwapContract,
	WordSwapExtend,
	)

	transformation = CompositeTransformation(
	[
	WordSwapChangeNumber(),
	WordSwapChangeLocation(),
	WordSwapChangeName(),
	WordSwapExtend(),
	WordSwapContract(),
	]
	)

	constraints = [DEFAULT_CONSTRAINTS[0]]

	super().__init__(transformation, constraints=constraints, **kwargs)


	class CLAREAugmenter(Augmenter):
	"""Li, Zhang, Peng, Chen, Brockett, Sun, Dolan.

	"Contextualized Perturbation for Textual Adversarial Attack" (Li et al., 2020)

	https://arxiv.org/abs/2009.07502

	CLARE builds on a pre-trained masked language model and modifies the inputs in a contextaware manner.
	We propose three contextualized perturbations, Replace, Insert and Merge, allowing for generating outputs
	of varied lengths.
	"""

	def __init__(
	self, model="distilroberta-base", tokenizer="distilroberta-base", **kwargs
	):
	import transformers

	from textattack.transformations import (
	CompositeTransformation,
	WordInsertionMaskedLM,
	WordMergeMaskedLM,
	WordSwapMaskedLM,
	)

	shared_masked_lm = transformers.AutoModelForCausalLM.from_pretrained(model)
	shared_tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer)

	transformation = CompositeTransformation(
	[
	WordSwapMaskedLM(
	method="bae",
	masked_language_model=shared_masked_lm,
	tokenizer=shared_tokenizer,
	max_candidates=50,
	min_confidence=5e-4,
	),
	WordInsertionMaskedLM(
	masked_language_model=shared_masked_lm,
	tokenizer=shared_tokenizer,
	max_candidates=50,
	min_confidence=0.0,
	),
	WordMergeMaskedLM(
	masked_language_model=shared_masked_lm,
	tokenizer=shared_tokenizer,
	max_candidates=50,
	min_confidence=5e-3,
	),
	]
	)

	use_constraint = UniversalSentenceEncoder(
	threshold=0.7,
	metric="cosine",
	compare_against_original=True,
	window_size=15,
	skip_text_shorter_than_window=True,
	)

	constraints = DEFAULT_CONSTRAINTS + [use_constraint]

	super().__init__(transformation, constraints=constraints, **kwargs)


	class BackTranslationAugmenter(Augmenter):
	"""Sentence level augmentation that uses MarianMTModel to back-translate.

	https://huggingface.co/transformers/model_doc/marian.html
	"""

	def __init__(self, **kwargs):
	from textattack.transformations.sentence_transformations import BackTranslation

	transformation = BackTranslation(chained_back_translation=5)
	super().__init__(transformation, **kwargs)