# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass, field
from typing import Optional

import torch
from datasets import load_dataset
from torch.optim import Adam
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    HfArgumentParser,
    RobertaForSequenceClassification,
    RobertaTokenizer,
    set_seed,
)

from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer, create_reference_model
from trl.core import LengthSampler


tqdm.pandas()

########################################################################
# This is a fully working simple example of using trl with accelerate.
#
# This example fine-tunes a GPT-J model to generate less toxic content
# using the allenai/real-toxicity-prompts dataset. We use PPO
# (proximal policy optimization) to optimize the model.
# The same script can be run in any of the following settings:
# - single CPU or single GPU
# - multi GPU (using PyTorch distributed mode)
# - multi GPU (using DeepSpeed ZeRO-Offload stages 1 & 2)
# - fp16 (mixed-precision) or fp32 (normal precision)
#
# To run it in each of these various modes, first initialize the accelerate
# configuration with `accelerate config`.
#
########################################################################
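# For example, after `accelerate config`, the script can be launched with something like
# the following (the script filename is a placeholder; adjust it to wherever this file lives):
#
#   accelerate launch gpt-j-6b-toxicity.py --log_with wandb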

# We first define the configuration of the experiment, defining the model, the dataset,
# the training parameters, and the PPO parameters.
# Check the default arguments in the `PPOConfig` class for more details.
# If you want to log with tensorboard, add the kwarg
# `project_kwargs={"logging_dir": PATH_TO_LOGS}` to the PPOConfig
# (see the commented-out example after the config construction below).
@dataclass
class ScriptArguments:
    """
    Arguments for fine-tuning a causal LM with PPO to generate less toxic content.
    """

    # NOTE: gpt2 models use Conv1D instead of Linear layers, which are not yet supported in 8-bit mode;
    # models like gpt-neo* are more suitable.
    model_name: Optional[str] = field(default="ybelkada/gpt-j-6b-sharded-bf16", metadata={"help": "the model name"})
    log_with: Optional[str] = field(default=None, metadata={"help": "use 'wandb' to log with wandb"})
    learning_rate: Optional[float] = field(default=(1.47e-5) * 2, metadata={"help": "the learning rate"})
    mini_batch_size: Optional[int] = field(default=4, metadata={"help": "the PPO minibatch size"})
    batch_size: Optional[int] = field(default=16, metadata={"help": "the batch size"})
    gradient_accumulation_steps: Optional[int] = field(
        default=1, metadata={"help": "the number of gradient accumulation steps"}
    )
    model_save_path: Optional[str] = field(
        default="./gpt-j-6B-detoxified-long-context-26-shl-1e4-final",
        metadata={"help": "the path to save the model"},
    )


parser = HfArgumentParser(ScriptArguments)
script_args = parser.parse_args_into_dataclasses()[0]

config = PPOConfig(
    model_name=script_args.model_name,
    learning_rate=script_args.learning_rate,
    log_with=script_args.log_with,
    ppo_epochs=100,
    mini_batch_size=script_args.mini_batch_size,
    batch_size=script_args.batch_size,
    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
)
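
# To log to tensorboard instead, the config could be built along these lines
# (a sketch; PATH_TO_LOGS is a placeholder for your log directory):
#
# config = PPOConfig(
#     model_name=script_args.model_name,
#     learning_rate=script_args.learning_rate,
#     log_with="tensorboard",
#     project_kwargs={"logging_dir": PATH_TO_LOGS},
#     ppo_epochs=100,
#     mini_batch_size=script_args.mini_batch_size,
#     batch_size=script_args.batch_size,
#     gradient_accumulation_steps=script_args.gradient_accumulation_steps,
# )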

# Below is an example function to build the dataset. In our case, we use the
# allenai/real-toxicity-prompts dataset from the `datasets` library. One should
# customize this function to train the model on one's own dataset.
def build_dataset(
    config, dataset_name="allenai/real-toxicity-prompts", input_min_text_length=5, input_max_text_length=10
):
    """
    Build dataset for training. This builds the dataset from `load_dataset`; one should
    customize this function to train the model on one's own dataset.

    Args:
        config (`PPOConfig`):
            The PPO configuration; its `model_name` is used to load the tokenizer.
        dataset_name (`str`):
            The name of the dataset to be loaded.

    Returns:
        ds (`datasets.Dataset`):
            The processed dataset, ready to be passed to the `PPOTrainer`.
    """
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    tokenizer.pad_token = tokenizer.eos_token

    ds = load_dataset(dataset_name, split="train")

    def filter_fn(sample):
        toxicity = sample["prompt"]["toxicity"]
        return toxicity is not None and toxicity > 0.3
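    # Keep only prompts that the dataset itself already scores as somewhat toxic (> 0.3),
    # so training focuses on prompts that are likely to elicit toxic continuations.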
    ds = ds.filter(filter_fn, batched=False)

    input_size = LengthSampler(input_min_text_length, input_max_text_length)

    def tokenize(sample):
        prompt = sample["prompt"]["text"]
        continuation = sample["continuation"]["text"]
        sample["input_ids"] = tokenizer.encode(prompt + continuation)[: input_size()]
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    ds = ds.map(tokenize, batched=False)
    ds.set_format(type="torch")

    ds = ds.train_test_split(test_size=0.2, shuffle=False)["train"]

    return ds


# We retrieve the dataset by calling the `build_dataset` function.
min_input_length = 30
max_input_length = 40
dataset = build_dataset(config, input_min_text_length=min_input_length, input_max_text_length=max_input_length)
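
# The collator simply transposes a list of per-sample dicts into a dict of lists, e.g.
# [{"input_ids": t0, "query": q0}, {"input_ids": t1, "query": q1}]
#   -> {"input_ids": [t0, t1], "query": [q0, q1]}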
def collator(data):
    return {key: [d[key] for d in data] for key in data[0]}


# set seed before initializing value head for deterministic eval
set_seed(config.seed)

# Now let's build the model, the reference model, and the tokenizer. We first load the model
# in bfloat16 to save memory using `transformers`.
model = AutoModelForCausalLM.from_pretrained(config.model_name, torch_dtype=torch.bfloat16)
# And then we pass the loaded model to `AutoModelForCausalLMWithValueHead`.
model = AutoModelForCausalLMWithValueHead.from_pretrained(model)

# We create a reference model by sharing 20 layers
ref_model = create_reference_model(model, num_shared_layers=20)
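# With `num_shared_layers=20`, the first 20 transformer layers are shared between the
# active model and the reference model and kept frozen; only the remaining layers are
# duplicated, which saves a significant amount of memory for a 6B-parameter model.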

# We make sure to use the `Adam` optimizer on the model parameters that require gradients.
optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=config.learning_rate)

# The GPT-2 / GPT-J tokenizer does not define a pad token by default, so we set it to the
# eos token for this model.
tokenizer = AutoTokenizer.from_pretrained(config.model_name)
tokenizer.pad_token = tokenizer.eos_token

# We then build the PPOTrainer, passing the model, the reference model, the tokenizer,
# the dataset, the data collator, and the optimizer.
ppo_trainer = PPOTrainer(
    config,
    model,
    ref_model=ref_model,
    tokenizer=tokenizer,
    dataset=dataset,
    data_collator=collator,
    optimizer=optimizer,
)
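# The PPOTrainer wraps the dataset and the collator into `ppo_trainer.dataloader`
# (prepared with accelerate), which is what the training loop below iterates over.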

# We then build the reward pipeline: we will use the toxicity model to compute the reward.
# We first load the toxicity model and tokenizer.
toxicity_model_id = "facebook/roberta-hate-speech-dynabench-r4-target"
toxicity_tokenizer = RobertaTokenizer.from_pretrained(toxicity_model_id)
# We load the toxicity model in fp16 to save memory.
toxicity_model = RobertaForSequenceClassification.from_pretrained(toxicity_model_id, torch_dtype=torch.float16).to(
    ppo_trainer.accelerator.device
)

# We then define the arguments to pass to the `generate` function. These arguments
# are passed to the `generate` function of the PPOTrainer, which is a wrapper around
# the `generate` function of the trained model.
generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
}
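# With `top_k=0.0` and `top_p=1.0` no truncation is applied, so generation is plain
# multinomial sampling from the full distribution; `min_length=-1` removes the minimum
# length constraint, and `max_new_tokens` is set per query from the length sampler below.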

output_min_length = 20
output_max_length = 30
output_length_sampler = LengthSampler(output_min_length, output_max_length)
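# Each response length is drawn uniformly from [output_min_length, output_max_length).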

model_save_path = script_args.model_save_path

for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    query_tensors = batch["input_ids"]

    # Get response from the policy model
    response_tensors = []
    for query in query_tensors:
        gen_len = output_length_sampler()
        generation_kwargs["max_new_tokens"] = gen_len
        response = ppo_trainer.generate(query, **generation_kwargs)
        response_tensors.append(response.squeeze()[-gen_len:])
    batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]

    # Compute the toxicity reward for each response
    texts = batch["response"]
    toxicity_inputs = toxicity_tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(
        ppo_trainer.accelerator.device
    )
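    # Index 0 of the classifier logits corresponds to the non-toxic ("nothate") class of
    # this hate-speech model, so a higher value means a less toxic response; it is used
    # directly as the reward.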
    logits = toxicity_model(**toxicity_inputs).logits.float()
    toxicity_labels = (logits[:, 0]).tolist()

    rewards = [torch.tensor(output) for output in toxicity_labels]

    # Run PPO step
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

    # Save the model every 100 batches (the loop variable `epoch` is really the batch index)
    if epoch % 100 == 0:
        if ppo_trainer.accelerator.is_main_process:
            ppo_trainer.save_pretrained(model_save_path)
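
# After training, the saved policy can be reloaded for inference, e.g. (a sketch that
# assumes the default save path above; the tokenizer can be reloaded from the base model):
#
# model = AutoModelForCausalLM.from_pretrained(script_args.model_save_path, torch_dtype=torch.bfloat16)
# tokenizer = AutoTokenizer.from_pretrained(config.model_name)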