Spaces:
Runtime error
Runtime error
import os
import subprocess
import sys

# Install the runtime dependencies before the imports below need them.
# ``sys.executable -m pip`` targets the interpreter that is running this
# script, and the argument list avoids going through a shell. The install
# stays best-effort: a failure is reported but does not abort, because the
# packages may already be present in the environment.
_pip_result = subprocess.run(
    [sys.executable, "-m", "pip", "install", "torch", "datasets", "transformers"],
    check=False,
)
if _pip_result.returncode != 0:
    print(
        f"Warning: pip install exited with status {_pip_result.returncode}; "
        "continuing with the packages that are already installed.",
        file=sys.stderr,
    )
| from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW | |
| from torch.utils.data import Dataset, DataLoader | |
| from datasets import load_dataset | |
| import torch | |
# GPT-2 tokenizer, used to turn the chat text into token ids
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# UltraChat 200k conversations, supervised fine-tuning training split
dataset = load_dataset(
    path="HuggingFaceH4/ultrachat_200k",
    split="train_sft",
)
class MyDataset(Dataset):
    """Fixed-length language-model samples built from chat conversations.

    Each element of ``data`` is indexed as a sequence of message dicts; the
    ``'content'`` of the first two messages is used as the user and assistant
    turns respectively. A missing message or a missing ``'content'`` key
    contributes an empty string instead of raising.

    Args:
        data: Indexable collection of conversations.
        max_length: Number of token positions in every returned sample.
        tokenizer: Optional tokenizer exposing ``encode`` (and ideally
            ``eos_token_id``). When ``None``, the module-level ``tokenizer``
            is looked up at item-access time.
    """

    def __init__(self, data, max_length=512, tokenizer=None):
        self.data = data
        self.max_length = max_length
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of conversations."""
        return len(self.data)

    @staticmethod
    def _content(messages, position):
        """Return the 'content' of ``messages[position]``, or "" if absent."""
        if position >= len(messages):
            return ""
        return messages[position].get('content', "")

    def __getitem__(self, idx):
        """Tokenize conversation ``idx`` into a padded, fixed-length sample.

        Returns:
            A dict with two ``torch.long`` tensors of shape
            ``(1, max_length)``: ``'input_ids'`` (truncated, then right-padded)
            and ``'attention_mask'`` (1 on real tokens, 0 on padding).
        """
        tok = self.tokenizer if self.tokenizer is not None else tokenizer
        messages = self.data[idx]

        # Combine user and assistant messages into a single text
        user_content = self._content(messages, 0)
        assistant_content = self._content(messages, 1)
        text = f"User: {user_content} Assistant: {assistant_content}"

        # Ask the tokenizer to truncate so over-long conversations are cut
        # while encoding; the slice below enforces the limit regardless.
        input_ids = tok.encode(
            text,
            return_tensors='pt',
            truncation=True,
            max_length=self.max_length,
        ).long()
        input_ids = input_ids[:, :self.max_length]

        # Every position produced by the tokenizer is a real token.
        attention_mask = torch.ones_like(input_ids)

        # Id 0 is an ordinary GPT-2 vocabulary entry, so pad with the
        # end-of-sequence id when the tokenizer defines one.
        pad_id = getattr(tok, 'eos_token_id', None)
        if pad_id is None:
            pad_id = 0

        pad_width = self.max_length - input_ids.size(1)
        input_ids = torch.nn.functional.pad(input_ids, (0, pad_width), 'constant', pad_id)
        attention_mask = torch.nn.functional.pad(attention_mask, (0, pad_width), 'constant', 0)

        # The leading dimension of size 1 is kept (the tensors are not squeezed).
        return {'input_ids': input_ids, 'attention_mask': attention_mask}
# Create DataLoader without collate_fn: the default collate function stacks
# the per-sample tensors returned by MyDataset into a batch.
my_dataset = MyDataset(dataset['messages'])
dataloader = DataLoader(my_dataset, batch_size=4, shuffle=True)

# Load pre-trained model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Move model to GPU if available, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer. The AdamW class shipped with transformers is deprecated
# in favour of the PyTorch implementation; eps and weight_decay are set
# explicitly so the update rule keeps the deprecated class's defaults.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=5e-5,
    eps=1e-6,
    weight_decay=0.0,
)
# Fine-tuning Loop
# from_pretrained() returns the model in evaluation mode (dropout disabled),
# so switch to training mode explicitly before optimizing.
model.train()

for epoch in range(1):
    total_loss = 0.0
    for i, batch in enumerate(dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}

        # Causal-LM labels are the inputs themselves. When the batch carries
        # an attention mask, padded positions are set to -100 so the loss
        # ignores them; without a mask the inputs are used unchanged.
        labels = batch['input_ids']
        attention_mask = batch.get('attention_mask')
        if attention_mask is not None:
            labels = labels.masked_fill(attention_mask == 0, -100)

        outputs = model(**batch, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()
        if (i + 1) % 100 == 0:  # Print loss every 100 batches
            average_loss = total_loss / 100
            print(f"Epoch: {epoch + 1}, Batch: {i + 1}, Average Loss: {average_loss:.4f}")
            total_loss = 0.0

print("Training complete!")

# NOTE(review): both paths are absolute and point at the filesystem root;
# confirm the process is allowed to write there in the target environment.
model.save_pretrained('/gpt2_finetuned')
tokenizer.save_pretrained('/gpt2_finetuned/tokenizer')
print("Model Saved! \n Enjoy the model Now!")