Spaces:

suryadev1
/

astra

Sleeping

App Files Files Community

astra / hint_fine_tuning.py

suryadev1

fine

cecfca1 about 1 year ago

raw

history blame

15.5 kB

	import argparse
	import os
	import sys
	import torch
	import torch.nn as nn
	from torch.utils.data import DataLoader, random_split, TensorDataset
	from src.dataset import TokenizerDataset
	from src.bert import BERT
	from src.pretrainer import BERTFineTuneTrainer1
	from src.vocab import Vocab
	import pandas as pd


	# class CustomBERTModel(nn.Module):
	# def __init__(self, vocab_size, output_dim, pre_trained_model_path):
	# super(CustomBERTModel, self).__init__()
	# hidden_size = 768
	# self.bert = BERT(vocab_size=vocab_size, hidden=hidden_size, n_layers=12, attn_heads=12, dropout=0.1)
	# checkpoint = torch.load(pre_trained_model_path, map_location=torch.device('cpu'))
	# if isinstance(checkpoint, dict):
	# self.bert.load_state_dict(checkpoint)
	# elif isinstance(checkpoint, BERT):
	# self.bert = checkpoint
	# else:
	# raise TypeError(f"Expected state_dict or BERT instance, got {type(checkpoint)} instead.")
	# self.fc = nn.Linear(hidden_size, output_dim)

	# def forward(self, sequence, segment_info):
	# sequence = sequence.to(next(self.parameters()).device)
	# segment_info = segment_info.to(sequence.device)

	# if sequence.size(0) == 0 or sequence.size(1) == 0:
	# raise ValueError("Input sequence tensor has 0 elements. Check data preprocessing.")

	# x = self.bert(sequence, segment_info)
	# print(f"BERT output shape: {x.shape}")

	# if x.size(0) == 0 or x.size(1) == 0:
	# raise ValueError("BERT output tensor has 0 elements. Check input dimensions.")

	# cls_embeddings = x[:, 0]
	# logits = self.fc(cls_embeddings)
	# return logits

	# class CustomBERTModel(nn.Module):
	# def __init__(self, vocab_size, output_dim, pre_trained_model_path):
	# super(CustomBERTModel, self).__init__()
	# hidden_size = 764 # Ensure this is 768
	# self.bert = BERT(vocab_size=vocab_size, hidden=hidden_size, n_layers=12, attn_heads=12, dropout=0.1)

	# # Load the pre-trained model's state_dict
	# checkpoint = torch.load(pre_trained_model_path, map_location=torch.device('cpu'))
	# if isinstance(checkpoint, dict):
	# self.bert.load_state_dict(checkpoint)
	# else:
	# raise TypeError(f"Expected state_dict, got {type(checkpoint)} instead.")

	# # Fully connected layer with input size 768
	# self.fc = nn.Linear(hidden_size, output_dim)

	# def forward(self, sequence, segment_info):
	# sequence = sequence.to(next(self.parameters()).device)
	# segment_info = segment_info.to(sequence.device)

	# x = self.bert(sequence, segment_info)
	# print(f"BERT output shape: {x.shape}") # Should output (batch_size, seq_len, 768)

	# cls_embeddings = x[:, 0] # Extract CLS token embeddings
	# print(f"CLS Embeddings shape: {cls_embeddings.shape}") # Should output (batch_size, 768)

	# logits = self.fc(cls_embeddings) # Should now pass a tensor of size (batch_size, 768) to `fc`

	# return logits


	# for test
	class CustomBERTModel(nn.Module):
	    """Pre-trained BERT encoder followed by a linear classification head.

	    The encoder is constructed from the given hyper-parameters, its weights
	    are loaded from ``pre_trained_model_path``, and the encoder output at
	    position 0 of every sequence is passed through ``fc`` to produce logits.

	    Args:
	        vocab_size: Vocabulary size forwarded to ``BERT``.
	        output_dim: Number of logits produced by the linear head.
	        pre_trained_model_path: File that ``torch.load`` must resolve to a
	            ``dict`` (state_dict) accepted by ``BERT.load_state_dict``.
	        hidden: Encoder hidden size; also the input width of ``fc``.
	        n_layers: Number of encoder layers forwarded to ``BERT``.
	        attn_heads: Number of attention heads forwarded to ``BERT``.
	        dropout: Dropout rate forwarded to ``BERT``.

	    The four keyword defaults reproduce the values that were previously
	    hard-coded, so existing three-argument callers behave as before.

	    Raises:
	        TypeError: If the loaded checkpoint is not a ``dict``.
	    """

	    def __init__(self, vocab_size, output_dim, pre_trained_model_path,
	                 hidden=764, n_layers=12, attn_heads=12, dropout=0.1):
	        super().__init__()
	        # NOTE(review): 764 is kept as the default for backward compatibility,
	        # but it is not a multiple of the 12 attention heads and the older
	        # commented-out revisions of this class mention 768. Confirm the value
	        # against the checkpoint that is actually being loaded.
	        self.hidden = hidden
	        self.bert = BERT(
	            vocab_size=vocab_size,
	            hidden=self.hidden,
	            n_layers=n_layers,
	            attn_heads=attn_heads,
	            dropout=dropout,
	        )

	        # NOTE: torch.load unpickles the file; only load checkpoints from a
	        # trusted source.
	        checkpoint = torch.load(pre_trained_model_path, map_location=torch.device('cpu'))
	        if not isinstance(checkpoint, dict):
	            raise TypeError(f"Expected state_dict, got {type(checkpoint)} instead.")
	        self.bert.load_state_dict(checkpoint)

	        self.fc = nn.Linear(self.hidden, output_dim)

	    def forward(self, sequence, segment_info):
	        """Return classification logits for a batch.

	        Both arguments are passed straight to the encoder.
	        Assumes the encoder returns (batch, seq_len, hidden) - TODO confirm.
	        """
	        encoded = self.bert(sequence, segment_info)
	        # Position 0 of each sequence is used as the sequence summary.
	        cls_embeddings = encoded[:, 0]
	        return self.fc(cls_embeddings)

	def preprocess_labels(label_csv_path):
	    """Load the ``last_hint_class`` column of a CSV file as a long tensor.

	    Args:
	        label_csv_path: Path of the CSV file holding the labels.

	    Returns:
	        A 1-D ``torch.long`` tensor of the labels, or ``None`` when reading
	        or converting the file fails (the error is printed).
	    """
	    label_tensor = None
	    try:
	        hint_column = pd.read_csv(label_csv_path)['last_hint_class']
	        label_tensor = torch.tensor(hint_column.values.astype(int), dtype=torch.long)
	    except Exception as err:
	        print(f"Error reading dataset file: {err}")
	    return label_tensor


	def preprocess_data(data_path, vocab, max_length=128):
	    """Tokenize a text file, one sequence per line, into fixed-width tensors.

	    Blank lines are skipped. Each remaining line is encoded with
	    ``vocab.to_seq`` and then truncated / right-padded with the ``[PAD]`` id
	    (0 when the vocabulary has no such entry) to ``max_length``. The segment
	    labels are all zeros.

	    Args:
	        data_path: Path of the text file to read.
	        vocab: Object exposing ``to_seq(text, seq_len=...)`` and a ``vocab``
	            mapping.
	        max_length: Width of every encoded row.

	    Returns:
	        ``(input_ids, segment_labels)``, both of shape (rows, max_length), or
	        ``(None, None)`` when the file cannot be read (the error is printed).

	    Raises:
	        ValueError: If the file has no lines, no line yields a sequence, or
	            no encoded sequence has the expected length.
	    """
	    try:
	        with open(data_path, 'r') as handle:
	            raw_lines = handle.readlines()
	    except Exception as err:
	        print(f"Error reading data file: {err}")
	        return None, None

	    if not raw_lines:
	        raise ValueError(f"No sequences found in data file {data_path}. Check the file content.")

	    rows = []
	    for raw_line in raw_lines:
	        text = raw_line.strip()
	        if not text:
	            continue

	        token_ids = vocab.to_seq(text, seq_len=max_length)
	        pad_count = max_length - len(token_ids)
	        # A negative pad_count yields an empty padding list, so over-long
	        # encodings are simply truncated.
	        token_ids = token_ids[:max_length] + [vocab.vocab.get('[PAD]', 0)] * pad_count
	        rows.append((torch.tensor(token_ids), torch.tensor([0] * max_length)))

	    if not rows:
	        raise ValueError("Tokenization resulted in an empty list. Check the sequences and tokenization logic.")

	    # Keep only rows whose encoded width is exactly max_length.
	    rows = [pair for pair in rows if len(pair[0]) == max_length]

	    if not rows:
	        raise ValueError("All tokenized sequences are of unexpected length. This suggests an issue with the tokenization logic.")

	    input_ids = torch.stack([ids for ids, _ in rows], dim=0)
	    segment_labels = torch.stack([segments for _, segments in rows], dim=0)

	    print(f"Input IDs shape: {input_ids.shape}")
	    print(f"Segment labels shape: {segment_labels.shape}")

	    return input_ids, segment_labels


	def collate_fn(batch):
	    """Assemble dataset samples into a single batch dictionary.

	    Two sample layouts are accepted:

	    * a ``dict`` with the keys ``'input_ids'``, ``'label'`` and
	      ``'segment_label'``;
	    * a 3-element tuple/list of tensors ordered
	      ``(input_ids, segment_label, label)``, which is what
	      ``TensorDataset(input_ids, segment_labels, labels)`` yields.

	    Previously only dict samples were recognised, so every sample coming
	    from a ``TensorDataset`` was silently dropped and the batch came back
	    empty.

	    ``None`` samples and samples of any other type are skipped.

	    Args:
	        batch: Iterable of samples as described above.

	    Returns:
	        A dict with the keys ``'input'``, ``'label'`` and ``'segment_label'``
	        holding the samples concatenated along a new leading dimension, or
	        ``None`` when no usable sample is present or the tensors cannot be
	        concatenated (a message is printed in both cases).
	    """
	    inputs = []
	    labels = []
	    segment_labels = []

	    for item in batch:
	        if item is None:
	            continue

	        if isinstance(item, dict):
	            input_ids = item['input_ids']
	            label = item['label']
	            segment_label = item['segment_label']
	        elif (
	            isinstance(item, (tuple, list))
	            and len(item) == 3
	            and all(isinstance(part, torch.Tensor) for part in item)
	        ):
	            # Positional layout of TensorDataset(input_ids, segment_labels, labels).
	            input_ids, segment_label, label = item
	        else:
	            continue

	        inputs.append(input_ids.unsqueeze(0))
	        labels.append(label.unsqueeze(0))
	        segment_labels.append(segment_label.unsqueeze(0))

	    if not inputs or not segment_labels:
	        print("Empty batch encountered. Returning None to skip this batch.")
	        return None

	    try:
	        inputs = torch.cat(inputs, dim=0)
	        labels = torch.cat(labels, dim=0)
	        segment_labels = torch.cat(segment_labels, dim=0)
	    except Exception as e:
	        # Typically a shape mismatch between samples.
	        print(f"Error concatenating tensors: {e}")
	        return None

	    return {
	        'input': inputs,
	        'label': labels,
	        'segment_label': segment_labels
	    }

	def custom_collate_fn(batch):
	    """Collate ``batch`` with ``collate_fn``, never returning an empty batch.

	    Returns:
	        The dict produced by ``collate_fn`` when it holds at least one row;
	        otherwise a one-row placeholder batch of zeros (inputs and segment
	        labels of width 128, label 0).
	    """
	    collated = collate_fn(batch)

	    if collated is not None and len(collated['input']) != 0:
	        return collated

	    # Fall back to a single all-zero sample so the consumer always receives
	    # a batch with at least one element.
	    placeholder = {
	        'input': torch.zeros((1, 128), dtype=torch.long),
	        'label': torch.zeros((1,), dtype=torch.long),
	        'segment_label': torch.zeros((1, 128), dtype=torch.long)
	    }
	    return placeholder


	def train_without_progress_status(trainer, epoch, shuffle):
	    """Run ``epoch`` passes over ``trainer.train_data``, validating each batch.

	    ``None`` batches are skipped. Every other batch must be a dict containing
	    the keys ``'input_ids'``, ``'segment_label'`` and ``'labels'`` whose
	    values are all tensors; it is then printed and handed to
	    ``trainer.iteration(epoch_idx, batch)``.

	    Args:
	        trainer: Object exposing ``train_data`` (iterable of batches) and
	            ``iteration(epoch_idx, batch)``.
	        epoch: Number of passes to run.
	        shuffle: Not used by this function.

	    Raises:
	        ValueError: If a batch is a string, is not a dict, lacks a required
	            key, or holds a non-tensor value.
	        Exception: Anything raised by ``trainer.iteration`` is printed and
	            re-raised.
	    """
	    # NOTE(review): these keys differ from the ones collate_fn emits
	    # ('input', 'label', 'segment_label') - confirm which schema is intended.
	    required_keys = ('input_ids', 'segment_label', 'labels')

	    for epoch_idx in range(epoch):
	        print(f"EP_train:{epoch_idx}:")
	        for batch in trainer.train_data:
	            if batch is None:
	                continue

	            # A string batch signals an upstream data problem.
	            if isinstance(batch, str):
	                print(f"Error: Received a string instead of a dictionary in batch: {batch}")
	                raise ValueError(f"Unexpected string in batch: {batch}")

	            if not isinstance(batch, dict):
	                print(f"Error: Expected batch to be a dictionary but got {type(batch)} instead.")
	                raise ValueError(f"Invalid batch structure: {batch}")

	            if any(key not in batch for key in required_keys):
	                print(f"Error: Batch missing expected keys. Batch keys: {batch.keys()}")
	                raise ValueError("Batch does not contain expected keys.")

	            if not all(isinstance(value, torch.Tensor) for value in batch.values()):
	                print(f"Error: Expected all values in batch to be tensors, but got: {batch}")
	                raise ValueError("Batch contains non-tensor values.")

	            try:
	                print(f"Batch Structure: {batch}")  # Debugging batch before iteration
	                trainer.iteration(epoch_idx, batch)
	            except Exception as e:
	                print(f"Error during batch processing: {e}")
	                sys.stdout.flush()
	                raise e  # Propagate the exception for better debugging

	# def main(opt):
	# # device = torch.device("cpu")
	# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

	# vocab = Vocab(opt.vocab_file)
	# vocab.load_vocab()

	# input_ids, segment_labels = preprocess_data(opt.data_path, vocab, max_length=128)
	# labels = preprocess_labels(opt.dataset)

	# if input_ids is None or segment_labels is None or labels is None:
	# print("Error in preprocessing data. Exiting.")
	# return

	# dataset = TensorDataset(input_ids, segment_labels, torch.tensor(labels, dtype=torch.long))
	# val_size = len(dataset) - int(0.8 * len(dataset))
	# val_dataset, train_dataset = random_split(dataset, [val_size, len(dataset) - val_size])

	# train_dataloader = DataLoader(
	# train_dataset,
	# batch_size=32,
	# shuffle=True,
	# collate_fn=custom_collate_fn
	# )
	# val_dataloader = DataLoader(
	# val_dataset,
	# batch_size=32,
	# shuffle=False,
	# collate_fn=custom_collate_fn
	# )

	# custom_model = CustomBERTModel(
	# vocab_size=len(vocab.vocab),
	# output_dim=2,
	# pre_trained_model_path=opt.pre_trained_model_path
	# ).to(device)

	# trainer = BERTFineTuneTrainer1(
	# bert=custom_model.bert,
	# vocab_size=len(vocab.vocab),
	# train_dataloader=train_dataloader,
	# test_dataloader=val_dataloader,
	# lr=5e-5,
	# num_labels=2,
	# with_cuda=torch.cuda.is_available(),
	# log_freq=10,
	# workspace_name=opt.output_dir,
	# log_folder_path=opt.log_folder_path
	# )

	# trainer.train(epoch=20)

	# # os.makedirs(opt.output_dir, exist_ok=True)
	# # output_model_file = os.path.join(opt.output_dir, 'fine_tuned_model.pth')
	# # torch.save(custom_model.state_dict(), output_model_file)
	# # print(f'Model saved to {output_model_file}')

	# os.makedirs(opt.output_dir, exist_ok=True)
	# output_model_file = os.path.join(opt.output_dir, 'fine_tuned_model_2.pth')
	# torch.save(custom_model, output_model_file)
	# print(f'Model saved to {output_model_file}')


	def main(opt):
	    """Fine-tune the pre-trained BERT encoder and save the resulting model.

	    Steps: load the vocabulary, encode the sequences and labels, split them
	    80/20 into training and validation sets, run ``BERTFineTuneTrainer1`` for
	    20 epochs and pickle the whole ``CustomBERTModel`` to
	    ``<output_dir>/fine_tuned_model_2.pth``.

	    Args:
	        opt: Parsed command-line namespace providing ``vocab_file``,
	            ``data_path``, ``dataset``, ``pre_trained_model_path``,
	            ``output_dir`` and ``log_folder_path``.

	    Raises:
	        ValueError: If the number of encoded sequences differs from the
	            number of labels.
	    """
	    # Set device to GPU if available, otherwise use CPU
	    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

	    print(torch.cuda.is_available())  # True when a GPU is available
	    print(torch.cuda.device_count())

	    # Load vocabulary
	    vocab = Vocab(opt.vocab_file)
	    vocab.load_vocab()

	    # Preprocess data and labels
	    input_ids, segment_labels = preprocess_data(opt.data_path, vocab, max_length=128)
	    labels = preprocess_labels(opt.dataset)

	    if input_ids is None or segment_labels is None or labels is None:
	        print("Error in preprocessing data. Exiting.")
	        return

	    # Fail with a readable message instead of relying on the size assertion
	    # inside TensorDataset (which is stripped under ``python -O``).
	    if len(input_ids) != len(labels):
	        raise ValueError(
	            f"Number of sequences ({len(input_ids)}) does not match number of labels ({len(labels)})."
	        )

	    # Transfer tensors to the correct device (GPU/CPU). ``labels`` is already
	    # a tensor, so convert it in place of re-wrapping it with torch.tensor(),
	    # which would emit PyTorch's copy-construct warning.
	    input_ids = input_ids.to(device)
	    segment_labels = segment_labels.to(device)
	    labels = labels.to(device=device, dtype=torch.long)

	    # Create TensorDataset and split into train and validation sets
	    dataset = TensorDataset(input_ids, segment_labels, labels)
	    val_size = len(dataset) - int(0.8 * len(dataset))
	    val_dataset, train_dataset = random_split(dataset, [val_size, len(dataset) - val_size])

	    # Create DataLoaders for training and validation
	    train_dataloader = DataLoader(
	        train_dataset,
	        batch_size=32,
	        shuffle=True,
	        collate_fn=custom_collate_fn
	    )
	    val_dataloader = DataLoader(
	        val_dataset,
	        batch_size=32,
	        shuffle=False,
	        collate_fn=custom_collate_fn
	    )

	    # Initialize custom BERT model and move it to the device
	    custom_model = CustomBERTModel(
	        vocab_size=len(vocab.vocab),
	        output_dim=2,
	        pre_trained_model_path=opt.pre_trained_model_path
	    ).to(device)

	    # Initialize the fine-tuning trainer.
	    # NOTE(review): only the encoder (custom_model.bert) is handed to the
	    # trainer; whether custom_model.fc is ever updated depends on
	    # BERTFineTuneTrainer1 - confirm the saved head is the trained one.
	    trainer = BERTFineTuneTrainer1(
	        bert=custom_model.bert,
	        vocab_size=len(vocab.vocab),
	        train_dataloader=train_dataloader,
	        test_dataloader=val_dataloader,
	        lr=5e-5,
	        num_labels=2,
	        with_cuda=torch.cuda.is_available(),
	        log_freq=10,
	        workspace_name=opt.output_dir,
	        log_folder_path=opt.log_folder_path
	    )

	    # Train the model
	    trainer.train(epoch=20)

	    # Save the whole model object (not just its state_dict) to the output
	    # directory.
	    os.makedirs(opt.output_dir, exist_ok=True)
	    output_model_file = os.path.join(opt.output_dir, 'fine_tuned_model_2.pth')
	    torch.save(custom_model, output_model_file)
	    print(f'Model saved to {output_model_file}')


	if __name__ == '__main__':
	    # Command-line options as (flag, default value, help text); all are strings.
	    cli_options = (
	        ('--dataset', '/home/jupyter/bert/dataset/hint_based/ratio_proportion_change_3/er/er_train.csv', 'Path to the dataset file.'),
	        ('--data_path', '/home/jupyter/bert/ratio_proportion_change3_1920/_Aug23/gt/er.txt', 'Path to the input sequence file.'),
	        ('--output_dir', '/home/jupyter/bert/ratio_proportion_change3_1920/_Aug23/output/hint_classification', 'Directory to save the fine-tuned model.'),
	        ('--pre_trained_model_path', '/home/jupyter/bert/ratio_proportion_change3_1920/output/pretrain:1800ms:64hs:4l:8a:50s:64b:1000e:-5lr/bert_trained.seq_encoder.model.ep68', 'Path to the pre-trained BERT model.'),
	        ('--vocab_file', '/home/jupyter/bert/ratio_proportion_change3_1920/_Aug23/pretraining/vocab.txt', 'Path to the vocabulary file.'),
	        ('--log_folder_path', '/home/jupyter/bert/ratio_proportion_change3_1920/logs/oct_logs', 'Path to the folder for saving logs.'),
	    )

	    parser = argparse.ArgumentParser(description='Fine-tune BERT model.')
	    for flag, default_value, help_text in cli_options:
	        parser.add_argument(flag, type=str, default=default_value, help=help_text)

	    opt = parser.parse_args()
	    main(opt)