Commit 
							
							·
						
						150c826
	
1
								Parent(s):
							
							1b7c795
								
Upload preprocessing.py
Browse files- preprocessing.py +24 -0
 
    	
        preprocessing.py
    ADDED
    
    | 
         @@ -0,0 +1,24 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            # Length (in tokens) that every tokenized model input is padded or
            # truncated to; passed as `max_length` in `preprocess_function`.
            MAX_INPUT_LENGTH = 256
            # Length (in tokens) that every tokenized target sequence is padded or
            # truncated to; passed as `max_length` in `preprocess_function`.
            MAX_TARGET_LENGTH = 128
         
     | 
| 3 | 
         
            +
             
     | 
| 4 | 
         
            +
            def preprocess_function(examples):
                """
                Build prompt/target strings from a batch of entries and tokenize them.

                Each input is rendered as
                ``"Answer: <provided_answer> Reference: <reference_answer> Question: <question>"``
                and each target as
                ``"<verification_feedback> Feedback: <answer_feedback>"``.

                Params:
                    examples: mapping from column name to a sequence of values, one
                        value per entry. The columns read are 'question',
                        'provided_answer', 'reference_answer',
                        'verification_feedback' and 'answer_feedback'. Presumably
                        this is the batch handed over by a batched `map` call --
                        verify against the caller.
                Returns:
                    model_inputs: the tokenizer output for the inputs, with the
                        token ids of the tokenized targets stored under 'labels'.
                """
                # NOTE(review): `tokenizer` is not defined in this function; it is
                # expected to exist as a module-level/global name at call time.

                # Walk the columns in lockstep instead of indexing each one by
                # position; all columns are assumed to have the same length.
                inputs = [
                    f"Answer: {answer} Reference: {reference} Question: {question}"
                    for answer, reference, question in zip(
                        examples['provided_answer'],
                        examples['reference_answer'],
                        examples['question'],
                    )
                ]
                targets = [
                    f"{verification} Feedback: {feedback}"
                    for verification, feedback in zip(
                        examples['verification_feedback'],
                        examples['answer_feedback'],
                    )
                ]

                # Tokenize inputs and targets; both are padded/truncated to a fixed
                # length so every entry ends up with the same number of tokens.
                model_inputs = tokenizer(
                    inputs,
                    max_length=MAX_INPUT_LENGTH,
                    padding='max_length',
                    truncation=True,
                )
                labels = tokenizer(
                    text_target=targets,
                    max_length=MAX_TARGET_LENGTH,
                    padding='max_length',
                    truncation=True,
                )

                # NOTE(review): the label ids still contain the padding positions
                # added above; whether those are masked out before the loss is
                # computed is not visible here -- confirm in the training code.
                model_inputs['labels'] = labels['input_ids']

                return model_inputs
         
     |