import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
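
# Interactive inference with a merged model stored at a local path. The prompt
# format used below ("GPT4 Correct User: ...<|end_of_turn|>GPT4 Correct
# Assistant:") is the OpenChat chat template, so the checkpoint is assumed to
# be an OpenChat-style model.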
					
						
model_id = "/share/models/open-zharfa"

# Do not pass add_eos_token=True when tokenizing prompts for generation: it
# appends an EOS token to every encoded prompt, which tells the model the
# sequence is already finished before generation starts.
tokenizer = AutoTokenizer.from_pretrained(model_id)

base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
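
# device_map="auto" lets Accelerate spread the layers across the available
# GPU(s), and torch.float16 halves the memory footprint compared with the
# default float32 load.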
					
						
# Make sampling the default so generate() honors the temperature setting
# instead of falling back to greedy decoding.
base_model.generation_config.do_sample = True

# Reuse UNK as the pad token (the tokenizer appears to define no pad token of
# its own); the same id is passed as pad_token_id to generate() below.
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = "right"
						

def get_completion_merged(query: str, model, tokenizer) -> str:
    # Keep the template on one line so no stray newlines or leading spaces
    # from the source file end up in the prompt.
    prompt_template = "GPT4 Correct User: {query}<|end_of_turn|>GPT4 Correct Assistant:"
    prompt = prompt_template.format(query=query)

    encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
    # device_map="auto" may place the model across several devices, so target
    # model.device rather than hard-coding "cuda:0".
    model_inputs = encodeds.to(model.device)

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=1000,
        do_sample=True,
        temperature=0.5,
        pad_token_id=tokenizer.unk_token_id,
    )
    decoded = tokenizer.batch_decode(generated_ids)
    return decoded[0]
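
# Example: result = get_completion_merged("Hello!", base_model, tokenizer)
# ("Hello!" is just a placeholder query). The returned string is the full
# decoded sequence, prompt included; the model's answer is everything after
# "GPT4 Correct Assistant:".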
					
						
while True:
    q = input('q : ')
    result = get_completion_merged(query=q, model=base_model, tokenizer=tokenizer)
    print(result)
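
# The loop runs until interrupted (Ctrl+C). Because batch_decode() is called
# without skip_special_tokens=True, the printed text keeps the prompt and
# special markers such as <|end_of_turn|>.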