IoannisKat1 committed
Commit 6b2391b · verified · 1 Parent(s): b4f1dac

Update app.py

Files changed (1): app.py (+23 -25)
app.py CHANGED
@@ -20,31 +20,29 @@ storage_context = StorageContext.from_defaults(persist_dir=dir)
 index = load_index_from_storage(storage_context)
 query_engine = index.as_query_engine(similarity_top_k=8)
 
-def generate_response(instruction, chat_history):
-    """Generates a response using your fine-tuned model."""
-    # FastLanguageModel.for_inference(model)  # Enable native 2x faster inference within the function
-    prompt = f"""### Instruction:
-    Answer the following question.
-
-    ### Question:
-    {instruction}
-
-    Provide a unique, concise, and non-repetitive answer.
-    ### Answer:"""
-
-    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
-    with torch.no_grad():
-        outputs = model.generate(**inputs, early_stopping=True, min_length=50, length_penalty=2, do_sample=True, max_new_tokens=300,
-                                 top_p=0.95,
-                                 top_k=50,
-                                 temperature=0.7,
-                                 repetition_penalty=1.2,
-                                 num_return_sequences=1
-                                 )
-
-    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    response = response.split("### Answer:")[-1]
-    return response
+# def generate_response(instruction, chat_history):
+#     """Generates a response using your fine-tuned model."""
+#     # FastLanguageModel.for_inference(model)  # Enable native 2x faster inference within the function
+#     prompt = f"""### Instruction:
+#     Answer the following question.
+#     ### Question:
+#     {instruction}
+#     Provide a unique, concise, and non-repetitive answer.
+#     ### Answer:"""
+
+#     inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+#     with torch.no_grad():
+#         outputs = model.generate(**inputs, early_stopping=True, min_length=50, length_penalty=2, do_sample=True, max_new_tokens=300,
+#                                  top_p=0.95,
+#                                  top_k=50,
+#                                  temperature=0.7,
+#                                  repetition_penalty=1.2,
+#                                  num_return_sequences=1
+#                                  )
+
+#     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+#     response = response.split("### Answer:")[-1]
+#     return response
 
 def generate_response_rag_index(instruction, chat_history):
     response = query_engine.query(instruction)
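
For context, the persisted LlamaIndex store that app.py reloads via StorageContext.from_defaults and load_index_from_storage has to be built and written to disk ahead of time. Below is a minimal sketch of that one-time build step, assuming the llama_index.core import paths (llama-index >= 0.10); the "data/" folder and PERSIST_DIR name are hypothetical stand-ins, since the actual corpus and persist_dir used by this Space are not shown in the diff.

# Sketch: building and persisting the index that app.py later reloads.
from llama_index.core import (
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
    load_index_from_storage,
)

PERSIST_DIR = "index_storage"  # hypothetical; app.py reads its own persist_dir

# One-time build: embed the source documents and write the index to disk.
documents = SimpleDirectoryReader("data").load_data()  # "data/" is a placeholder corpus
index = VectorStoreIndex.from_documents(documents)
index.storage_context.persist(persist_dir=PERSIST_DIR)

# Later, as in app.py: reload the persisted index and query it.
storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
index = load_index_from_storage(storage_context)
query_engine = index.as_query_engine(similarity_top_k=8)
print(query_engine.query("What does this corpus cover?"))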