IoannisKat1 committed
Commit 6b2391b · verified · 1 Parent(s): b4f1dac

Update app.py

Files changed (1): app.py (+23 -25)
app.py CHANGED
@@ -20,31 +20,29 @@ storage_context = StorageContext.from_defaults(persist_dir=dir)
 index = load_index_from_storage(storage_context)
 query_engine = index.as_query_engine(similarity_top_k=8)
 
-def generate_response(instruction, chat_history):
-    """Generates a response using your fine-tuned model."""
-    # FastLanguageModel.for_inference(model)  # Enable native 2x faster inference within the function
-    prompt = f"""### Instruction:
-    Answer the following question.
-
-    ### Question:
-    {instruction}
-
-    Provide a unique, concise, and non-repetitive answer.
-    ### Answer:"""
-
-    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
-    with torch.no_grad():
-        outputs = model.generate(**inputs, early_stopping=True, min_length=50, length_penalty=2, do_sample=True, max_new_tokens=300,
-                                 top_p=0.95,
-                                 top_k=50,
-                                 temperature=0.7,
-                                 repetition_penalty=1.2,
-                                 num_return_sequences=1
-                                 )
-
-    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    response = response.split("### Answer:")[-1]
-    return response
+# def generate_response(instruction, chat_history):
+#     """Generates a response using your fine-tuned model."""
+#     # FastLanguageModel.for_inference(model)  # Enable native 2x faster inference within the function
+#     prompt = f"""### Instruction:
+#     Answer the following question.
+#     ### Question:
+#     {instruction}
+#     Provide a unique, concise, and non-repetitive answer.
+#     ### Answer:"""
+
+#     inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+#     with torch.no_grad():
+#         outputs = model.generate(**inputs, early_stopping=True, min_length=50, length_penalty=2, do_sample=True, max_new_tokens=300,
+#                                  top_p=0.95,
+#                                  top_k=50,
+#                                  temperature=0.7,
+#                                  repetition_penalty=1.2,
+#                                  num_return_sequences=1
+#                                  )
+
+#     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+#     response = response.split("### Answer:")[-1]
+#     return response
 
 def generate_response_rag_index(instruction, chat_history):
     response = query_engine.query(instruction)
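
For context, the persisted LlamaIndex store that app.py reloads via StorageContext.from_defaults and load_index_from_storage has to be built and written to disk ahead of time. Below is a minimal sketch of that one-time build step, assuming the llama_index.core import paths (llama-index >= 0.10); the "data/" folder and PERSIST_DIR name are hypothetical stand-ins, since the actual corpus and persist_dir used by this Space are not shown in the diff.

# Sketch: building and persisting the index that app.py later reloads.
from llama_index.core import (
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
    load_index_from_storage,
)

PERSIST_DIR = "index_storage"  # hypothetical; app.py reads its own persist_dir

# One-time build: embed the source documents and write the index to disk.
documents = SimpleDirectoryReader("data").load_data()  # "data/" is a placeholder corpus
index = VectorStoreIndex.from_documents(documents)
index.storage_context.persist(persist_dir=PERSIST_DIR)

# Later, as in app.py: reload the persisted index and query it.
storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
index = load_index_from_storage(storage_context)
query_engine = index.as_query_engine(similarity_top_k=8)
print(query_engine.query("What does this corpus cover?"))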