# from unsloth import FastLanguageModel
# import torch
from llama_index.core.storage.storage_context import StorageContext
from llama_index.core.indices.loading import load_index_from_storage
from openai import OpenAI
import gradio as gr

# model, tokenizer = FastLanguageModel.from_pretrained('./unified_model')
# client = FastLanguageModel.for_inference(model)

# Directory holding the persisted llama_index storage for the AILA corpus.
INDEX_DIR = 'aila_indices'

# Initialize the OpenAI client (the API key is read from the OPENAI_API_KEY environment variable).
client = OpenAI()

# Load the persisted index and build a retrieval query engine over it.
storage_context = StorageContext.from_defaults(persist_dir=INDEX_DIR)
index = load_index_from_storage(storage_context)
query_engine = index.as_query_engine(similarity_top_k=8)


# def generate_response(instruction, chat_history):
#     """Generates a response using your fine-tuned model."""
#     # FastLanguageModel.for_inference(model)  # Enable native 2x faster inference within the function
#     prompt = f"""### Instruction:
# Answer the following question.
# ### Question:
# {instruction}
# Provide a unique, concise, and non-repetitive answer.
# ### Answer:"""
#     inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
#     with torch.no_grad():
#         outputs = model.generate(
#             **inputs,
#             early_stopping=True,
#             min_length=50,
#             length_penalty=2,
#             do_sample=True,
#             max_new_tokens=300,
#             top_p=0.95,
#             top_k=50,
#             temperature=0.7,
#             repetition_penalty=1.2,
#             num_return_sequences=1,
#         )
#     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     response = response.split("### Answer:")[-1]
#     return response


def generate_response_rag_index(instruction, chat_history):
    """Answer a question via the RAG query engine (chat_history is currently unused)."""
    response = query_engine.query(instruction)
    return str(response)


def update_chat_history(chat_history, user_message, bot_message):
    """Update chat history to maintain relevance and avoid excessive growth."""
    chat_history['user'].append(user_message)
    chat_history['bot'].append(bot_message)
    # Keep only the last 5 interactions.
    if len(chat_history['user']) > 5:
        chat_history['user'] = chat_history['user'][-5:]
        chat_history['bot'] = chat_history['bot'][-5:]
    return chat_history


def chatbot(input_text, chat_history):
    """Gradio callback: answer the user's message and append the exchange to the chat history."""
    messages = {
        "user": [],
        "bot": [],
    }
    for user_msg, bot_msg in chat_history:
        messages["user"].append(user_msg)
        messages["bot"].append(bot_msg)

    # bot_response = generate_response(input_text, messages)
    bot_response = generate_response_rag_index(input_text, messages)
    chat_history.append(("User: " + input_text, bot_response))
    messages = update_chat_history(messages, input_text, bot_response)
    # Clear the textbox and return the updated chat history.
    return "", chat_history


with gr.Blocks() as demo:
    gr.Markdown('## AILA INTERFACE DEMO')
    with gr.Row():
        gr.Image(value="up_2017_logo_en.png", interactive=False, label="Upatras Logo", width=150, height=100)
        gr.Image(value="aila_new.png", interactive=False, label="AILA project Logo", width=150, height=100)
        gr.Image(value="banner-horizontal-default-en.png", interactive=False, label="AUTH Logo", width=150, height=100)
    with gr.Row():
        user_input = gr.Textbox(
            placeholder="Type your message here...",
            label="Your Message",
            lines=1,
        )
    submit_button = gr.Button('Submit')
    chat_history = gr.Chatbot()
    submit_button.click(
        chatbot,
        inputs=[user_input, chat_history],
        outputs=[user_input, chat_history],
    )

demo.launch()
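
# ---------------------------------------------------------------------------
# Reference sketch (kept commented out, not part of the app): one way the
# persisted index in 'aila_indices' could have been built with llama_index.
# The source-document folder name 'aila_docs' is an assumption used only for
# illustration; it is not defined anywhere in this script.
#
# from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
#
# documents = SimpleDirectoryReader('aila_docs').load_data()   # load the raw AILA documents
# index = VectorStoreIndex.from_documents(documents)           # embed and index them (OpenAI embeddings by default)
# index.storage_context.persist(persist_dir='aila_indices')    # write the files this app loads at startup
# ---------------------------------------------------------------------------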