Files changed (1)
  1. app.py +0 -108
app.py DELETED
@@ -1,108 +0,0 @@
- import spaces
- import gradio as gr
- import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
- from threading import Thread
- import traceback
-
- model_path = 'infly/OpenCoder-8B-Instruct'
-
- # Loading the tokenizer and model from Hugging Face's model hub.
- tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
- model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16)
-
- # using CUDA for an optimal experience
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
- model = model.to(device)
-
- # Defining a custom stopping criteria class for the model's text generation.
- class StopOnTokens(StoppingCriteria):
-     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
-         stop_ids = [96539] # IDs of tokens where the generation should stop.
-         for stop_id in stop_ids:
-             if input_ids[0][-1] == stop_id: # Checking if the last generated token is a stop token.
-                 return True
-         return False
-
-
- system_role= 'system'
- user_role = 'user'
- assistant_role = "assistant"
-
- sft_start_token = "<|im_start|>"
- sft_end_token = "<|im_end|>"
- ct_end_token = "<|endoftext|>"
-
- # system_prompt= 'You are a CodeLLM developed by INF.'
-
-
- # Function to generate model predictions.
-
- @spaces.GPU()
- def predict(message, history):
-
-     try:
-         stop = StopOnTokens()
-
-         model_messages = []
-         # print(f'history: {history}')
-
-         for i, item in enumerate(history):
-             model_messages.append({"role": user_role, "content": item[0]})
-             model_messages.append({"role": assistant_role, "content": item[1]})
-
-         model_messages.append({"role": user_role, "content": message})
-
-         print(f'model_messages: {model_messages}')
-
-         # print(f'model_final_inputs: {tokenizer.apply_chat_template(model_messages, add_generation_prompt=True, tokenize=False)}', flush=True)
-         model_inputs = tokenizer.apply_chat_template(model_messages, add_generation_prompt=True, return_tensors="pt").to(device)
-         # model_inputs = tokenizer([messages], return_tensors="pt").to(device)
-
-         streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
-         generate_kwargs = dict(
-             input_ids=model_inputs,
-             streamer=streamer,
-             max_new_tokens=1024,
-             do_sample=False,
-             stopping_criteria=StoppingCriteriaList([stop])
-         )
-
-         t = Thread(target=model.generate, kwargs=generate_kwargs)
-         t.start() # Starting the generation in a separate thread.
-         partial_message = ""
-         for new_token in streamer:
-             partial_message += new_token
-             if sft_end_token in partial_message: # Breaking the loop if the stop token is generated.
-                 break
-             yield partial_message
-
-     except Exception as e:
-         print(traceback.format_exc())
-
-
- css = """
- full-height {
- height: 100%;
- }
- """
-
- prompt_examples = [
-     'Write a quick sort algorithm in python.',
-     'Write a greedy snake game using pygame.',
-     'How to use numpy?'
- ]
-
- placeholder = """
- <div style="opacity: 0.5;">
- <img src="https://raw.githubusercontent.com/OpenCoder-llm/opencoder-llm.github.io/refs/heads/main/static/images/opencoder_icon.jpg" style="width:20%;">
- </div>
- """
-
-
- chatbot = gr.Chatbot(label='OpenCoder', placeholder=placeholder)
- with gr.Blocks(theme=gr.themes.Soft(), fill_height=True) as demo:
-
-     gr.ChatInterface(predict, chatbot=chatbot, fill_height=True, examples=prompt_examples, css=css)
-
- demo.launch() # Launching the web interface.