Spaces:

huggingface-projects
/

llama-2-7b-chat

Running on Zero

App Files Community

hysts HF Staff committed on Oct 4, 2023

Commit

09b3f75

1 Parent(s): 323df56

Migrate from yapf to black

Browse files

Files changed (3) hide show

README.md +0 -2
app.py +66 -67
model.py +20 -26

README.md CHANGED Viewed

@@ -17,5 +17,3 @@ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-
 Llama v2 was introduced in [this paper](https://arxiv.org/abs/2307.09288).
 This Space demonstrates [Llama-2-7b-chat-hf](https://huggingface.co/spaces/huggingface-projects/llama-2-13b-chat/blob/main/meta-llama/Llama-2-7b-chat-hf) from Meta. Please, check the original model card for details.


17	Llama v2 was introduced in [this paper](https://arxiv.org/abs/2307.09288).
18
19	This Space demonstrates [Llama-2-7b-chat-hf](https://huggingface.co/spaces/huggingface-projects/llama-2-13b-chat/blob/main/meta-llama/Llama-2-7b-chat-hf) from Meta. Please, check the original model card for details.

app.py CHANGED Viewed

@@ -33,26 +33,24 @@ this demo is governed by the original [license](https://huggingface.co/spaces/hu
 """
 if not torch.cuda.is_available():
-    DESCRIPTION += '\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>'
 def clear_and_save_textbox(message: str) -> tuple[str, str]:
-    return '', message
-def display_input(message: str,
-                  history: list[tuple[str, str]]) -> list[tuple[str, str]]:
-    history.append((message, ''))
     return history
-def delete_prev_fn(
-        history: list[tuple[str, str]]) -> tuple[list[tuple[str, str]], str]:
     try:
         message, _ = history.pop()
     except IndexError:
-        message = ''
-    return history, message or ''
 def generate(
@@ -73,7 +71,7 @@ def generate(
         first_response = next(generator)
         yield history + [(message, first_response)]
     except StopIteration:
-        yield history + [(message, '')]
     for response in generator:
         yield history + [(message, response)]
@@ -82,67 +80,63 @@ def process_example(message: str) -> tuple[str, list[tuple[str, str]]]:
     generator = generate(message, [], DEFAULT_SYSTEM_PROMPT, 1024, 1, 0.95, 50)
     for x in generator:
         pass
-    return '', x
 def check_input_token_length(message: str, chat_history: list[tuple[str, str]], system_prompt: str) -> None:
     input_token_length = get_input_token_length(message, chat_history, system_prompt)
     if input_token_length > MAX_INPUT_TOKEN_LENGTH:
-        raise gr.Error(f'The accumulated input is too long ({input_token_length} > {MAX_INPUT_TOKEN_LENGTH}). Clear your chat history and try again.')
-with gr.Blocks(css='style.css') as demo:
     gr.Markdown(DESCRIPTION)
-    gr.DuplicateButton(value='Duplicate Space for private use',
-                       elem_id='duplicate-button')
     with gr.Group():
-        chatbot = gr.Chatbot(label='Chatbot')
         with gr.Row():
             textbox = gr.Textbox(
                 container=False,
                 show_label=False,
-                placeholder='Type a message...',
                 scale=10,
             )
-            submit_button = gr.Button('Submit',
-                                      variant='primary',
-                                      scale=1,
-                                      min_width=0)
     with gr.Row():
-        retry_button = gr.Button('🔄  Retry', variant='secondary')
-        undo_button = gr.Button('↩️ Undo', variant='secondary')
-        clear_button = gr.Button('🗑️  Clear', variant='secondary')
     saved_input = gr.State()
-    with gr.Accordion(label='Advanced options', open=False):
-        system_prompt = gr.Textbox(label='System prompt',
-                                   value=DEFAULT_SYSTEM_PROMPT,
-                                   lines=6)
         max_new_tokens = gr.Slider(
-            label='Max new tokens',
             minimum=1,
             maximum=MAX_MAX_NEW_TOKENS,
             step=1,
             value=DEFAULT_MAX_NEW_TOKENS,
         )
         temperature = gr.Slider(
-            label='Temperature',
             minimum=0.1,
             maximum=4.0,
             step=0.1,
             value=1.0,
         )
         top_p = gr.Slider(
-            label='Top-p (nucleus sampling)',
             minimum=0.05,
             maximum=1.0,
             step=0.05,
             value=0.95,
         )
         top_k = gr.Slider(
-            label='Top-k',
             minimum=1,
             maximum=1000,
             step=1,
@@ -151,10 +145,10 @@ with gr.Blocks(css='style.css') as demo:
     gr.Examples(
         examples=[
-            'Hello there! How are you doing?',
-            'Can you explain briefly to me what is the Python programming language?',
-            'Explain the plot of Cinderella in a sentence.',
-            'How many hours does it take a man to eat a Helicopter?',
             "Write a 100-word article on 'Benefits of Open-Source in AI research'",
         ],
         inputs=textbox,
@@ -197,36 +191,41 @@ with gr.Blocks(css='style.css') as demo:
         api_name=False,
     )
-    button_event_preprocess = submit_button.click(
-        fn=clear_and_save_textbox,
-        inputs=textbox,
-        outputs=[textbox, saved_input],
-        api_name=False,
-        queue=False,
-    ).then(
-        fn=display_input,
-        inputs=[saved_input, chatbot],
-        outputs=chatbot,
-        api_name=False,
-        queue=False,
-    ).then(
-        fn=check_input_token_length,
-        inputs=[saved_input, chatbot, system_prompt],
-        api_name=False,
-        queue=False,
-    ).success(
-        fn=generate,
-        inputs=[
-            saved_input,
-            chatbot,
-            system_prompt,
-            max_new_tokens,
-            temperature,
-            top_p,
-            top_k,
-        ],
-        outputs=chatbot,
-        api_name=False,
     )
     retry_button.click(
@@ -271,7 +270,7 @@ with gr.Blocks(css='style.css') as demo:
     )
     clear_button.click(
-        fn=lambda: ([], ''),
         outputs=[chatbot, saved_input],
         queue=False,
         api_name=False,

 """
 if not torch.cuda.is_available():
+    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
 def clear_and_save_textbox(message: str) -> tuple[str, str]:
+    return "", message
+def display_input(message: str, history: list[tuple[str, str]]) -> list[tuple[str, str]]:
+    history.append((message, ""))
     return history
+def delete_prev_fn(history: list[tuple[str, str]]) -> tuple[list[tuple[str, str]], str]:
     try:
         message, _ = history.pop()
     except IndexError:
+        message = ""
+    return history, message or ""
 def generate(
         first_response = next(generator)
         yield history + [(message, first_response)]
     except StopIteration:
+        yield history + [(message, "")]
     for response in generator:
         yield history + [(message, response)]
     generator = generate(message, [], DEFAULT_SYSTEM_PROMPT, 1024, 1, 0.95, 50)
     for x in generator:
         pass
+    return "", x
 def check_input_token_length(message: str, chat_history: list[tuple[str, str]], system_prompt: str) -> None:
     input_token_length = get_input_token_length(message, chat_history, system_prompt)
     if input_token_length > MAX_INPUT_TOKEN_LENGTH:
+        raise gr.Error(
+            f"The accumulated input is too long ({input_token_length} > {MAX_INPUT_TOKEN_LENGTH}). Clear your chat history and try again."
+        )
+with gr.Blocks(css="style.css") as demo:
     gr.Markdown(DESCRIPTION)
+    gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")
     with gr.Group():
+        chatbot = gr.Chatbot(label="Chatbot")
         with gr.Row():
             textbox = gr.Textbox(
                 container=False,
                 show_label=False,
+                placeholder="Type a message...",
                 scale=10,
             )
+            submit_button = gr.Button("Submit", variant="primary", scale=1, min_width=0)
     with gr.Row():
+        retry_button = gr.Button("🔄  Retry", variant="secondary")
+        undo_button = gr.Button("↩️ Undo", variant="secondary")
+        clear_button = gr.Button("🗑️  Clear", variant="secondary")
     saved_input = gr.State()
+    with gr.Accordion(label="Advanced options", open=False):
+        system_prompt = gr.Textbox(label="System prompt", value=DEFAULT_SYSTEM_PROMPT, lines=6)
         max_new_tokens = gr.Slider(
+            label="Max new tokens",
             minimum=1,
             maximum=MAX_MAX_NEW_TOKENS,
             step=1,
             value=DEFAULT_MAX_NEW_TOKENS,
         )
         temperature = gr.Slider(
+            label="Temperature",
             minimum=0.1,
             maximum=4.0,
             step=0.1,
             value=1.0,
         )
         top_p = gr.Slider(
+            label="Top-p (nucleus sampling)",
             minimum=0.05,
             maximum=1.0,
             step=0.05,
             value=0.95,
         )
         top_k = gr.Slider(
+            label="Top-k",
             minimum=1,
             maximum=1000,
             step=1,
     gr.Examples(
         examples=[
+            "Hello there! How are you doing?",
+            "Can you explain briefly to me what is the Python programming language?",
+            "Explain the plot of Cinderella in a sentence.",
+            "How many hours does it take a man to eat a Helicopter?",
             "Write a 100-word article on 'Benefits of Open-Source in AI research'",
         ],
         inputs=textbox,
         api_name=False,
     )
+    button_event_preprocess = (
+        submit_button.click(
+            fn=clear_and_save_textbox,
+            inputs=textbox,
+            outputs=[textbox, saved_input],
+            api_name=False,
+            queue=False,
+        )
+        .then(
+            fn=display_input,
+            inputs=[saved_input, chatbot],
+            outputs=chatbot,
+            api_name=False,
+            queue=False,
+        )
+        .then(
+            fn=check_input_token_length,
+            inputs=[saved_input, chatbot, system_prompt],
+            api_name=False,
+            queue=False,
+        )
+        .success(
+            fn=generate,
+            inputs=[
+                saved_input,
+                chatbot,
+                system_prompt,
+                max_new_tokens,
+                temperature,
+                top_p,
+                top_k,
+            ],
+            outputs=chatbot,
+            api_name=False,
+        )
     )
     retry_button.click(
     )
     clear_button.click(
+        fn=lambda: ([], ""),
         outputs=[chatbot, saved_input],
         queue=False,
         api_name=False,

model.py CHANGED Viewed

@@ -4,53 +4,47 @@ from typing import Iterator
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
-model_id = 'meta-llama/Llama-2-7b-chat-hf'
 if torch.cuda.is_available():
-    model = AutoModelForCausalLM.from_pretrained(
-        model_id,
-        torch_dtype=torch.float16,
-        device_map='auto'
-    )
 else:
     model = None
 tokenizer = AutoTokenizer.from_pretrained(model_id)
-def get_prompt(message: str, chat_history: list[tuple[str, str]],
-               system_prompt: str) -> str:
-    texts = [f'<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n']
     # The first user input is _not_ stripped
     do_strip = False
     for user_input, response in chat_history:
         user_input = user_input.strip() if do_strip else user_input
         do_strip = True
-        texts.append(f'{user_input} [/INST] {response.strip()} </s><s>[INST] ')
     message = message.strip() if do_strip else message
-    texts.append(f'{message} [/INST]')
-    return ''.join(texts)
 def get_input_token_length(message: str, chat_history: list[tuple[str, str]], system_prompt: str) -> int:
     prompt = get_prompt(message, chat_history, system_prompt)
-    input_ids = tokenizer([prompt], return_tensors='np', add_special_tokens=False)['input_ids']
     return input_ids.shape[-1]
-def run(message: str,
-        chat_history: list[tuple[str, str]],
-        system_prompt: str,
-        max_new_tokens: int = 1024,
-        temperature: float = 0.8,
-        top_p: float = 0.95,
-        top_k: int = 50) -> Iterator[str]:
     prompt = get_prompt(message, chat_history, system_prompt)
-    inputs = tokenizer([prompt], return_tensors='pt', add_special_tokens=False).to('cuda')
-    streamer = TextIteratorStreamer(tokenizer,
-                                    timeout=10.,
-                                    skip_prompt=True,
-                                    skip_special_tokens=True)
     generate_kwargs = dict(
         inputs,
         streamer=streamer,
@@ -67,4 +61,4 @@ def run(message: str,
     outputs = []
     for text in streamer:
         outputs.append(text)
-        yield ''.join(outputs)

 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+model_id = "meta-llama/Llama-2-7b-chat-hf"
 if torch.cuda.is_available():
+    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
 else:
     model = None
 tokenizer = AutoTokenizer.from_pretrained(model_id)
+def get_prompt(message: str, chat_history: list[tuple[str, str]], system_prompt: str) -> str:
+    texts = [f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n"]
     # The first user input is _not_ stripped
     do_strip = False
     for user_input, response in chat_history:
         user_input = user_input.strip() if do_strip else user_input
         do_strip = True
+        texts.append(f"{user_input} [/INST] {response.strip()} </s><s>[INST] ")
     message = message.strip() if do_strip else message
+    texts.append(f"{message} [/INST]")
+    return "".join(texts)
 def get_input_token_length(message: str, chat_history: list[tuple[str, str]], system_prompt: str) -> int:
     prompt = get_prompt(message, chat_history, system_prompt)
+    input_ids = tokenizer([prompt], return_tensors="np", add_special_tokens=False)["input_ids"]
     return input_ids.shape[-1]
+def run(
+    message: str,
+    chat_history: list[tuple[str, str]],
+    system_prompt: str,
+    max_new_tokens: int = 1024,
+    temperature: float = 0.8,
+    top_p: float = 0.95,
+    top_k: int = 50,
+) -> Iterator[str]:
     prompt = get_prompt(message, chat_history, system_prompt)
+    inputs = tokenizer([prompt], return_tensors="pt", add_special_tokens=False).to("cuda")
+    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
         inputs,
         streamer=streamer,
     outputs = []
     for text in streamer:
         outputs.append(text)
+        yield "".join(outputs)