support model registration

Browse files

Files changed (5) hide show

README.md +18 -7
__init__.py +0 -0
inference_example.py +65 -0
inference_example2.py +62 -0
modeling_dolphin.py +1 -1

README.md CHANGED Viewed

@@ -42,11 +42,24 @@ Dolphin employs a decoder-decoder framework with two main components:
 ![Model Architecture](modelstructure.jpg)
 ## Running the Model
 ```python
-from transformers import AutoTokenizer
-from configuration_dolphin import DolphinForCausalLM
-import time
 def inference_instruct(mycontext, question, device="cuda:0"):
     import time
@@ -90,13 +103,11 @@ def inference_instruct(mycontext, question, device="cuda:0"):
 if __name__ == "__main__":
-    # Register your configuration and model
     AutoConfig.register("dolphin", DolphinConfig)
     AutoModelForCausalLM.register(DolphinConfig, DolphinForCausalLM)
-    device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
     # Load the tokenizer and model
-    tokenizer = AutoTokenizer.from_pretrained('NexaAIDev/Dolphin', trust_remote_code=True)
     model = AutoModelForCausalLM.from_pretrained('NexaAIDev/Dolphin', trust_remote_code=True, torch_dtype=torch.bfloat16, device_map=device_name)
     # Run inference example

 ![Model Architecture](modelstructure.jpg)
 ## Running the Model
+Method 1: download this repository and run the following commands:
+```bash
+git lfs install
+git clone https://huggingface.co/NexaAIDev/Dolphin
+python inference_example.py
+```
+Method 2: install the `nexaai-dolphin` package (imported in Python as `dolphin`):
+```bash
+pip install nexaai-dolphin
+```
+Then run the following Python code:
 ```python
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
+import torch
+from dolphin.configuration_dolphin import DolphinConfig
+from dolphin.modeling_dolphin import DolphinForCausalLM
 def inference_instruct(mycontext, question, device="cuda:0"):
     import time
 if __name__ == "__main__":
+    device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
     AutoConfig.register("dolphin", DolphinConfig)
     AutoModelForCausalLM.register(DolphinConfig, DolphinForCausalLM)
     # Load the tokenizer and model
+    tokenizer = AutoTokenizer.from_pretrained('NexaAIDev/Dolphin')
     model = AutoModelForCausalLM.from_pretrained('NexaAIDev/Dolphin', trust_remote_code=True, torch_dtype=torch.bfloat16, device_map=device_name)
     # Run inference example

__init__.py ADDED Viewed

File without changes

inference_example.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import sys
+import os
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+from configuration_dolphin import DolphinConfig
+from modeling_dolphin import DolphinForCausalLM
+from transformers import (AutoTokenizer, AutoModelForCausalLM, AutoConfig)
+import torch
+def inference_instruct(mycontext, question, device="cuda:0"):
+    import time
+    MEMORY_SIZE = 32
+    start_time = time.time()
+    generated_token_ids = []
+    prompt = f" <context>{question}"
+    text_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<context>")]
+    input_ids = (
+        torch.tensor(
+            text_chunks[0] + [-1] * MEMORY_SIZE + text_chunks[1], dtype=torch.long
+        )
+        .unsqueeze(0)
+        .to(device)
+    )
+    # to process the context
+    context_tokenized = tokenizer(
+        mycontext + "".join([f"[memory_{i}]" for i in range(MEMORY_SIZE)]),
+        return_tensors="pt",
+    )
+    context_tokenized = {k: v.to(device) for k, v in context_tokenized.items()}
+    context_token_count = (context_tokenized["input_ids"]).shape[1] - MEMORY_SIZE
+    # We conduct an inference process
+    for i in range(context_token_count):
+        next_token = (
+            model(
+                input_ids,
+                context_input_ids=context_tokenized["input_ids"],
+                context_attention_mask=context_tokenized["attention_mask"],
+            )
+            .logits[:, -1]
+            .argmax(-1)
+        )
+        if next_token.item() == 151643:
+            break
+        generated_token_ids.append(next_token.item())
+        input_ids = torch.cat([input_ids, next_token.unsqueeze(1)], dim=-1)
+    result = tokenizer.decode(generated_token_ids)
+    print(f"Time taken: {time.time() - start_time}")
+    return result
+if __name__ == "__main__":
+    # Register your configuration and model
+    AutoConfig.register("dolphin", DolphinConfig)
+    AutoModelForCausalLM.register(DolphinConfig, DolphinForCausalLM)
+    device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
+    # Load the tokenizer and model
+    tokenizer = AutoTokenizer.from_pretrained('NexaAIDev/Dolphin', trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained('NexaAIDev/Dolphin', trust_remote_code=True, torch_dtype=torch.bfloat16, device_map=device_name)
+    # Run inference example
+    mycontext = "Nexa AI is a Cupertino-based company founded in May 2023 that researches and develops models and tools for on-device AI applications. The company is founded by Alex and Zack. The company is known for its Octopus-series models, which rival large-scale language models in capabilities such as function-calling, multimodality, and action-planning, while remaining efficient and compact for edge device deployment. Nexa AI's mission is to advance on-device AI in collaboration with the global developer community. To this end, the company has created an on-device model hub for users to find, share, and collaborate on open-source AI models optimized for edge devices, as well as an SDK for developers to run and deploy AI models locally"
+    question = "Who founded Nexa AI?"
+    # Pass the context and the correct device string
+    result = inference_instruct(mycontext, question, device=device_name)
+    print("Result:", result)

inference_example2.py ADDED Viewed

	@@ -0,0 +1,62 @@

+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
+import torch
+# !pip install nexaai-dolphin
+from dolphin.configuration_dolphin import DolphinConfig
+from dolphin.modeling_dolphin import DolphinForCausalLM
+def inference_instruct(mycontext, question, device="cuda:0"):
+    import time
+    MEMORY_SIZE = 32
+    start_time = time.time()
+    generated_token_ids = []
+    prompt = f" <context>{question}"
+    text_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<context>")]
+    input_ids = (
+        torch.tensor(
+            text_chunks[0] + [-1] * MEMORY_SIZE + text_chunks[1], dtype=torch.long
+        )
+        .unsqueeze(0)
+        .to(device)
+    )
+    # to process the context
+    context_tokenized = tokenizer(
+        mycontext + "".join([f"[memory_{i}]" for i in range(MEMORY_SIZE)]),
+        return_tensors="pt",
+    )
+    context_tokenized = {k: v.to(device) for k, v in context_tokenized.items()}
+    context_token_count = (context_tokenized["input_ids"]).shape[1] - MEMORY_SIZE
+    # We conduct an inference process
+    for i in range(context_token_count):
+        next_token = (
+            model(
+                input_ids,
+                context_input_ids=context_tokenized["input_ids"],
+                context_attention_mask=context_tokenized["attention_mask"],
+            )
+            .logits[:, -1]
+            .argmax(-1)
+        )
+        if next_token.item() == 151643:
+            break
+        generated_token_ids.append(next_token.item())
+        input_ids = torch.cat([input_ids, next_token.unsqueeze(1)], dim=-1)
+    result = tokenizer.decode(generated_token_ids)
+    print(f"Time taken: {time.time() - start_time}")
+    return result
+if __name__ == "__main__":
+    device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
+    AutoConfig.register("dolphin", DolphinConfig)
+    AutoModelForCausalLM.register(DolphinConfig, DolphinForCausalLM)
+    # Load the tokenizer and model
+    tokenizer = AutoTokenizer.from_pretrained('NexaAIDev/Dolphin')
+    model = AutoModelForCausalLM.from_pretrained('NexaAIDev/Dolphin', trust_remote_code=True, torch_dtype=torch.bfloat16, device_map=device_name)
+    # Run inference example
+    mycontext = "Nexa AI is a Cupertino-based company founded in May 2023 that researches and develops models and tools for on-device AI applications. The company is founded by Alex and Zack. The company is known for its Octopus-series models, which rival large-scale language models in capabilities such as function-calling, multimodality, and action-planning, while remaining efficient and compact for edge device deployment. Nexa AI's mission is to advance on-device AI in collaboration with the global developer community. To this end, the company has created an on-device model hub for users to find, share, and collaborate on open-source AI models optimized for edge devices, as well as an SDK for developers to run and deploy AI models locally"
+    question = "Who founded Nexa AI?"
+    # Pass the context and the correct device string
+    result = inference_instruct(mycontext, question, device=device_name)
+    print("Result:", result)

modeling_dolphin.py CHANGED Viewed

@@ -22,7 +22,7 @@ from typing import List, Optional, Tuple, Union
 import warnings
 from dataclasses import dataclass
 from torch.nn import CrossEntropyLoss
-from .configuration_dolphin import encoder_config_dict, DolphinConfig
 CONTEXT_EMB = 896  # Qwen 0.7B has dimension of 896
 HIDDEN_EMB = 3584  # Qwen 7B has dimension of 3584

 import warnings
 from dataclasses import dataclass
 from torch.nn import CrossEntropyLoss
+from configuration_dolphin import encoder_config_dict, DolphinConfig
 CONTEXT_EMB = 896  # Qwen 0.7B has dimension of 896
 HIDDEN_EMB = 3584  # Qwen 7B has dimension of 3584