Spaces:

kouki321
/

cag_new_model

Sleeping

kouki321 committed on May 28

Commit

22c9862

verified ·

1 Parent(s): 175a15e

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -74,20 +74,28 @@ def calculate_cache_size(cache):
     return total_memory /(1024*1024)
 @st.cache_resource
-def load_model_and_tokenizer(doc_text_count):
-    model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
-    tokenizer = AutoTokenizer.from_pretrained(
-        model_name,
-        trust_remote_code=True,
-        model_max_length=1.3*round(doc_text_count * 0.3 + 1)
     )
     model = AutoModelForCausalLM.from_pretrained(
         model_name,
-        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-        device_map="auto",
-        trust_remote_code=True
     )
-    return model, tokenizer
 def clone_cache(cache):
     new_cache = DynamicCache()
@@ -106,7 +114,8 @@ def load_document_and_cache(file_path):
         model, tokenizer = load_model_and_tokenizer(doc_text_count)
         system_prompt = f"""
         <|system|>
-        Answer concisely and precisely. You are an assistant who provides concise factual answers.
         <|user|>
         Context:
         {doc_text}

     return total_memory /(1024*1024)
 @st.cache_resource
+def load_quantized_model_and_tokenizer():
+    model_name = "mistralai/Mistral-7B-Instruct-v0.1"  # Configure quantization for 4-bit loading
+    quantization_config = BitsAndBytesConfig(
+        load_in_4bit=True,  # Enable 4-bit quantization
+        bnb_4bit_compute_dtype=torch.float16,  # Set computation precision
+        bnb_4bit_quant_type="nf4",  # Use Normal Float 4 (NF4) quantization
+        bnb_4bit_use_double_quant=True,  # Enable double quantization
     )
+    # Load the pre-trained model with quantization
     model = AutoModelForCausalLM.from_pretrained(
         model_name,
+        device_map="auto",  # Automatically allocate model to devices
+        quantization_config=quantization_config,
+        token=hf_token,
+    )
+    # Load the tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_name,
+        token=hf_token,
     )
+    return tokenizer, model
 def clone_cache(cache):
     new_cache = DynamicCache()
         model, tokenizer = load_model_and_tokenizer(doc_text_count)
         system_prompt = f"""
         <|system|>
+        You are a helpful assistant. Provide concise, factual answers based only on the provided context.
+        If the information is not available, respond with: "I'm sorry, I don't have enough information to answer that."
         <|user|>
         Context:
         {doc_text}