Spaces:

cifkao
/

context-probing

Running

App Files Files Community

cifkao committed on May 23, 2023

Commit

f90dfb4

1 Parent(s): b837582

Implement KL divergence score

Browse files

Files changed (1) hide show

app.py +33 -8

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ from pathlib import Path
 import streamlit as st
 import streamlit.components.v1 as components
 import torch
 import torch.nn.functional as F
 from transformers import AutoModelForCausalLM, AutoTokenizer, BatchEncoding
@@ -41,6 +42,28 @@ def ids_to_readable_tokens(tokenizer, ids, strip_whitespace=False):
             result.append("")
     return result
 compact_layout = st.experimental_get_query_params().get("compact", ["false"]) == ["true"]
 if not compact_layout:
@@ -53,7 +76,7 @@ if not compact_layout:
     )
 model_name = st.selectbox("Model", ["distilgpt2", "gpt2", "EleutherAI/gpt-neo-125m"])
-metric_name = st.selectbox("Metric", ["KL divergence", "Cross entropy"], index=1)
 tokenizer = st.cache_resource(AutoTokenizer.from_pretrained, show_spinner=False)(model_name, use_fast=False)
@@ -107,10 +130,6 @@ if num_user_tokens > max_tokens:
     )
     st.stop()
-if metric_name == "KL divergence":
-    st.error("KL divergence is not supported yet. Stay tuned!", icon="😭")
-    st.stop()
 with st.spinner("Loading model…"):
     model = st.cache_resource(AutoModelForCausalLM.from_pretrained, show_spinner=False)(model_name)
@@ -124,7 +143,7 @@ def get_logprobs(_model, _inputs, cache_key):
 @st.cache_data(show_spinner=False)
 @torch.inference_mode()
-def run_context_length_probing(_model, _tokenizer, _inputs, window_len, cache_key):
     del cache_key
     inputs_sliding = get_windows_batched(
@@ -157,8 +176,13 @@ def run_context_length_probing(_model, _tokenizer, _inputs, window_len, cache_ke
         logprobs = logprobs.view(-1, logprobs.shape[-1])[:-window_len]
         logprobs = logprobs.view(window_len, len(input_ids) + window_len - 2, logprobs.shape[-1])
-        scores = logprobs[:, torch.arange(len(input_ids[1:])), input_ids[1:]]
-        scores = scores.diff(dim=0).transpose(0, 1)
         scores = scores.nan_to_num()
         scores /= scores.abs().max(dim=1, keepdim=True).values + 1e-6
         scores = scores.to(torch.float16)
@@ -170,6 +194,7 @@ scores = run_context_length_probing(
     _tokenizer=tokenizer,
     _inputs=inputs,
     window_len=window_len,
     cache_key=(model_name, text),
 )
 tokens = ids_to_readable_tokens(tokenizer, input_ids)

 import streamlit as st
 import streamlit.components.v1 as components
+import numpy as np
 import torch
 import torch.nn.functional as F
 from transformers import AutoModelForCausalLM, AutoTokenizer, BatchEncoding
             result.append("")
     return result
+def nll_score(logprobs, labels):
+    """Return the negated log-probability assigned to each label.
+
+    For every index ``i`` in ``range(len(labels))`` this selects
+    ``logprobs[:, i, labels[i]]`` and negates it, keeping the first axis of
+    ``logprobs`` intact.
+
+    NOTE(review): the visible caller passes ``logprobs`` viewed as
+    ``(window_len, positions, vocab)`` and ``labels=input_ids[1:]``; other
+    layouts are not exercised here -- confirm before reusing elsewhere.
+    """
+    return -logprobs[:, torch.arange(len(labels)), labels]
+def kl_div_score(logprobs):
+    """Return ``sum(p * (log p - log q))`` over the last axis of ``logprobs``.
+
+    ``log q`` is every entry of ``logprobs``; ``log p`` for index ``j`` of the
+    second axis is the row ``min(j, logprobs.shape[0] - 1)`` of the first axis.
+    The result drops the last axis of ``logprobs``.
+
+    WARNING: ``logprobs`` is overwritten in place to save memory, so the
+    caller's tensor no longer holds log-probabilities after this call.
+
+    NOTE(review): ``Tensor.numpy()`` is used below, which presumably requires
+    a CPU tensor that does not track gradients -- confirm against the caller.
+    """
+    # Pair each index j of axis 1 with row min(j, shape[0] - 1) of axis 0.
+    # Indexing with index tensors yields a new tensor rather than a view, so
+    # the in-place updates to `logprobs` below leave `log_p` untouched.
+    log_p = logprobs[
+        torch.arange(logprobs.shape[1]).clamp(max=logprobs.shape[0] - 1),
+        torch.arange(logprobs.shape[1])
+    ]
+    # Compute things in place as much as possible
+    # Plain alias (no copy): the two in-place ops below turn the caller's
+    # tensor into (log_p - log_q), with log_p broadcast along axis 0.
+    log_p_minus_log_q = logprobs
+    del logprobs
+    log_p_minus_log_q *= -1
+    log_p_minus_log_q += log_p
+    # Use np.exp because torch.exp is not implemented for float16
+    # The NumPy array shares storage with `log_p`; exponentiating with
+    # `out=p_np` overwrites that buffer, which then holds p instead of log p.
+    p_np = log_p.numpy()
+    del log_p
+    np.exp(p_np, out=p_np)
+    # Another alias: the multiplication below still writes into the buffer
+    # that was passed in as `logprobs`.
+    result = log_p_minus_log_q
+    result *= torch.as_tensor(p_np)
+    # Sum p * (log_p - log_q) over the last axis.
+    return result.sum(dim=-1)
 compact_layout = st.experimental_get_query_params().get("compact", ["false"]) == ["true"]
 if not compact_layout:
     )
 model_name = st.selectbox("Model", ["distilgpt2", "gpt2", "EleutherAI/gpt-neo-125m"])
+metric_name = st.selectbox("Metric", ["KL divergence", "NLL loss"], index=1)
 tokenizer = st.cache_resource(AutoTokenizer.from_pretrained, show_spinner=False)(model_name, use_fast=False)
     )
     st.stop()
 with st.spinner("Loading model…"):
     model = st.cache_resource(AutoModelForCausalLM.from_pretrained, show_spinner=False)(model_name)
 @st.cache_data(show_spinner=False)
 @torch.inference_mode()
+def run_context_length_probing(_model, _tokenizer, _inputs, window_len, metric, cache_key):
     del cache_key
     inputs_sliding = get_windows_batched(
         logprobs = logprobs.view(-1, logprobs.shape[-1])[:-window_len]
         logprobs = logprobs.view(window_len, len(input_ids) + window_len - 2, logprobs.shape[-1])
+        if metric == "NLL loss":
+            scores = nll_score(logprobs=logprobs, labels=input_ids[1:])
+        elif metric == "KL divergence":
+            scores = kl_div_score(logprobs)
+        del logprobs  # possibly destroyed by the score computation to save memory
+        scores = (-scores).diff(dim=0).transpose(0, 1)
         scores = scores.nan_to_num()
         scores /= scores.abs().max(dim=1, keepdim=True).values + 1e-6
         scores = scores.to(torch.float16)
     _tokenizer=tokenizer,
     _inputs=inputs,
     window_len=window_len,
+    metric=metric_name,
     cache_key=(model_name, text),
 )
 tokens = ids_to_readable_tokens(tokenizer, input_ids)