Zhichao Geng
committed on
Update README.md
README.md CHANGED
@@ -21,18 +21,20 @@ import itertools
 import torch
 from transformers import AutoModelForMaskedLM, AutoTokenizer
 
+
 # get sparse vector from dense vectors with shape batch_size * seq_len * vocab_size
 def get_sparse_vector(feature, output):
     values, _ = torch.max(output*feature["attention_mask"].unsqueeze(-1), dim=1)
     values = torch.log(1 + torch.relu(values))
+    values[:,special_token_ids] = 0
     return values
 
 # transform the sparse vector to a dict of (token, weight)
-def transform_sparse_vector_to_dict(sparse_vector, id_to_token):
+def transform_sparse_vector_to_dict(sparse_vector):
     sample_indices,token_indices=torch.nonzero(sparse_vector,as_tuple=True)
     non_zero_values = sparse_vector[(sample_indices,token_indices)].tolist()
     number_of_tokens_for_each_sample = torch.bincount(sample_indices).cpu().tolist()
-    tokens = [id_to_token[_id] for _id in token_indices.tolist()]
+    tokens = [transform_sparse_vector_to_dict.id_to_token[_id] for _id in token_indices.tolist()]
 
     output = []
     end_idxs = list(itertools.accumulate([0]+number_of_tokens_for_each_sample))
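In plain terms, get_sparse_vector max-pools the MLM logits over the sequence dimension (the attention-mask multiply zeroes padded positions), applies log(1 + ReLU(x)) to keep weights non-negative and dampen large activations, and, after this commit, also zeroes the vocabulary slots of special tokens so markers like [CLS] and [SEP] cannot inflate every dot-product score. A minimal sketch of the pooling on toy tensors (the shapes and ids below are illustrative, not the model's real dimensions):

```python
import torch

# Toy setup: batch_size=2, seq_len=3, vocab_size=5 (illustrative only).
output = torch.rand(2, 3, 5)                       # stand-in for MLM-head logits
attention_mask = torch.tensor([[1, 1, 0], [1, 1, 1]])
special_token_ids = [0, 4]                         # hypothetical [CLS]/[SEP] slots

# Max-pool over the sequence dimension; masked positions contribute zero.
values, _ = torch.max(output * attention_mask.unsqueeze(-1), dim=1)
# Log-saturation: non-negative weights with a compressed dynamic range.
values = torch.log(1 + torch.relu(values))
# Drop special tokens so they never contribute to similarity scores.
values[:, special_token_ids] = 0
print(values.shape)  # torch.Size([2, 5]): one vocabulary-sized sparse vector per input
```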
@@ -47,6 +49,16 @@ def transform_sparse_vector_to_dict(sparse_vector, id_to_token):
 model = AutoModelForMaskedLM.from_pretrained("opensearch-project/opensearch-neural-sparse-encoding-v1")
 tokenizer = AutoTokenizer.from_pretrained("opensearch-project/opensearch-neural-sparse-encoding-v1")
 
+# set the special tokens and id_to_token transform for post-process
+special_token_ids = [tokenizer.vocab[token] for token in tokenizer.special_tokens_map.values()]
+get_sparse_vector.special_token_ids = special_token_ids
+id_to_token = ["" for i in range(tokenizer.vocab_size)]
+for token, _id in tokenizer.vocab.items():
+    id_to_token[_id] = token
+transform_sparse_vector_to_dict.id_to_token = id_to_token
+
+
+
 query = "What's the weather in ny now?"
 document = "Currently New York is rainy."
 
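This setup block resolves the tokenizer's special tokens to vocabulary ids and builds the id-to-token lookup once, then attaches both to the function objects, which keeps the call signatures short without threading extra arguments through every call. (As committed, get_sparse_vector's body actually reads the module-level name special_token_ids rather than the attribute set on it; both resolve to the same list once this block has run.) For reference, a small sketch of what the special-token resolution yields; convert_tokens_to_ids is an equivalent route, and the exact token set is whatever the checkpoint ships:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("opensearch-project/opensearch-neural-sparse-encoding-v1")

# special_tokens_map maps roles to token strings; for a BERT-style vocabulary
# this is typically {'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]',
# 'cls_token': '[CLS]', 'mask_token': '[MASK]'}.
special_tokens = list(tokenizer.special_tokens_map.values())
special_token_ids = tokenizer.convert_tokens_to_ids(special_tokens)
print(dict(zip(special_tokens, special_token_ids)))
```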
@@ -59,13 +71,8 @@ sparse_vector = get_sparse_vector(feature, output)
 sim_score = torch.matmul(sparse_vector[0],sparse_vector[1])
 print(sim_score)   # tensor(22.3299, grad_fn=<DotBackward0>)
 
-# get the array to transform token id to token string
-id_to_token = ["" for i in range(tokenizer.vocab_size)]
-for token, _id in tokenizer.vocab.items():
-    id_to_token[_id] = token
-
 
-query_token_weight, document_query_token_weight = transform_sparse_vector_to_dict(sparse_vector, id_to_token)
+query_token_weight, document_query_token_weight = transform_sparse_vector_to_dict(sparse_vector)
 for token in sorted(query_token_weight, key=lambda x:query_token_weight[x], reverse=True):
     if token in document_query_token_weight:
         print("score in query: %.4f, score in document: %.4f, token: %s"%(query_token_weight[token],document_query_token_weight[token],token))
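The hunks elide the rest of transform_sparse_vector_to_dict. The visible prologue flattens every nonzero (sample, token) pair, counts how many pairs belong to each sample with bincount, and turns those counts into slice offsets via itertools.accumulate; the elided remainder presumably slices the flat token and weight lists back into one dict per input. A self-contained sketch of that pattern, with a hypothetical helper name and a completion that is not necessarily the README's exact code:

```python
import itertools
import torch

def sparse_vector_to_dicts(sparse_vector, id_to_token):
    # Hypothetical standalone variant of transform_sparse_vector_to_dict.
    sample_indices, token_indices = torch.nonzero(sparse_vector, as_tuple=True)
    non_zero_values = sparse_vector[(sample_indices, token_indices)].tolist()
    counts = torch.bincount(sample_indices).cpu().tolist()
    tokens = [id_to_token[_id] for _id in token_indices.tolist()]
    end_idxs = list(itertools.accumulate([0] + counts))
    # Slice the flat (token, weight) lists back into one dict per sample.
    return [dict(zip(tokens[end_idxs[i]:end_idxs[i + 1]],
                     non_zero_values[end_idxs[i]:end_idxs[i + 1]]))
            for i in range(len(end_idxs) - 1)]

# Toy example: two 5-slot "vocabulary" vectors and a toy id-to-token table.
vecs = torch.tensor([[0.0, 1.2, 0.0, 0.5, 0.0],
                     [0.3, 0.0, 0.0, 0.0, 0.9]])
print(sparse_vector_to_dicts(vecs, ["a", "b", "c", "d", "e"]))
# [{'b': 1.2..., 'd': 0.5...}, {'a': 0.3..., 'e': 0.9...}]
```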
@@ -94,7 +101,6 @@ for token in sorted(query_token_weight, key=lambda x:query_token_weight[x], reve
 # score in query: 0.1191, score in document: 0.1533, token: nature
 # score in query: 0.0665, score in document: 0.0600, token: temperature
 # score in query: 0.0552, score in document: 0.3396, token: windy
-
 ```
 
 The above code sample shows an example of neural sparse search. Although the original query and document share no overlapping tokens, the model still produces a good match.
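The encoding step between these hunks also sits outside the diff context. Putting the post-commit pieces together, a condensed end-to-end sketch looks roughly like the following; the tokenizer arguments are assumptions, and the pooling is inlined instead of calling the README's helpers:

```python
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

model = AutoModelForMaskedLM.from_pretrained("opensearch-project/opensearch-neural-sparse-encoding-v1")
tokenizer = AutoTokenizer.from_pretrained("opensearch-project/opensearch-neural-sparse-encoding-v1")
special_token_ids = [tokenizer.vocab[token] for token in tokenizer.special_tokens_map.values()]

query = "What's the weather in ny now?"
document = "Currently New York is rainy."

# Encode both texts in one batch; the MLM head yields batch x seq_len x vocab logits.
feature = tokenizer([query, document], padding=True, truncation=True, return_tensors="pt")
output = model(**feature)[0]

# Same pooling as the README: mask, max over sequence, log-saturate, drop special tokens.
values, _ = torch.max(output * feature["attention_mask"].unsqueeze(-1), dim=1)
sparse_vector = torch.log(1 + torch.relu(values))
sparse_vector[:, special_token_ids] = 0

# Relevance is the dot product of the two vocabulary-sized sparse vectors.
print(torch.matmul(sparse_vector[0], sparse_vector[1]))
```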