Spaces:
Sleeping
Sleeping
Nick Sorros
committed on
Commit
·
fd5a1b3
1
Parent(s):
4709571
Update tagged grants
Browse files
- tag.py +8 -4
- tagged_grants.jsonl +0 -0
tag.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import json
|
| 2 |
|
| 3 |
from transformers import AutoModel, AutoTokenizer
|
|
|
|
| 4 |
import srsly
|
| 5 |
import typer
|
| 6 |
|
|
@@ -22,11 +23,14 @@ def tag(data_path, tagged_data_path, sample_size: int = 10):
|
|
| 22 |
)
|
| 23 |
|
| 24 |
texts = [grant["title_and_description"] for grant in data]
|
| 25 |
-
|
| 26 |
-
|
| 27 |
|
| 28 |
-
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
srsly.write_jsonl(tagged_data_path, data)
|
| 32 |
|
|
|
|
| 1 |
import json
|
| 2 |
|
| 3 |
from transformers import AutoModel, AutoTokenizer
|
| 4 |
+
from tqdm import tqdm
|
| 5 |
import srsly
|
| 6 |
import typer
|
| 7 |
|
|
|
|
| 23 |
)
|
| 24 |
|
| 25 |
texts = [grant["title_and_description"] for grant in data]
|
| 26 |
+
for batch_index in tqdm(range(0, len(texts), 10)):
|
| 27 |
+
batch_texts = texts[batch_index:batch_index+10]
|
| 28 |
|
| 29 |
+
inputs = tokenizer(batch_texts, padding="max_length")
|
| 30 |
+
labels = model(**inputs, return_labels=True)
|
| 31 |
+
|
| 32 |
+
for i, tags in enumerate(labels):
|
| 33 |
+
data[i]["tags"] = tags
|
| 34 |
|
| 35 |
srsly.write_jsonl(tagged_data_path, data)
|
| 36 |
|
tagged_grants.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|