Upload tokenizer
Browse files
- tokenization_dart.py +8 -1
tokenization_dart.py
CHANGED
|
@@ -15,6 +15,12 @@ VOCAB_FILES_NAMES = {
|
|
| 15 |
"tag_category": "tag_category.json",
|
| 16 |
}
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
@dataclass
|
| 20 |
class Category:
|
|
@@ -63,6 +69,7 @@ class DartTokenizer(PreTrainedTokenizerFast):
|
|
| 63 |
"""Dart tokenizer"""
|
| 64 |
|
| 65 |
vocab_files_names = VOCAB_FILES_NAMES
|
|
|
|
| 66 |
|
| 67 |
def __init__(self, tag_category, **kwargs):
|
| 68 |
super().__init__(**kwargs)
|
|
@@ -137,7 +144,7 @@ class DartTokenizer(PreTrainedTokenizerFast):
|
|
| 137 |
input_ids: List[int],
|
| 138 |
category_mask: Optional[Dict[str, np.ndarray]] = None,
|
| 139 |
) -> Tuple[np.ndarray, Dict[str, np.ndarray]]:
|
| 140 |
-
"""Get the next token's vocab mask
|
| 141 |
|
| 142 |
if category_mask == None:
|
| 143 |
category_mask = self.category_mask
|
|
|
|
| 15 |
"tag_category": "tag_category.json",
|
| 16 |
}
|
| 17 |
|
| 18 |
+
PRETRAINED_VOCAB_FILES_MAP = {
|
| 19 |
+
"tag_category": {
|
| 20 |
+
"p1atdev/tokenizer_test_1": "https://huggingface.co/p1atdev/tokenizer_test_1/resolve/main/tag_category.json"
|
| 21 |
+
}
|
| 22 |
+
}
|
| 23 |
+
|
| 24 |
|
| 25 |
@dataclass
|
| 26 |
class Category:
|
|
|
|
| 69 |
"""Dart tokenizer"""
|
| 70 |
|
| 71 |
vocab_files_names = VOCAB_FILES_NAMES
|
| 72 |
+
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
| 73 |
|
| 74 |
def __init__(self, tag_category, **kwargs):
|
| 75 |
super().__init__(**kwargs)
|
|
|
|
| 144 |
input_ids: List[int],
|
| 145 |
category_mask: Optional[Dict[str, np.ndarray]] = None,
|
| 146 |
) -> Tuple[np.ndarray, Dict[str, np.ndarray]]:
|
| 147 |
+
"""Get the next token's vocab mask and a category mask"""
|
| 148 |
|
| 149 |
if category_mask == None:
|
| 150 |
category_mask = self.category_mask
|