burtenshaw (HF Staff) committed
Commit 6223a79 · verified · 1 Parent(s): 2f4fdec

Upload folder using huggingface_hub

Files changed (2):
  1. config.json (+1 -0)
  2. tokenizer_nanogpt.py (+32 -3)
config.json CHANGED

@@ -6,6 +6,7 @@
   "auto_map": {
     "AutoConfig": "configuration_nanogpt.NanoGPTConfig",
     "AutoModel": "modeling_nanogpt.NanoGPTModel",
+    "AutoModelForCausalLM": "modeling_nanogpt.NanoGPTModel",
     "AutoTokenizer": "tokenizer_nanogpt.NanoGPTTokenizer"
   },
   "sequence_len": 2048,
tokenizer_nanogpt.py CHANGED

@@ -1,5 +1,7 @@
 import os
 import pickle
+from huggingface_hub import hf_hub_download
+from huggingface_hub.utils import HfHubHTTPError
 
 
 class NanoGPTTokenizer:
@@ -20,9 +22,36 @@ class NanoGPTTokenizer:
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
-        tok_path = os.path.join(pretrained_model_name_or_path, "tokenizer.pkl")
-        with open(tok_path, "rb") as f:
-            enc = pickle.load(f)
+        """
+        Load tokenizer from either:
+        - Local directory path
+        - Hugging Face Hub repo ID
+        - Cached directory (handled automatically)
+        """
+        # First, try to load from local path
+        local_tok_path = os.path.join(pretrained_model_name_or_path, "tokenizer.pkl")
+
+        if os.path.isfile(local_tok_path):
+            # Local file exists, load it directly
+            with open(local_tok_path, "rb") as f:
+                enc = pickle.load(f)
+        else:
+            # Try to download from Hugging Face Hub
+            try:
+                # This handles cache automatically and returns the cached file path
+                tok_path = hf_hub_download(
+                    repo_id=pretrained_model_name_or_path,
+                    filename="tokenizer.pkl",
+                    **kwargs
+                )
+                with open(tok_path, "rb") as f:
+                    enc = pickle.load(f)
+            except (HfHubHTTPError, OSError) as e:
+                raise ValueError(
+                    f"Could not load tokenizer.pkl from {pretrained_model_name_or_path}. "
+                    f"Make sure the path exists or the repo is accessible on the Hub."
+                ) from e
+
         return cls(enc)
 
     def encode(self, text, prepend=None):
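
A brief sketch of the two code paths the updated from_pretrained now covers; the local path and repo id are placeholders, and it assumes tokenizer_nanogpt.py is importable from the working directory:

from tokenizer_nanogpt import NanoGPTTokenizer

# Local directory already containing tokenizer.pkl: loaded directly from disk.
tok_local = NanoGPTTokenizer.from_pretrained("./checkpoints/nanogpt")

# Hub repo id: tokenizer.pkl is fetched via hf_hub_download and read from the cache.
tok_hub = NanoGPTTokenizer.from_pretrained("user/nanogpt-model")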