RishiRP committed
Commit e84ddb8 · verified · 1 Parent(s): 2f8734d

Update app.py

Files changed (1)
  1. app.py +41 -41
app.py CHANGED
@@ -17,7 +17,9 @@ from transformers import (
     AutoModelForCausalLM,
     BitsAndBytesConfig,
     GenerationConfig,
+    LlamaTokenizer,  # manual fallback
 )
+from huggingface_hub import hf_hub_download
 
 # =========================
 # Global config
@@ -26,7 +28,7 @@ SPACE_CACHE = Path.home() / ".cache" / "huggingface"
 SPACE_CACHE.mkdir(parents=True, exist_ok=True)
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
-# Force slow tokenizer path by default; avoids Rust tokenizer.json parsing issues
+# Force slow path by default; avoid Rust tokenizer JSON parsing
 os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
 os.environ.setdefault("TOKENIZERS_PREFER_FAST", "false")
 
@@ -72,7 +74,7 @@ DEFAULT_LABEL_GLOSSARY = {
     "update_kyc_total_assets": "Discussion/confirmation of total assets/net worth.",
 }
 
-# Tiny multilingual fallback rules (optional) to avoid empty outputs
+# Minimal multilingual fallback rules (optional)
 DEFAULT_FALLBACK_CUES = {
     "plan_contact": [
         r"\b(get|got|will|we'?ll|i'?ll)\s+back to you\b", r"\bfollow\s*up\b", r"\breach out\b", r"\btouch base\b",
@@ -253,7 +255,6 @@ def truncate_tokens(tokenizer, text: str, max_tokens: int) -> str:
 # Cache purge for fresh downloads
 # =========================
 def _purge_repo_from_cache(repo_id: str):
-    """Delete cached files of a specific repo to guarantee a fresh download."""
    try:
        base = SPACE_CACHE
        safe = repo_id.replace("/", "--")
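The rest of _purge_repo_from_cache lies outside this hunk. A sketch of how a purge along these lines could work against the standard Hub cache layout; the recursive glob and directory check are assumptions, not the app's exact code:

import shutil
from pathlib import Path

def purge_repo_from_cache(repo_id: str, cache_dir: Path) -> None:
    # The Hub caches repos in directories like <cache>/hub/models--<org>--<name>;
    # remove every directory whose name ends with the escaped repo id.
    safe = repo_id.replace("/", "--")
    for path in cache_dir.glob(f"**/*--{safe}"):
        if path.is_dir():
            shutil.rmtree(path, ignore_errors=True)

purge_repo_from_cache("your-org/your-model", Path.home() / ".cache" / "huggingface")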
@@ -277,7 +278,7 @@ def _purge_repo_from_cache(repo_id: str):
        pass
 
 # =========================
-# HF model wrapper (robust: slow tokenizer first + load fallbacks)
+# HF model wrapper (with manual LlamaTokenizer fallback)
 # =========================
 class ModelWrapper:
     def __init__(self, repo_id: str, hf_token: Optional[str], load_in_4bit: bool, use_sdpa: bool, force_tok_redownload: bool):
@@ -290,52 +291,52 @@ class ModelWrapper:
         self.model = None
         self.load_path = "uninitialized"
 
-    def _load_tokenizer(self):
-        """
-        Prefer the slow (SentencePiece) tokenizer first to avoid Rust tokenizers JSON parsing.
-        If user asked to force fresh download, purge local cache first.
-        """
-        if self.force_tok_redownload:
-            _purge_repo_from_cache(self.repo_id)
-
-        common = dict(
-            pretrained_model_name_or_path=self.repo_id,
+    def _try_auto_tokenizer(self, use_fast: bool):
+        return AutoTokenizer.from_pretrained(
+            self.repo_id,
             token=self.hf_token,
             cache_dir=str(SPACE_CACHE),
             trust_remote_code=True,
             local_files_only=False,
             force_download=True if self.force_tok_redownload else False,
-            revision=None,
+            use_fast=use_fast,
         )
 
-        # 1) SLOW PATH FIRST
-        slow_err = None
-        tok = None
-        try:
-            tok = AutoTokenizer.from_pretrained(use_fast=False, **common)
-        except Exception as e:
-            slow_err = e
+    def _try_manual_llama_tokenizer(self):
+        # Download only tokenizer.model; ignore tokenizer.json entirely
+        sp_path = hf_hub_download(repo_id=self.repo_id, filename="tokenizer.model", token=self.hf_token, cache_dir=str(SPACE_CACHE))
+        tok = LlamaTokenizer(vocab_file=sp_path)
+        if tok.pad_token is None and tok.eos_token:
+            tok.pad_token = tok.eos_token
+        return tok
 
-        # 2) If slow somehow failed, try FAST as a last resort
-        fast_err = None
-        if tok is None:
-            try:
-                tok = AutoTokenizer.from_pretrained(use_fast=True, **common)
-            except Exception as e:
-                fast_err = e
+    def _load_tokenizer(self):
+        if self.force_tok_redownload:
+            _purge_repo_from_cache(self.repo_id)
 
-        if tok is None:
-            raise RuntimeError(f"Tokenizer failed (slow: {slow_err}) (fast: {fast_err})")
+        # 1) Slow auto
+        try:
+            tok = self._try_auto_tokenizer(use_fast=False)
+            if tok.pad_token is None and tok.eos_token:
+                tok.pad_token = tok.eos_token
+            self.load_path = "tok:AUTO_SLOW"
+            return tok
+        except Exception:
+            pass
 
+        # 2) Manual LlamaTokenizer from tokenizer.model
+        try:
+            tok = self._try_manual_llama_tokenizer()
+            self.load_path = "tok:LLAMA_SPM"
+            return tok
+        except Exception:
+            pass
+
+        # 3) Fast auto (last resort)
+        tok = self._try_auto_tokenizer(use_fast=True)  # will raise if broken
         if tok.pad_token is None and tok.eos_token:
             tok.pad_token = tok.eos_token
-
-        # Tag which path we used
-        if slow_err is None:
-            self.load_path = "tok:SLOW"
-        else:
-            self.load_path = "tok:FAST"
-
+        self.load_path = "tok:AUTO_FAST"
         return tok
 
     def load(self):
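Taken together, the new methods form a three-step chain: the slow AutoTokenizer, then a LlamaTokenizer built directly from tokenizer.model, then the fast tokenizer as a last resort. A condensed standalone sketch of the same ordering (the repo id and token are placeholders, and error handling is reduced to the bare minimum):

from typing import Optional, Tuple

from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer, LlamaTokenizer

def load_tokenizer_with_fallback(repo_id: str, token: Optional[str] = None) -> Tuple[object, str]:
    # 1) Slow (SentencePiece) AutoTokenizer: avoids parsing tokenizer.json.
    try:
        return AutoTokenizer.from_pretrained(repo_id, token=token, use_fast=False), "tok:AUTO_SLOW"
    except Exception:
        pass
    # 2) Build a LlamaTokenizer straight from tokenizer.model.
    try:
        sp_path = hf_hub_download(repo_id=repo_id, filename="tokenizer.model", token=token)
        return LlamaTokenizer(vocab_file=sp_path), "tok:LLAMA_SPM"
    except Exception:
        pass
    # 3) Fast tokenizer as a last resort; this raises if tokenizer.json is unusable.
    return AutoTokenizer.from_pretrained(repo_id, token=token, use_fast=True), "tok:AUTO_FAST"

tok, path = load_tokenizer_with_fallback("your-org/your-llama-model")
print(path, type(tok).__name__)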
@@ -989,14 +990,13 @@ with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, fill_height=True) as demo
 )
 
 if __name__ == "__main__":
-    # Optional: print environment info to logs
     try:
-        print("Torch version:", torch.__version__)
+        print("Torch:", torch.__version__)
         print("CUDA available:", torch.cuda.is_available())
         if torch.cuda.is_available():
             print("CUDA (compiled):", torch.version.cuda)
             print("Device:", torch.cuda.get_device_name(0))
-    except Exception as _:
+    except Exception:
         pass
 
     demo.launch()
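For completeness, the wrapper is driven through the constructor arguments shown above. A usage sketch, assuming load() still builds the tokenizer (and hence sets load_path) as in the previous revision; the repo id and flag values are illustrative:

import os

from app import ModelWrapper  # the Space's own module

wrapper = ModelWrapper(
    repo_id="your-org/your-llama-model",      # placeholder
    hf_token=os.environ.get("HF_TOKEN"),
    load_in_4bit=True,
    use_sdpa=True,
    force_tok_redownload=False,
)
wrapper.load()
print("Tokenizer path:", wrapper.load_path)  # tok:AUTO_SLOW, tok:LLAMA_SPM, or tok:AUTO_FAST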
 