Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
New: Completely replace lang detection
Browse files
- app.py +8 -66
- requirements.txt +2 -1
app.py
CHANGED
|
@@ -13,6 +13,7 @@ import fasttext # Sprach-ID (lid.176)
|
|
| 13 |
# damit Hugging Face/Tokenizer korrekt funktionieren (SentencePiece-Backends, Converter).
|
| 14 |
import sentencepiece # Required für SentencePiece-basierte Tokenizer (DeBERTa v3)
|
| 15 |
import tiktoken # Optionaler Converter; verhindert Fallback-Fehler/Warnungen
|
|
|
|
| 16 |
|
| 17 |
# Hugging Face / Ökosystem
|
| 18 |
import spaces
|
|
@@ -68,75 +69,16 @@ _ = model_flavours.load_state_dict(state, strict=False) # strict=True wenn Keys
|
|
| 68 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 69 |
model_flavours.to(device).eval()
|
| 70 |
|
| 71 |
-
##################################################################################
|
| 72 |
-
|
| 73 |
-
# offizielles Mirror-Repo mit lid.176.*
|
| 74 |
-
lid_path = hf_hub_download(
|
| 75 |
-
repo_id="julien-c/fasttext-language-id",
|
| 76 |
-
filename="lid.176.ftz"
|
| 77 |
-
)
|
| 78 |
-
|
| 79 |
-
lid_model = fasttext.load_model(lid_path)
|
| 80 |
-
|
| 81 |
-
# robustes predict mit NumPy-2-Fix + Fallback, falls fastText nur Labels liefert
|
| 82 |
-
def _predict_np2_compat(self, text, k=1, threshold=0.0, on_unicode_error='strict'):
|
| 83 |
-
out = self.f.predict(text, k, threshold, on_unicode_error)
|
| 84 |
-
# Fälle:
|
| 85 |
-
# 1) (labels, probs)
|
| 86 |
-
# 2) labels-only (einige Builds/SWIG-Versionen)
|
| 87 |
-
if isinstance(out, tuple) and len(out) == 2:
|
| 88 |
-
labels, probs = out
|
| 89 |
-
else:
|
| 90 |
-
labels = out
|
| 91 |
-
# sinnvolle Defaults, falls keine Wahrscheinlichkeiten vorliegen
|
| 92 |
-
if isinstance(labels, (list, tuple)):
|
| 93 |
-
probs = [1.0] * len(labels)
|
| 94 |
-
else:
|
| 95 |
-
labels = [labels]
|
| 96 |
-
probs = [1.0]
|
| 97 |
-
return labels, np.asarray(probs) # np.asarray statt np.array(copy=False)
|
| 98 |
-
|
| 99 |
-
# Instanz patchen
|
| 100 |
-
lid_model.predict = types.MethodType(_predict_np2_compat, lid_model)
|
| 101 |
-
|
| 102 |
### Check if lang is english #####################################################
|
| 103 |
-
def is_eng(text: str, k: int = 3, threshold: float = 0.1):
|
| 104 |
-
|
| 105 |
-
out = lid_model.predict(text, k=k)
|
| 106 |
-
|
| 107 |
-
# Normalisieren auf zwei Listen: labels[], probs[]
|
| 108 |
-
labels, probs = [], []
|
| 109 |
-
|
| 110 |
-
# Fall A: (labels, probs)
|
| 111 |
-
if isinstance(out, tuple) and len(out) == 2:
|
| 112 |
-
labels, probs = out
|
| 113 |
-
|
| 114 |
-
# Fall B: [(prob, '__label__xx'), ...]
|
| 115 |
-
elif (
|
| 116 |
-
isinstance(out, (list, tuple))
|
| 117 |
-
and len(out) > 0
|
| 118 |
-
and isinstance(out[0], (list, tuple))
|
| 119 |
-
and len(out[0]) == 2
|
| 120 |
-
and isinstance(out[0][1], str)
|
| 121 |
-
):
|
| 122 |
-
probs, labels = zip(*out) # entpacken
|
| 123 |
-
labels, probs = list(labels), list(probs)
|
| 124 |
-
|
| 125 |
-
# Fall C: ['__label__en', '__label__de', ...] (ohne Probs)
|
| 126 |
-
elif isinstance(out, (list, tuple)) and (len(out) == 0 or isinstance(out[0], str)):
|
| 127 |
-
labels = list(out)
|
| 128 |
-
probs = [1.0] * len(labels) # Dummy-Prob, falls nicht geliefert
|
| 129 |
-
|
| 130 |
-
else:
|
| 131 |
-
# Unbekanntes Format
|
| 132 |
-
return True, 0.0
|
| 133 |
|
| 134 |
-
|
| 135 |
-
i = labels.index("__label__en")
|
| 136 |
-
p = float(probs[i])
|
| 137 |
-
return (p >= threshold), p
|
| 138 |
|
| 139 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
|
| 141 |
### Do actual prediction #########################################################
|
| 142 |
@spaces.GPU(duration=10) # Sekunden GPU-Zeit pro Call
|
|
|
|
| 13 |
# damit Hugging Face/Tokenizer korrekt funktionieren (SentencePiece-Backends, Converter).
|
| 14 |
import sentencepiece # Required für SentencePiece-basierte Tokenizer (DeBERTa v3)
|
| 15 |
import tiktoken # Optionaler Converter; verhindert Fallback-Fehler/Warnungen
|
| 16 |
+
from langid.langid import LanguageIdentifier, model
|
| 17 |
|
| 18 |
# Hugging Face / Ökosystem
|
| 19 |
import spaces
|
|
|
|
| 69 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 70 |
model_flavours.to(device).eval()
|
| 71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
### Check if lang is english #####################################################
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
+
ID = LanguageIdentifier.from_modelstring(model, norm_probs=True)
|
|
|
|
|
|
|
|
|
|
| 75 |
|
| 76 |
+
def is_eng(text: str, min_chars: int = 6, threshold: float = 0.50):
|
| 77 |
+
t = (text or "").strip()
|
| 78 |
+
if len(t) < min_chars:
|
| 79 |
+
return True, 0.0
|
| 80 |
+
lang, prob = ID.classify(t) # prob ∈ [0,1]
|
| 81 |
+
return (lang == "en" and prob >= threshold), float(prob)
|
| 82 |
|
| 83 |
### Do actual prediction #########################################################
|
| 84 |
@spaces.GPU(duration=10) # Sekunden GPU-Zeit pro Call
|
requirements.txt
CHANGED
|
@@ -7,4 +7,5 @@ safetensors
|
|
| 7 |
sentencepiece
|
| 8 |
tiktoken
|
| 9 |
accelerate>=0.30
|
| 10 |
-
spaces
|
|
|
|
|
|
| 7 |
sentencepiece
|
| 8 |
tiktoken
|
| 9 |
accelerate>=0.30
|
| 10 |
+
spaces
|
| 11 |
+
langid
|