Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
New: Completely replace lang detection
Browse files
- app.py +8 -66
- requirements.txt +2 -1
app.py
CHANGED
|
@@ -13,6 +13,7 @@ import fasttext # Sprach-ID (lid.176)
|
|
| 13 |
# damit Hugging Face/Tokenizer korrekt funktionieren (SentencePiece-Backends, Converter).
|
| 14 |
import sentencepiece # Required für SentencePiece-basierte Tokenizer (DeBERTa v3)
|
| 15 |
import tiktoken # Optionaler Converter; verhindert Fallback-Fehler/Warnungen
|
|
|
|
| 16 |
|
| 17 |
# Hugging Face / Ökosystem
|
| 18 |
import spaces
|
|
@@ -68,75 +69,16 @@ _ = model_flavours.load_state_dict(state, strict=False) # strict=True wenn Keys
|
|
| 68 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 69 |
model_flavours.to(device).eval()
|
| 70 |
|
| 71 |
-
##################################################################################
|
| 72 |
-
|
| 73 |
-
# offizielles Mirror-Repo mit lid.176.*
|
| 74 |
-
lid_path = hf_hub_download(
|
| 75 |
-
repo_id="julien-c/fasttext-language-id",
|
| 76 |
-
filename="lid.176.ftz"
|
| 77 |
-
)
|
| 78 |
-
|
| 79 |
-
lid_model = fasttext.load_model(lid_path)
|
| 80 |
-
|
| 81 |
-
# robustes predict mit NumPy-2-Fix + Fallback, falls fastText nur Labels liefert
|
| 82 |
-
def _predict_np2_compat(self, text, k=1, threshold=0.0, on_unicode_error='strict'):
|
| 83 |
-
out = self.f.predict(text, k, threshold, on_unicode_error)
|
| 84 |
-
# Fälle:
|
| 85 |
-
# 1) (labels, probs)
|
| 86 |
-
# 2) labels-only (einige Builds/SWIG-Versionen)
|
| 87 |
-
if isinstance(out, tuple) and len(out) == 2:
|
| 88 |
-
labels, probs = out
|
| 89 |
-
else:
|
| 90 |
-
labels = out
|
| 91 |
-
# sinnvolle Defaults, falls keine Wahrscheinlichkeiten vorliegen
|
| 92 |
-
if isinstance(labels, (list, tuple)):
|
| 93 |
-
probs = [1.0] * len(labels)
|
| 94 |
-
else:
|
| 95 |
-
labels = [labels]
|
| 96 |
-
probs = [1.0]
|
| 97 |
-
return labels, np.asarray(probs) # np.asarray statt np.array(copy=False)
|
| 98 |
-
|
| 99 |
-
# Instanz patchen
|
| 100 |
-
lid_model.predict = types.MethodType(_predict_np2_compat, lid_model)
|
| 101 |
-
|
| 102 |
### Check if lang is english #####################################################
|
| 103 |
-
def is_eng(text: str, k: int = 3, threshold: float = 0.1):
|
| 104 |
-
|
| 105 |
-
out = lid_model.predict(text, k=k)
|
| 106 |
-
|
| 107 |
-
# Normalisieren auf zwei Listen: labels[], probs[]
|
| 108 |
-
labels, probs = [], []
|
| 109 |
-
|
| 110 |
-
# Fall A: (labels, probs)
|
| 111 |
-
if isinstance(out, tuple) and len(out) == 2:
|
| 112 |
-
labels, probs = out
|
| 113 |
-
|
| 114 |
-
# Fall B: [(prob, '__label__xx'), ...]
|
| 115 |
-
elif (
|
| 116 |
-
isinstance(out, (list, tuple))
|
| 117 |
-
and len(out) > 0
|
| 118 |
-
and isinstance(out[0], (list, tuple))
|
| 119 |
-
and len(out[0]) == 2
|
| 120 |
-
and isinstance(out[0][1], str)
|
| 121 |
-
):
|
| 122 |
-
probs, labels = zip(*out) # entpacken
|
| 123 |
-
labels, probs = list(labels), list(probs)
|
| 124 |
-
|
| 125 |
-
# Fall C: ['__label__en', '__label__de', ...] (ohne Probs)
|
| 126 |
-
elif isinstance(out, (list, tuple)) and (len(out) == 0 or isinstance(out[0], str)):
|
| 127 |
-
labels = list(out)
|
| 128 |
-
probs = [1.0] * len(labels) # Dummy-Prob, falls nicht geliefert
|
| 129 |
-
|
| 130 |
-
else:
|
| 131 |
-
# Unbekanntes Format
|
| 132 |
-
return True, 0.0
|
| 133 |
|
| 134 |
-
|
| 135 |
-
i = labels.index("__label__en")
|
| 136 |
-
p = float(probs[i])
|
| 137 |
-
return (p >= threshold), p
|
| 138 |
|
| 139 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
|
| 141 |
### Do actual prediction #########################################################
|
| 142 |
@spaces.GPU(duration=10) # Sekunden GPU-Zeit pro Call
|
|
|
|
| 13 |
# damit Hugging Face/Tokenizer korrekt funktionieren (SentencePiece-Backends, Converter).
|
| 14 |
import sentencepiece # Required für SentencePiece-basierte Tokenizer (DeBERTa v3)
|
| 15 |
import tiktoken # Optionaler Converter; verhindert Fallback-Fehler/Warnungen
|
| 16 |
+
from langid.langid import LanguageIdentifier, model
|
| 17 |
|
| 18 |
# Hugging Face / Ökosystem
|
| 19 |
import spaces
|
|
|
|
| 69 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 70 |
model_flavours.to(device).eval()
|
| 71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
### Check if lang is english #####################################################
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
+
ID = LanguageIdentifier.from_modelstring(model, norm_probs=True)
|
|
|
|
|
|
|
|
|
|
| 75 |
|
| 76 |
+
def is_eng(text: str, min_chars: int = 6, threshold: float = 0.50):
|
| 77 |
+
t = (text or "").strip()
|
| 78 |
+
if len(t) < min_chars:
|
| 79 |
+
return True, 0.0
|
| 80 |
+
lang, prob = ID.classify(t) # prob ∈ [0,1]
|
| 81 |
+
return (lang == "en" and prob >= threshold), float(prob)
|
| 82 |
|
| 83 |
### Do actual prediction #########################################################
|
| 84 |
@spaces.GPU(duration=10) # Sekunden GPU-Zeit pro Call
|
requirements.txt
CHANGED
|
@@ -7,4 +7,5 @@ safetensors
|
|
| 7 |
sentencepiece
|
| 8 |
tiktoken
|
| 9 |
accelerate>=0.30
|
| 10 |
-
spaces
|
|
|
|
|
|
| 7 |
sentencepiece
|
| 8 |
tiktoken
|
| 9 |
accelerate>=0.30
|
| 10 |
+
spaces
|
| 11 |
+
langid
|