Spaces:
Running on CPU Upgrade

ziem-io committed on
Commit
b6ecc6e
·
1 Parent(s): 325ed03

New: Completely replace lang detection

Browse files
Files changed (2) hide show
  1. app.py +8 -66
  2. requirements.txt +2 -1
app.py CHANGED
@@ -13,6 +13,7 @@ import fasttext # Sprach-ID (lid.176)
13
  # damit Hugging Face/Tokenizer korrekt funktionieren (SentencePiece-Backends, Converter).
14
  import sentencepiece # Required für SentencePiece-basierte Tokenizer (DeBERTa v3)
15
  import tiktoken # Optionaler Converter; verhindert Fallback-Fehler/Warnungen
 
16
 
17
  # Hugging Face / Ökosystem
18
  import spaces
@@ -68,75 +69,16 @@ _ = model_flavours.load_state_dict(state, strict=False) # strict=True wenn Keys
68
  device = "cuda" if torch.cuda.is_available() else "cpu"
69
  model_flavours.to(device).eval()
70
 
71
- ##################################################################################
72
-
73
- # offizielles Mirror-Repo mit lid.176.*
74
- lid_path = hf_hub_download(
75
- repo_id="julien-c/fasttext-language-id",
76
- filename="lid.176.ftz"
77
- )
78
-
79
- lid_model = fasttext.load_model(lid_path)
80
-
81
- # robustes predict mit NumPy-2-Fix + Fallback, falls fastText nur Labels liefert
82
- def _predict_np2_compat(self, text, k=1, threshold=0.0, on_unicode_error='strict'):
83
- out = self.f.predict(text, k, threshold, on_unicode_error)
84
- # Fälle:
85
- # 1) (labels, probs)
86
- # 2) labels-only (einige Builds/SWIG-Versionen)
87
- if isinstance(out, tuple) and len(out) == 2:
88
- labels, probs = out
89
- else:
90
- labels = out
91
- # sinnvolle Defaults, falls keine Wahrscheinlichkeiten vorliegen
92
- if isinstance(labels, (list, tuple)):
93
- probs = [1.0] * len(labels)
94
- else:
95
- labels = [labels]
96
- probs = [1.0]
97
- return labels, np.asarray(probs) # np.asarray statt np.array(copy=False)
98
-
99
- # Instanz patchen
100
- lid_model.predict = types.MethodType(_predict_np2_compat, lid_model)
101
-
102
  ### Check if lang is english #####################################################
103
- def is_eng(text: str, k: int = 3, threshold: float = 0.1):
104
-
105
- out = lid_model.predict(text, k=k)
106
-
107
- # Normalisieren auf zwei Listen: labels[], probs[]
108
- labels, probs = [], []
109
-
110
- # Fall A: (labels, probs)
111
- if isinstance(out, tuple) and len(out) == 2:
112
- labels, probs = out
113
-
114
- # Fall B: [(prob, '__label__xx'), ...]
115
- elif (
116
- isinstance(out, (list, tuple))
117
- and len(out) > 0
118
- and isinstance(out[0], (list, tuple))
119
- and len(out[0]) == 2
120
- and isinstance(out[0][1], str)
121
- ):
122
- probs, labels = zip(*out) # entpacken
123
- labels, probs = list(labels), list(probs)
124
-
125
- # Fall C: ['__label__en', '__label__de', ...] (ohne Probs)
126
- elif isinstance(out, (list, tuple)) and (len(out) == 0 or isinstance(out[0], str)):
127
- labels = list(out)
128
- probs = [1.0] * len(labels) # Dummy-Prob, falls nicht geliefert
129
-
130
- else:
131
- # Unbekanntes Format
132
- return True, 0.0
133
 
134
- if "__label__en" in labels:
135
- i = labels.index("__label__en")
136
- p = float(probs[i])
137
- return (p >= threshold), p
138
 
139
- return False, 0.0
 
 
 
 
 
140
 
141
  ### Do actual prediction #########################################################
142
  @spaces.GPU(duration=10) # Sekunden GPU-Zeit pro Call
 
13
  # damit Hugging Face/Tokenizer korrekt funktionieren (SentencePiece-Backends, Converter).
14
  import sentencepiece # Required für SentencePiece-basierte Tokenizer (DeBERTa v3)
15
  import tiktoken # Optionaler Converter; verhindert Fallback-Fehler/Warnungen
16
+ from langid.langid import LanguageIdentifier, model
17
 
18
  # Hugging Face / Ökosystem
19
  import spaces
 
69
  device = "cuda" if torch.cuda.is_available() else "cpu"
70
  model_flavours.to(device).eval()
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  ### Check if lang is english #####################################################
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
+ ID = LanguageIdentifier.from_modelstring(model, norm_probs=True)  # shared langid classifier; norm_probs=True makes classify() return probabilities in [0, 1]
 
 
 
75
 
76
def is_eng(text: str, min_chars: int = 6, threshold: float = 0.50):
    """Return (is_english, probability) for *text* via the langid classifier.

    Inputs shorter than *min_chars* (after stripping) carry too little signal
    for reliable detection, so they are accepted as English with probability 0.0.
    """
    stripped = (text or "").strip()
    if len(stripped) < min_chars:
        # Too short to classify reliably -> treat as English by default.
        return True, 0.0
    detected, confidence = ID.classify(stripped)  # confidence is in [0, 1]
    accepted = detected == "en" and confidence >= threshold
    return accepted, float(confidence)
82
 
83
  ### Do actual prediction #########################################################
84
  @spaces.GPU(duration=10) # Sekunden GPU-Zeit pro Call
requirements.txt CHANGED
@@ -7,4 +7,5 @@ safetensors
7
  sentencepiece
8
  tiktoken
9
  accelerate>=0.30
10
- spaces
 
 
7
  sentencepiece
8
  tiktoken
9
  accelerate>=0.30
10
+ spaces
11
+ langid