Spaces:
Running
Running
Multiple space and \n characters fix
Browse files
app.py
CHANGED
|
@@ -2,6 +2,10 @@ import gradio as gr
|
|
| 2 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 3 |
import torch
|
| 4 |
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
model1_path = "modernbert.bin"
|
| 6 |
model2_path = "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed12"
|
| 7 |
model3_path = "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed22"
|
|
@@ -35,22 +39,21 @@ label_mapping = {
|
|
| 35 |
39: 'text-davinci-002', 40: 'text-davinci-003'
|
| 36 |
}
|
| 37 |
|
| 38 |
-
def clean_text(text):
|
| 39 |
-
|
| 40 |
-
text =
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
text = re.sub(r"\n\s*\n+", "\n\n", text)
|
| 44 |
-
|
| 45 |
-
text = re.sub(r"[ \t]+", " ", text)
|
| 46 |
|
| 47 |
-
text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
|
| 48 |
|
| 49 |
-
|
|
|
|
| 50 |
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
def classify_text(text):
|
| 56 |
cleaned_text = clean_text(text)
|
|
@@ -60,7 +63,7 @@ def classify_text(text):
|
|
| 60 |
)
|
| 61 |
return result_message
|
| 62 |
|
| 63 |
-
inputs = tokenizer(
|
| 64 |
|
| 65 |
with torch.no_grad():
|
| 66 |
logits_1 = model_1(**inputs).logits
|
|
|
|
| 2 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 3 |
import torch
|
| 4 |
import re
|
| 5 |
+
from tokenizers import normalizers
|
| 6 |
+
from tokenizers.normalizers import Sequence, Replace, Strip
|
| 7 |
+
from tokenizers import Regex
|
| 8 |
+
|
| 9 |
model1_path = "modernbert.bin"
|
| 10 |
model2_path = "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed12"
|
| 11 |
model3_path = "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed22"
|
|
|
|
| 39 |
39: 'text-davinci-002', 40: 'text-davinci-003'
|
| 40 |
}
|
| 41 |
|
| 42 |
+
def clean_text(text: str) -> str:
|
| 43 |
+
text = re.sub(r'\s{2,}', ' ', text)
|
| 44 |
+
text = re.sub(r'\s+([,.;:?!])', r'\1', text)
|
| 45 |
+
return text
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
|
|
|
| 47 |
|
| 48 |
+
newline_to_space = Replace(Regex(r'\s*\n\s*'), " ")
|
| 49 |
+
join_hyphen_break = Replace(Regex(r'(\w+)[-‐‑–—]\s*\n\s*(\w+)'), r"\1\2")
|
| 50 |
|
| 51 |
+
tokenizer.backend_tokenizer.normalizer = Sequence([
|
| 52 |
+
tokenizer.backend_tokenizer.normalizer,
|
| 53 |
+
join_hyphen_break,
|
| 54 |
+
newline_to_space,
|
| 55 |
+
Strip()
|
| 56 |
+
])
|
| 57 |
|
| 58 |
def classify_text(text):
|
| 59 |
cleaned_text = clean_text(text)
|
|
|
|
| 63 |
)
|
| 64 |
return result_message
|
| 65 |
|
| 66 |
+
inputs = tokenizer(cleaned_text, return_tensors="pt", truncation=True, padding=True).to(device)
|
| 67 |
|
| 68 |
with torch.no_grad():
|
| 69 |
logits_1 = model_1(**inputs).logits
|