Update app.py
app.py CHANGED
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 import gradio as gr
 import requests
 import json
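(The added coding declaration is harmless but redundant on Python 3, where UTF-8 is already the default source encoding; the Greek string literals below would parse without it.)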
@@ -32,7 +33,7 @@ def fetch_splits(dataset_name):
             "viewer_template": f"https://huggingface.co/datasets/{dataset_name}/embed/viewer/{{config}}/{{split}}"
         }
     except Exception as e:
-        raise gr.Error(f"Σφάλμα
+        raise gr.Error(f"Σφάλμα κατά την ανάκτηση των splits: {str(e)}")
 
 def update_components(dataset_name):
     if not dataset_name:
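The body of fetch_splits sits outside these hunks. For orientation, here is a minimal sketch of what it presumably does, assuming the public datasets-server /splits endpoint; everything except the viewer_template line (taken from the context above) is inferred rather than read from the commit:

    import requests

    def fetch_splits(dataset_name):
        # Ask the datasets-server API for the dataset's configs and splits.
        resp = requests.get(
            "https://datasets-server.huggingface.co/splits",
            params={"dataset": dataset_name},
            timeout=30,
        )
        resp.raise_for_status()
        # Group split names under their config, matching the
        # splits_data['splits'] lookup in update_components below.
        splits = {}
        for item in resp.json()["splits"]:
            splits.setdefault(item["config"], []).append(item["split"])
        return {
            "splits": splits,
            "viewer_template": f"https://huggingface.co/datasets/{dataset_name}/embed/viewer/{{config}}/{{split}}",
        }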
@@ -42,7 +43,6 @@ def update_components(dataset_name):
     splits_data = fetch_splits(dataset_name)
     config_choices = list(splits_data['splits'].keys())
 
-    # Create an iframe preview for the first config
     first_config = config_choices[0] if config_choices else None
     iframe_html = f"""
     <iframe
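Note that the doubled braces in viewer_template survive f-string interpolation as literal {config}/{split} placeholders, so the template can be filled in later, for example (hypothetical values, borrowed from the Config label further down):

    url = splits_data["viewer_template"].format(config="20231101.el", split="train")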
@@ -84,26 +84,42 @@ def create_iterator(dataset_name, config, split):
     except Exception as e:
         raise gr.Error(f"Σφάλμα φόρτωσης dataset: {str(e)}")
 
-def train_and_test(dataset_name, config, split, vocab_size, min_freq, test_text):
-    # Training and validation logic
+def train_and_test(dataset_name, config, split, vocab_size, min_freq, test_text, custom_files):
     try:
-
+        dataset_iterator = create_iterator(dataset_name, config, split)
+
+        # Combine the streaming-dataset iterator with the custom files
+        def combined_iterator():
+            # Data from the streaming dataset
+            for text in dataset_iterator:
+                if text:
+                    yield text
+            # Data from the custom files (a list of file paths is expected)
+            if custom_files:
+                for file_path in custom_files:
+                    try:
+                        with open(file_path, 'r', encoding='utf-8') as f:
+                            content = f.read()
+                            if content:
+                                yield content
+                    except Exception as file_error:
+                        print(f"Σφάλμα ανάγνωσης αρχείου {file_path}: {file_error}")
 
         with gr.Progress() as progress:
             progress(0.2, desc="Δημιουργία tokenizer...")
-            tokenizer = train_tokenizer(
+            tokenizer = train_tokenizer(combined_iterator(), vocab_size, min_freq)
 
-            # Save and load the tokenizer
+            # Save and reload the trained tokenizer
             with tempfile.NamedTemporaryFile(delete=False, suffix=".json") as f:
                 tokenizer.save(f.name)
                 trained_tokenizer = Tokenizer.from_file(f.name)
                 os.unlink(f.name)
 
-            # Validation
+            # Validation: encode and decode the test text
             encoded = trained_tokenizer.encode(test_text)
             decoded = trained_tokenizer.decode(encoded.ids)
 
-            # Create the plot
+            # Plot the distribution of token lengths
            token_lengths = [len(t) for t in encoded.tokens]
             fig = plt.figure()
             plt.hist(token_lengths, bins=20)
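Two helpers called here are defined outside the hunks. create_iterator evidently wraps a streaming dataset (the hunk header and the "Σφάλμα φόρτωσης dataset" handler above are its tail); a minimal sketch, assuming it streams via datasets.load_dataset and yields each example's text field:

    from datasets import load_dataset

    def create_iterator(dataset_name, config, split):
        try:
            # Stream examples rather than downloading the whole dataset.
            dataset = load_dataset(dataset_name, config, split=split, streaming=True)
            for example in dataset:
                yield example.get("text", "")
        except Exception as e:
            raise gr.Error(f"Σφάλμα φόρτωσης dataset: {str(e)}")

train_tokenizer is likewise not shown, but the surrounding calls (tokenizer.save, Tokenizer.from_file, encode(...).ids and .tokens) match the Hugging Face tokenizers library, so it plausibly looks like the following; the BPE model, pre-tokenizer, and special tokens are assumptions, not taken from the commit:

    from tokenizers import Tokenizer, models, pre_tokenizers, trainers

    def train_tokenizer(text_iterator, vocab_size, min_freq):
        # BPE with whitespace pre-tokenization is an assumption; the commit
        # only changes the call site to pass the combined iterator.
        tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_freq,
            special_tokens=["[UNK]", "[PAD]"],
        )
        tokenizer.train_from_iterator(text_iterator, trainer=trainer)
        return tokenizer

One caveat in the surrounding code: `with gr.Progress() as progress:` is not the documented Gradio pattern; progress tracking is normally requested by declaring a `progress=gr.Progress()` parameter on the event handler and calling `progress(0.2, desc=...)` directly.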
@@ -135,7 +151,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                 placeholder="π.χ. 'wikimedia/wikipedia'"
             )
             config = gr.Dropdown(
-                label="Config",
+                label="Config (π.χ. '20231101.el' για ελληνικά ή '20231101.en' για αγγλικά)",
                 choices=[],
                 interactive=True
             )
@@ -150,6 +166,11 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                 value='Η Ακρόπολη είναι σύμβολο της αρχαίας ελληνικής πολιτισμικής κληρονομιάς.',
                 label="Test Text"
             )
+            custom_files = gr.File(
+                label="Προσαρμοσμένα Ελληνικά Κείμενα",
+                file_count="multiple",
+                type="file"
+            )
             train_btn = gr.Button("Εκπαίδευση", variant="primary")
 
             with gr.Column():
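A caveat on the new upload component: combined_iterator opens each entry with open(file_path, ...), which needs plain path strings. `type="file"` is the legacy Gradio 3 value that returns tempfile wrappers (the path lives on their `.name` attribute), and Gradio 4 releases no longer accept it. If the Space runs Gradio 4, the component would need to be written as follows, sketched under that version assumption:

    custom_files = gr.File(
        label="Προσαρμοσμένα Ελληνικά Κείμενα",
        file_count="multiple",
        type="filepath",  # Gradio 4: hands the callback a list of path strings
    )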
@@ -172,7 +193,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
 
     train_btn.click(
         fn=train_and_test,
-        inputs=[dataset_name, config, split, vocab_size, min_freq, test_text],
+        inputs=[dataset_name, config, split, vocab_size, min_freq, test_text, custom_files],
         outputs=[results_json, results_plot]
     )
 