Spaces:
Sleeping
Sleeping
edbeeching
committed on
Commit
·
30e16b4
1
Parent(s):
174a6bc
improving docs
Browse files
app.py
CHANGED
|
@@ -119,17 +119,9 @@ def fetch_model_generation_params(model_name: str) -> dict:
|
|
| 119 |
"recommended_max_tokens": recommended_max_tokens
|
| 120 |
}
|
| 121 |
else:
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
elif "llama" in model_name.lower():
|
| 126 |
-
params = {"max_tokens": recommended_max_tokens, "temperature": 0.6, "top_k": 40, "top_p": 0.9, "max_position_embeddings": max_position_embeddings, "recommended_max_tokens": recommended_max_tokens}
|
| 127 |
-
elif "ernie" in model_name.lower():
|
| 128 |
-
params = {"max_tokens": min(recommended_max_tokens, 1024), "temperature": 0.7, "top_k": 50, "top_p": 0.95, "max_position_embeddings": max_position_embeddings, "recommended_max_tokens": recommended_max_tokens}
|
| 129 |
-
else:
|
| 130 |
-
params = dict(default_params)
|
| 131 |
-
params["max_position_embeddings"] = max_position_embeddings
|
| 132 |
-
params["recommended_max_tokens"] = recommended_max_tokens
|
| 133 |
|
| 134 |
# Ensure parameters are within valid ranges
|
| 135 |
params["max_tokens"] = max(256, min(params["max_tokens"], MAX_TOKENS, params["recommended_max_tokens"]))
|
|
@@ -280,7 +272,7 @@ def validate_request(request: GenerationRequest, oauth_token: Optional[Union[gr.
|
|
| 280 |
# This is currently not supported, the output dataset will be created under the org 'synthetic-data-universe'
|
| 281 |
# check output_dataset name is valid
|
| 282 |
if request.output_dataset_name.count("/") != 1:
|
| 283 |
-
raise Exception("Output dataset
|
| 284 |
|
| 285 |
# check the output dataset is valid and accessible with the provided token
|
| 286 |
try:
|
|
@@ -420,13 +412,8 @@ def load_dataset_info(dataset_name, model_name, oauth_token=None, dataset_token=
|
|
| 420 |
dataset_base_name = dataset_name.split('/')[-1] if '/' in dataset_name else dataset_name
|
| 421 |
|
| 422 |
# Extract model short name (e.g., "Qwen/Qwen3-4B-Instruct-2507" -> "qwen3-4b")
|
| 423 |
-
model_short_name = model_name.split('/')[-1]
|
| 424 |
# Remove common suffixes and simplify
|
| 425 |
-
model_short_name = model_short_name.replace('-instruct', '').replace('-2507', '').replace('_', '-')
|
| 426 |
-
# Take first part if it's still long
|
| 427 |
-
if len(model_short_name) > 15:
|
| 428 |
-
parts = model_short_name.split('-')
|
| 429 |
-
model_short_name = '-'.join(parts[:2]) if len(parts) > 1 else parts[0][:15]
|
| 430 |
|
| 431 |
# Build the output name: username-model-dataset
|
| 432 |
suggested_output_name = f"{username}-{model_short_name}-{dataset_base_name}"
|
|
@@ -508,7 +495,7 @@ def add_request_to_db(request: GenerationRequest):
|
|
| 508 |
|
| 509 |
supabase.table("gen-requests").insert(data).execute()
|
| 510 |
except Exception as e:
|
| 511 |
-
raise Exception("Failed to add request to database")
|
| 512 |
|
| 513 |
|
| 514 |
|
|
@@ -577,27 +564,79 @@ def main():
|
|
| 577 |
gr.Markdown("# Synthetic Data Generation Request")
|
| 578 |
with gr.Row():
|
| 579 |
gr.Markdown("""
|
| 580 |
-
|
| 581 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 582 |
""")
|
| 583 |
-
with gr.Accordion("
|
| 584 |
with gr.Row():
|
| 585 |
gr.Markdown("""
|
| 586 |
-
|
| 587 |
-
1.
|
| 588 |
-
2.
|
| 589 |
-
3.
|
| 590 |
-
4.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 591 |
""")
|
| 592 |
-
gr.Markdown("""
|
| 593 |
-
|
| 594 |
-
**
|
| 595 |
-
-
|
| 596 |
-
-
|
| 597 |
-
|
| 598 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 599 |
""")
|
| 600 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 601 |
with gr.Tabs():
|
| 602 |
with gr.TabItem("Generate Synthetic Data"):
|
| 603 |
with gr.Row():
|
|
@@ -614,16 +653,16 @@ def main():
|
|
| 614 |
)
|
| 615 |
# model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
|
| 616 |
with gr.Row():
|
| 617 |
-
system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.")
|
| 618 |
gr.Markdown("### Generation Parameters")
|
| 619 |
with gr.Row():
|
| 620 |
with gr.Column():
|
| 621 |
with gr.Row():
|
| 622 |
-
max_tokens = gr.Slider(label="Max Tokens", value=1024, minimum=256, maximum=MAX_TOKENS, step=256)
|
| 623 |
-
temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.1)
|
| 624 |
with gr.Row():
|
| 625 |
-
top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5)
|
| 626 |
-
top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
|
| 627 |
|
| 628 |
with gr.Column():
|
| 629 |
with gr.Group():
|
|
@@ -632,7 +671,7 @@ def main():
|
|
| 632 |
user_limit_info = gr.Markdown(value="π€ **Anonymous User**: You can generate up to 100 samples per request. Use the sign-in button above for PRO benefits (10,000 samples).", visible=True)
|
| 633 |
with gr.Row():
|
| 634 |
with gr.Column():
|
| 635 |
-
input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1")
|
| 636 |
load_info_btn = gr.Button("π Load Dataset Info", size="sm", variant="secondary")
|
| 637 |
load_info_status = gr.Markdown("", visible=True)
|
| 638 |
|
|
|
|
| 119 |
"recommended_max_tokens": recommended_max_tokens
|
| 120 |
}
|
| 121 |
else:
|
| 122 |
+
params = dict(default_params)
|
| 123 |
+
params["max_position_embeddings"] = max_position_embeddings
|
| 124 |
+
params["recommended_max_tokens"] = recommended_max_tokens
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
|
| 126 |
# Ensure parameters are within valid ranges
|
| 127 |
params["max_tokens"] = max(256, min(params["max_tokens"], MAX_TOKENS, params["recommended_max_tokens"]))
|
|
|
|
| 272 |
# This is currently not supported, the output dataset will be created under the org 'synthetic-data-universe'
|
| 273 |
# check output_dataset name is valid
|
| 274 |
if request.output_dataset_name.count("/") != 1:
|
| 275 |
+
raise Exception("Output dataset will be populated automatically. The dataset will be created under the org 'synthetic-data-universe/my-dataset'.")
|
| 276 |
|
| 277 |
# check the output dataset is valid and accessible with the provided token
|
| 278 |
try:
|
|
|
|
| 412 |
dataset_base_name = dataset_name.split('/')[-1] if '/' in dataset_name else dataset_name
|
| 413 |
|
| 414 |
# Extract model short name (e.g., "Qwen/Qwen3-4B-Instruct-2507" -> "qwen3-4b")
|
| 415 |
+
model_short_name = model_name.split('/')[-1]
|
| 416 |
# Remove common suffixes and simplify
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 417 |
|
| 418 |
# Build the output name: username-model-dataset
|
| 419 |
suggested_output_name = f"{username}-{model_short_name}-{dataset_base_name}"
|
|
|
|
| 495 |
|
| 496 |
supabase.table("gen-requests").insert(data).execute()
|
| 497 |
except Exception as e:
|
| 498 |
+
raise Exception(f"Failed to add request to database: {str(e)}")
|
| 499 |
|
| 500 |
|
| 501 |
|
|
|
|
| 564 |
gr.Markdown("# Synthetic Data Generation Request")
|
| 565 |
with gr.Row():
|
| 566 |
gr.Markdown("""
|
| 567 |
+
π **Generate high-quality synthetic data using state-of-the-art language models!** Perfect for training datasets, data augmentation, and research experiments.
|
| 568 |
+
|
| 569 |
+
β¨ **Features:**
|
| 570 |
+
- π **Free for PRO users** - Uses idle GPUs on the HF science cluster
|
| 571 |
+
- π€ **20+ Popular Models** - Including Qwen, Llama, Mistral, and more
|
| 572 |
+
- β‘ **Fast Processing** - Optimized for batch generation
|
| 573 |
+
- π **Up to 10K samples** - For PRO users (100 for free users)
|
| 574 |
+
|
| 575 |
+
β οΈ **Important:** All generated datasets are **PUBLIC** and available under [synthetic-data-universe](https://huggingface.co/synthetic-data-universe).
|
| 576 |
""")
|
| 577 |
+
with gr.Accordion("π Complete Usage Guide", open=False):
|
| 578 |
with gr.Row():
|
| 579 |
gr.Markdown("""
|
| 580 |
+
**π Step-by-Step Process:**
|
| 581 |
+
1. **π Load Dataset**: Enter a Hugging Face dataset name (e.g., `simplescaling/s1K-1.1`)
|
| 582 |
+
2. **π Load Info**: Click "π Load Dataset Info" to populate configs, columns, and splits
|
| 583 |
+
3. **π€ Choose Model**: Select from 20+ popular instruction-tuned models
|
| 584 |
+
4. **βοΈ Configure**: Set generation parameters (temperature, tokens, etc.)
|
| 585 |
+
5. **π Submit**: Click submit and monitor progress in the Statistics tab
|
| 586 |
+
|
| 587 |
+
**π‘ Pro Tips:**
|
| 588 |
+
- Use temperature 0.7-1.0 for creative tasks, 0.1-0.3 for factual content
|
| 589 |
+
- Start with fewer samples to test your prompt before scaling up
|
| 590 |
+
- Check existing datasets in [synthetic-data-universe](https://huggingface.co/synthetic-data-universe) for inspiration
|
| 591 |
""")
|
| 592 |
+
gr.Markdown("""
|
| 593 |
+
**π Requirements & Limits:**
|
| 594 |
+
- β
Input dataset must be **publicly accessible** on HF Hub
|
| 595 |
+
- β
Model must be **publicly accessible** (not gated)
|
| 596 |
+
- π **Sample Limits:**
|
| 597 |
+
- π Free users: 100 samples max
|
| 598 |
+
- β PRO users: 10,000 samples max
|
| 599 |
+
- π€ **Token Limit:** 8,192 generated tokens per sample
|
| 600 |
+
- β±οΈ **Processing Time:** Varies by model size and queue status
|
| 601 |
+
|
| 602 |
+
**π Privacy & Usage:**
|
| 603 |
+
- All outputs are **PUBLIC** on Hugging Face Hub
|
| 604 |
+
- Datasets appear under `synthetic-data-universe` organization
|
| 605 |
+
- Perfect for research, training data, and open-source projects
|
| 606 |
""")
|
| 607 |
+
|
| 608 |
+
with gr.Accordion("π‘ Examples & Use Cases", open=False):
|
| 609 |
+
gr.Markdown("""
|
| 610 |
+
**π― Popular Use Cases:**
|
| 611 |
+
|
| 612 |
+
**π Educational Content Generation**
|
| 613 |
+
- Input: Questions dataset β Output: Detailed explanations and answers
|
| 614 |
+
- Models: `Qwen/Qwen3-4B-Instruct-2507` or `microsoft/Phi-3.5-mini-instruct`
|
| 615 |
+
- Temperature: 0.3-0.5 for factual accuracy
|
| 616 |
+
|
| 617 |
+
**π¬ Conversational Data**
|
| 618 |
+
- Input: Conversation starters β Output: Multi-turn dialogues
|
| 619 |
+
- Models: `meta-llama/Llama-3.2-3B-Instruct` or `mistralai/Mistral-7B-Instruct-v0.3`
|
| 620 |
+
- Temperature: 0.7-0.9 for natural variety
|
| 621 |
+
|
| 622 |
+
**π§ Code Generation**
|
| 623 |
+
- Input: Problem descriptions β Output: Code solutions with explanations
|
| 624 |
+
- Models: `Qwen/Qwen2.5-Coder-3B-Instruct` or `deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct`
|
| 625 |
+
- Temperature: 0.1-0.3 for accurate code
|
| 626 |
+
|
| 627 |
+
**π Creative Writing**
|
| 628 |
+
- Input: Story prompts β Output: Creative narratives
|
| 629 |
+
- Models: `meta-llama/Llama-3.2-3B-Instruct` or `mistralai/Mistral-7B-Instruct-v0.3`
|
| 630 |
+
- Temperature: 0.8-1.2 for creativity
|
| 631 |
+
|
| 632 |
+
**π Example Dataset Names to Try:**
|
| 633 |
+
```
|
| 634 |
+
simplescaling/s1K-1.1 # Simple Q&A pairs
|
| 635 |
+
HuggingFaceH4/ultrachat_200k # Conversations
|
| 636 |
+
iamtarun/python_code_instructions_18k_alpaca # Code tasks
|
| 637 |
+
```
|
| 638 |
+
""")
|
| 639 |
+
|
| 640 |
with gr.Tabs():
|
| 641 |
with gr.TabItem("Generate Synthetic Data"):
|
| 642 |
with gr.Row():
|
|
|
|
| 653 |
)
|
| 654 |
# model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
|
| 655 |
with gr.Row():
|
| 656 |
+
system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.", info="Sets the AI's role/behavior. Leave empty for default model behavior.")
|
| 657 |
gr.Markdown("### Generation Parameters")
|
| 658 |
with gr.Row():
|
| 659 |
with gr.Column():
|
| 660 |
with gr.Row():
|
| 661 |
+
max_tokens = gr.Slider(label="Max Tokens", value=1024, minimum=256, maximum=MAX_TOKENS, step=256, info="Maximum tokens to generate per sample. Higher = longer responses.")
|
| 662 |
+
temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.1, info="Creativity level: 0.1=focused, 0.7=balanced, 1.0+=creative")
|
| 663 |
with gr.Row():
|
| 664 |
+
top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5, info="Limits word choices to top K options. Lower = more focused.")
|
| 665 |
+
top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05, info="Nucleus sampling: 0.9=focused, 0.95=balanced diversity")
|
| 666 |
|
| 667 |
with gr.Column():
|
| 668 |
with gr.Group():
|
|
|
|
| 671 |
user_limit_info = gr.Markdown(value="π€ **Anonymous User**: You can generate up to 100 samples per request. Use the sign-in button above for PRO benefits (10,000 samples).", visible=True)
|
| 672 |
with gr.Row():
|
| 673 |
with gr.Column():
|
| 674 |
+
input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1", info="Public HF dataset with prompts to generate from")
|
| 675 |
load_info_btn = gr.Button("π Load Dataset Info", size="sm", variant="secondary")
|
| 676 |
load_info_status = gr.Markdown("", visible=True)
|
| 677 |
|