edbeeching committed on
Commit
30e16b4
Β·
1 Parent(s): 174a6bc

improving docs

Browse files
Files changed (1) hide show
  1. app.py +80 -41
app.py CHANGED
@@ -119,17 +119,9 @@ def fetch_model_generation_params(model_name: str) -> dict:
119
  "recommended_max_tokens": recommended_max_tokens
120
  }
121
  else:
122
- # Use model-specific defaults based on model name
123
- if "qwen" in model_name.lower():
124
- params = {"max_tokens": recommended_max_tokens, "temperature": 0.7, "top_k": 50, "top_p": 0.8, "max_position_embeddings": max_position_embeddings, "recommended_max_tokens": recommended_max_tokens}
125
- elif "llama" in model_name.lower():
126
- params = {"max_tokens": recommended_max_tokens, "temperature": 0.6, "top_k": 40, "top_p": 0.9, "max_position_embeddings": max_position_embeddings, "recommended_max_tokens": recommended_max_tokens}
127
- elif "ernie" in model_name.lower():
128
- params = {"max_tokens": min(recommended_max_tokens, 1024), "temperature": 0.7, "top_k": 50, "top_p": 0.95, "max_position_embeddings": max_position_embeddings, "recommended_max_tokens": recommended_max_tokens}
129
- else:
130
- params = dict(default_params)
131
- params["max_position_embeddings"] = max_position_embeddings
132
- params["recommended_max_tokens"] = recommended_max_tokens
133
 
134
  # Ensure parameters are within valid ranges
135
  params["max_tokens"] = max(256, min(params["max_tokens"], MAX_TOKENS, params["recommended_max_tokens"]))
@@ -280,7 +272,7 @@ def validate_request(request: GenerationRequest, oauth_token: Optional[Union[gr.
280
  # This is currently not supported, the output dataset will be created under the org 'synthetic-data-universe'
281
  # check output_dataset name is valid
282
  if request.output_dataset_name.count("/") != 1:
283
- raise Exception("Output dataset name must be in the format 'dataset_name', e.g., 'my-dataset'. The dataset will be created under the org 'synthetic-data-universe/my-dataset'.")
284
 
285
  # check the output dataset is valid and accessible with the provided token
286
  try:
@@ -420,13 +412,8 @@ def load_dataset_info(dataset_name, model_name, oauth_token=None, dataset_token=
420
  dataset_base_name = dataset_name.split('/')[-1] if '/' in dataset_name else dataset_name
421
 
422
  # Extract model short name (e.g., "Qwen/Qwen3-4B-Instruct-2507" -> "qwen3-4b")
423
- model_short_name = model_name.split('/')[-1].lower()
424
  # Remove common suffixes and simplify
425
- model_short_name = model_short_name.replace('-instruct', '').replace('-2507', '').replace('_', '-')
426
- # Take first part if it's still long
427
- if len(model_short_name) > 15:
428
- parts = model_short_name.split('-')
429
- model_short_name = '-'.join(parts[:2]) if len(parts) > 1 else parts[0][:15]
430
 
431
  # Build the output name: username-model-dataset
432
  suggested_output_name = f"{username}-{model_short_name}-{dataset_base_name}"
@@ -508,7 +495,7 @@ def add_request_to_db(request: GenerationRequest):
508
 
509
  supabase.table("gen-requests").insert(data).execute()
510
  except Exception as e:
511
- raise Exception("Failed to add request to database")
512
 
513
 
514
 
@@ -577,27 +564,79 @@ def main():
577
  gr.Markdown("# Synthetic Data Generation Request")
578
  with gr.Row():
579
  gr.Markdown("""
580
- Welcome to the Synthetic Data Generation service! This tool allows you to generate synthetic data using large language models. Generation is FREE for Hugging Face PRO users and uses idle GPUs on the HF science cluster.\n
581
- Outputs from this service will be PUBLIC and available on the Hugging Face Hub under the organization [synthetic-data-universe](https://huggingface.co/synthetic-data-universe).\n
 
 
 
 
 
 
 
582
  """)
583
- with gr.Accordion("More Information", open=False):
584
  with gr.Row():
585
  gr.Markdown("""
586
- **How it works:**
587
- 1. Provide an input dataset with prompts
588
- 2. Select a public language model for generation
589
- 3. Configure generation parameters
590
- 4. Submit your request.
 
 
 
 
 
 
591
  """)
592
- gr.Markdown("""
593
-
594
- **Requirements:**
595
- - Input dataset must be publicly accessible
596
- - Model must be publicly accessible (and not gated)
597
- - Maximum 10,000 samples per dataset
598
- - Maximum of 8192 generated tokens
 
 
 
 
 
 
 
599
  """)
600
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
601
  with gr.Tabs():
602
  with gr.TabItem("Generate Synthetic Data"):
603
  with gr.Row():
@@ -614,16 +653,16 @@ def main():
614
  )
615
  # model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
616
  with gr.Row():
617
- system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.")
618
  gr.Markdown("### Generation Parameters")
619
  with gr.Row():
620
  with gr.Column():
621
  with gr.Row():
622
- max_tokens = gr.Slider(label="Max Tokens", value=1024, minimum=256, maximum=MAX_TOKENS, step=256)
623
- temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.1)
624
  with gr.Row():
625
- top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5)
626
- top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
627
 
628
  with gr.Column():
629
  with gr.Group():
@@ -632,7 +671,7 @@ def main():
632
  user_limit_info = gr.Markdown(value="πŸ‘€ **Anonymous User**: You can generate up to 100 samples per request. Use the sign-in button above for PRO benefits (10,000 samples).", visible=True)
633
  with gr.Row():
634
  with gr.Column():
635
- input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1")
636
  load_info_btn = gr.Button("πŸ“Š Load Dataset Info", size="sm", variant="secondary")
637
  load_info_status = gr.Markdown("", visible=True)
638
 
 
119
  "recommended_max_tokens": recommended_max_tokens
120
  }
121
  else:
122
+ params = dict(default_params)
123
+ params["max_position_embeddings"] = max_position_embeddings
124
+ params["recommended_max_tokens"] = recommended_max_tokens
 
 
 
 
 
 
 
 
125
 
126
  # Ensure parameters are within valid ranges
127
  params["max_tokens"] = max(256, min(params["max_tokens"], MAX_TOKENS, params["recommended_max_tokens"]))
 
272
  # This is currently not supported, the output dataset will be created under the org 'synthetic-data-universe'
273
  # check output_dataset name is valid
274
  if request.output_dataset_name.count("/") != 1:
275
+ raise Exception("Output dataset will be populated automatically. The dataset will be created under the org 'synthetic-data-universe/my-dataset'.")
276
 
277
  # check the output dataset is valid and accessible with the provided token
278
  try:
 
412
  dataset_base_name = dataset_name.split('/')[-1] if '/' in dataset_name else dataset_name
413
 
414
  # Extract model short name (e.g., "Qwen/Qwen3-4B-Instruct-2507" -> "qwen3-4b")
415
+ model_short_name = model_name.split('/')[-1]
416
  # Remove common suffixes and simplify
 
 
 
 
 
417
 
418
  # Build the output name: username-model-dataset
419
  suggested_output_name = f"{username}-{model_short_name}-{dataset_base_name}"
 
495
 
496
  supabase.table("gen-requests").insert(data).execute()
497
  except Exception as e:
498
+ raise Exception(f"Failed to add request to database: {str(e)}")
499
 
500
 
501
 
 
564
  gr.Markdown("# Synthetic Data Generation Request")
565
  with gr.Row():
566
  gr.Markdown("""
567
+ πŸš€ **Generate high-quality synthetic data using state-of-the-art language models!** Perfect for training datasets, data augmentation, and research experiments.
568
+
569
+ ✨ **Features:**
570
+ - πŸ†“ **Free for PRO users** - Uses idle GPUs on the HF science cluster
571
+ - πŸ€– **20+ Popular Models** - Including Qwen, Llama, Mistral, and more
572
+ - ⚑ **Fast Processing** - Optimized for batch generation
573
+ - πŸ“Š **Up to 10K samples** - For PRO users (100 for free users)
574
+
575
+ ⚠️ **Important:** All generated datasets are **PUBLIC** and available under [synthetic-data-universe](https://huggingface.co/synthetic-data-universe).
576
  """)
577
+ with gr.Accordion("πŸ“– Complete Usage Guide", open=False):
578
  with gr.Row():
579
  gr.Markdown("""
580
+ **πŸ”„ Step-by-Step Process:**
581
+ 1. **πŸ“‚ Load Dataset**: Enter a Hugging Face dataset name (e.g., `simplescaling/s1K-1.1`)
582
+ 2. **πŸ“Š Load Info**: Click "πŸ“Š Load Dataset Info" to populate configs, columns, and splits
583
+ 3. **πŸ€– Choose Model**: Select from 20+ popular instruction-tuned models
584
+ 4. **βš™οΈ Configure**: Set generation parameters (temperature, tokens, etc.)
585
+ 5. **πŸš€ Submit**: Click submit and monitor progress in the Statistics tab
586
+
587
+ **πŸ’‘ Pro Tips:**
588
+ - Use temperature 0.7-1.0 for creative tasks, 0.1-0.3 for factual content
589
+ - Start with fewer samples to test your prompt before scaling up
590
+ - Check existing datasets in [synthetic-data-universe](https://huggingface.co/synthetic-data-universe) for inspiration
591
  """)
592
+ gr.Markdown("""
593
+ **πŸ“‹ Requirements & Limits:**
594
+ - βœ… Input dataset must be **publicly accessible** on HF Hub
595
+ - βœ… Model must be **publicly accessible** (not gated)
596
+ - πŸ“Š **Sample Limits:**
597
+ - πŸ†“ Free users: 100 samples max
598
+ - ⭐ PRO users: 10,000 samples max
599
+ - πŸ”€ **Token Limit:** 8,192 generated tokens per sample
600
+ - ⏱️ **Processing Time:** Varies by model size and queue status
601
+
602
+ **πŸ”’ Privacy & Usage:**
603
+ - All outputs are **PUBLIC** on Hugging Face Hub
604
+ - Datasets appear under `synthetic-data-universe` organization
605
+ - Perfect for research, training data, and open-source projects
606
  """)
607
+
608
+ with gr.Accordion("πŸ’‘ Examples & Use Cases", open=False):
609
+ gr.Markdown("""
610
+ **🎯 Popular Use Cases:**
611
+
612
+ **πŸ“š Educational Content Generation**
613
+ - Input: Questions dataset β†’ Output: Detailed explanations and answers
614
+ - Models: `Qwen/Qwen3-4B-Instruct-2507` or `microsoft/Phi-3.5-mini-instruct`
615
+ - Temperature: 0.3-0.5 for factual accuracy
616
+
617
+ **πŸ’¬ Conversational Data**
618
+ - Input: Conversation starters β†’ Output: Multi-turn dialogues
619
+ - Models: `meta-llama/Llama-3.2-3B-Instruct` or `mistralai/Mistral-7B-Instruct-v0.3`
620
+ - Temperature: 0.7-0.9 for natural variety
621
+
622
+ **πŸ”§ Code Generation**
623
+ - Input: Problem descriptions β†’ Output: Code solutions with explanations
624
+ - Models: `Qwen/Qwen2.5-Coder-3B-Instruct` or `deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct`
625
+ - Temperature: 0.1-0.3 for accurate code
626
+
627
+ **πŸ“– Creative Writing**
628
+ - Input: Story prompts β†’ Output: Creative narratives
629
+ - Models: `meta-llama/Llama-3.2-3B-Instruct` or `mistralai/Mistral-7B-Instruct-v0.3`
630
+ - Temperature: 0.8-1.2 for creativity
631
+
632
+ **πŸ“Š Example Dataset Names to Try:**
633
+ ```
634
+ simplescaling/s1K-1.1 # Simple Q&A pairs
635
+ HuggingFaceH4/ultrachat_200k # Conversations
636
+ iamtarun/python_code_instructions_18k_alpaca # Code tasks
637
+ ```
638
+ """)
639
+
640
  with gr.Tabs():
641
  with gr.TabItem("Generate Synthetic Data"):
642
  with gr.Row():
 
653
  )
654
  # model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
655
  with gr.Row():
656
+ system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.", info="Sets the AI's role/behavior. Leave empty for default model behavior.")
657
  gr.Markdown("### Generation Parameters")
658
  with gr.Row():
659
  with gr.Column():
660
  with gr.Row():
661
+ max_tokens = gr.Slider(label="Max Tokens", value=1024, minimum=256, maximum=MAX_TOKENS, step=256, info="Maximum tokens to generate per sample. Higher = longer responses.")
662
+ temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.1, info="Creativity level: 0.1=focused, 0.7=balanced, 1.0+=creative")
663
  with gr.Row():
664
+ top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5, info="Limits word choices to top K options. Lower = more focused.")
665
+ top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05, info="Nucleus sampling: 0.9=focused, 0.95=balanced diversity")
666
 
667
  with gr.Column():
668
  with gr.Group():
 
671
  user_limit_info = gr.Markdown(value="πŸ‘€ **Anonymous User**: You can generate up to 100 samples per request. Use the sign-in button above for PRO benefits (10,000 samples).", visible=True)
672
  with gr.Row():
673
  with gr.Column():
674
+ input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1", info="Public HF dataset with prompts to generate from")
675
  load_info_btn = gr.Button("πŸ“Š Load Dataset Info", size="sm", variant="secondary")
676
  load_info_status = gr.Markdown("", visible=True)
677