edbeeching committed on
Commit
30e16b4
Β·
1 Parent(s): 174a6bc

improving docs

Browse files
Files changed (1) hide show
  1. app.py +80 -41
app.py CHANGED
@@ -119,17 +119,9 @@ def fetch_model_generation_params(model_name: str) -> dict:
119
  "recommended_max_tokens": recommended_max_tokens
120
  }
121
  else:
122
- # Use model-specific defaults based on model name
123
- if "qwen" in model_name.lower():
124
- params = {"max_tokens": recommended_max_tokens, "temperature": 0.7, "top_k": 50, "top_p": 0.8, "max_position_embeddings": max_position_embeddings, "recommended_max_tokens": recommended_max_tokens}
125
- elif "llama" in model_name.lower():
126
- params = {"max_tokens": recommended_max_tokens, "temperature": 0.6, "top_k": 40, "top_p": 0.9, "max_position_embeddings": max_position_embeddings, "recommended_max_tokens": recommended_max_tokens}
127
- elif "ernie" in model_name.lower():
128
- params = {"max_tokens": min(recommended_max_tokens, 1024), "temperature": 0.7, "top_k": 50, "top_p": 0.95, "max_position_embeddings": max_position_embeddings, "recommended_max_tokens": recommended_max_tokens}
129
- else:
130
- params = dict(default_params)
131
- params["max_position_embeddings"] = max_position_embeddings
132
- params["recommended_max_tokens"] = recommended_max_tokens
133
 
134
  # Ensure parameters are within valid ranges
135
  params["max_tokens"] = max(256, min(params["max_tokens"], MAX_TOKENS, params["recommended_max_tokens"]))
@@ -280,7 +272,7 @@ def validate_request(request: GenerationRequest, oauth_token: Optional[Union[gr.
280
  # This is currently not supported, the output dataset will be created under the org 'synthetic-data-universe'
281
  # check output_dataset name is valid
282
  if request.output_dataset_name.count("/") != 1:
283
- raise Exception("Output dataset name must be in the format 'dataset_name', e.g., 'my-dataset'. The dataset will be created under the org 'synthetic-data-universe/my-dataset'.")
284
 
285
  # check the output dataset is valid and accessible with the provided token
286
  try:
@@ -420,13 +412,8 @@ def load_dataset_info(dataset_name, model_name, oauth_token=None, dataset_token=
420
  dataset_base_name = dataset_name.split('/')[-1] if '/' in dataset_name else dataset_name
421
 
422
  # Extract model short name (e.g., "Qwen/Qwen3-4B-Instruct-2507" -> "qwen3-4b")
423
- model_short_name = model_name.split('/')[-1].lower()
424
  # Remove common suffixes and simplify
425
- model_short_name = model_short_name.replace('-instruct', '').replace('-2507', '').replace('_', '-')
426
- # Take first part if it's still long
427
- if len(model_short_name) > 15:
428
- parts = model_short_name.split('-')
429
- model_short_name = '-'.join(parts[:2]) if len(parts) > 1 else parts[0][:15]
430
 
431
  # Build the output name: username-model-dataset
432
  suggested_output_name = f"{username}-{model_short_name}-{dataset_base_name}"
@@ -508,7 +495,7 @@ def add_request_to_db(request: GenerationRequest):
508
 
509
  supabase.table("gen-requests").insert(data).execute()
510
  except Exception as e:
511
- raise Exception("Failed to add request to database")
512
 
513
 
514
 
@@ -577,27 +564,79 @@ def main():
577
  gr.Markdown("# Synthetic Data Generation Request")
578
  with gr.Row():
579
  gr.Markdown("""
580
- Welcome to the Synthetic Data Generation service! This tool allows you to generate synthetic data using large language models. Generation is FREE for Hugging Face PRO users and uses idle GPUs on the HF science cluster.\n
581
- Outputs from this service will be PUBLIC and available on the Hugging Face Hub under the organization [synthetic-data-universe](https://huggingface.co/synthetic-data-universe).\n
 
 
 
 
 
 
 
582
  """)
583
- with gr.Accordion("More Information", open=False):
584
  with gr.Row():
585
  gr.Markdown("""
586
- **How it works:**
587
- 1. Provide an input dataset with prompts
588
- 2. Select a public language model for generation
589
- 3. Configure generation parameters
590
- 4. Submit your request.
 
 
 
 
 
 
591
  """)
592
- gr.Markdown("""
593
-
594
- **Requirements:**
595
- - Input dataset must be publicly accessible
596
- - Model must be publicly accessible (and not gated)
597
- - Maximum 10,000 samples per dataset
598
- - Maximum of 8192 generated tokens
 
 
 
 
 
 
 
599
  """)
600
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
601
  with gr.Tabs():
602
  with gr.TabItem("Generate Synthetic Data"):
603
  with gr.Row():
@@ -614,16 +653,16 @@ def main():
614
  )
615
  # model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
616
  with gr.Row():
617
- system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.")
618
  gr.Markdown("### Generation Parameters")
619
  with gr.Row():
620
  with gr.Column():
621
  with gr.Row():
622
- max_tokens = gr.Slider(label="Max Tokens", value=1024, minimum=256, maximum=MAX_TOKENS, step=256)
623
- temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.1)
624
  with gr.Row():
625
- top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5)
626
- top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
627
 
628
  with gr.Column():
629
  with gr.Group():
@@ -632,7 +671,7 @@ def main():
632
  user_limit_info = gr.Markdown(value="πŸ‘€ **Anonymous User**: You can generate up to 100 samples per request. Use the sign-in button above for PRO benefits (10,000 samples).", visible=True)
633
  with gr.Row():
634
  with gr.Column():
635
- input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1")
636
  load_info_btn = gr.Button("πŸ“Š Load Dataset Info", size="sm", variant="secondary")
637
  load_info_status = gr.Markdown("", visible=True)
638
 
 
119
  "recommended_max_tokens": recommended_max_tokens
120
  }
121
  else:
122
+ params = dict(default_params)
123
+ params["max_position_embeddings"] = max_position_embeddings
124
+ params["recommended_max_tokens"] = recommended_max_tokens
 
 
 
 
 
 
 
 
125
 
126
  # Ensure parameters are within valid ranges
127
  params["max_tokens"] = max(256, min(params["max_tokens"], MAX_TOKENS, params["recommended_max_tokens"]))
 
272
  # This is currently not supported, the output dataset will be created under the org 'synthetic-data-universe'
273
  # check output_dataset name is valid
274
  if request.output_dataset_name.count("/") != 1:
275
+ raise Exception("Output dataset will be populated automatically. The dataset will be created under the org 'synthetic-data-universe/my-dataset'.")
276
 
277
  # check the output dataset is valid and accessible with the provided token
278
  try:
 
412
  dataset_base_name = dataset_name.split('/')[-1] if '/' in dataset_name else dataset_name
413
 
414
  # Extract model short name (e.g., "Qwen/Qwen3-4B-Instruct-2507" -> "qwen3-4b")
415
+ model_short_name = model_name.split('/')[-1]
416
  # Remove common suffixes and simplify
 
 
 
 
 
417
 
418
  # Build the output name: username-model-dataset
419
  suggested_output_name = f"{username}-{model_short_name}-{dataset_base_name}"
 
495
 
496
  supabase.table("gen-requests").insert(data).execute()
497
  except Exception as e:
498
+ raise Exception(f"Failed to add request to database: {str(e)}")
499
 
500
 
501
 
 
564
  gr.Markdown("# Synthetic Data Generation Request")
565
  with gr.Row():
566
  gr.Markdown("""
567
+ πŸš€ **Generate high-quality synthetic data using state-of-the-art language models!** Perfect for training datasets, data augmentation, and research experiments.
568
+
569
+ ✨ **Features:**
570
+ - πŸ†“ **Free for PRO users** - Uses idle GPUs on the HF science cluster
571
+ - πŸ€– **20+ Popular Models** - Including Qwen, Llama, Mistral, and more
572
+ - ⚑ **Fast Processing** - Optimized for batch generation
573
+ - πŸ“Š **Up to 10K samples** - For PRO users (100 for free users)
574
+
575
+ ⚠️ **Important:** All generated datasets are **PUBLIC** and available under [synthetic-data-universe](https://huggingface.co/synthetic-data-universe).
576
  """)
577
+ with gr.Accordion("πŸ“– Complete Usage Guide", open=False):
578
  with gr.Row():
579
  gr.Markdown("""
580
+ **πŸ”„ Step-by-Step Process:**
581
+ 1. **πŸ“‚ Load Dataset**: Enter a Hugging Face dataset name (e.g., `simplescaling/s1K-1.1`)
582
+ 2. **πŸ“Š Load Info**: Click "πŸ“Š Load Dataset Info" to populate configs, columns, and splits
583
+ 3. **πŸ€– Choose Model**: Select from 20+ popular instruction-tuned models
584
+ 4. **βš™οΈ Configure**: Set generation parameters (temperature, tokens, etc.)
585
+ 5. **πŸš€ Submit**: Click submit and monitor progress in the Statistics tab
586
+
587
+ **πŸ’‘ Pro Tips:**
588
+ - Use temperature 0.7-1.0 for creative tasks, 0.1-0.3 for factual content
589
+ - Start with fewer samples to test your prompt before scaling up
590
+ - Check existing datasets in [synthetic-data-universe](https://huggingface.co/synthetic-data-universe) for inspiration
591
  """)
592
+ gr.Markdown("""
593
+ **πŸ“‹ Requirements & Limits:**
594
+ - βœ… Input dataset must be **publicly accessible** on HF Hub
595
+ - βœ… Model must be **publicly accessible** (not gated)
596
+ - πŸ“Š **Sample Limits:**
597
+ - πŸ†“ Free users: 100 samples max
598
+ - ⭐ PRO users: 10,000 samples max
599
+ - πŸ”€ **Token Limit:** 8,192 generated tokens per sample
600
+ - ⏱️ **Processing Time:** Varies by model size and queue status
601
+
602
+ **πŸ”’ Privacy & Usage:**
603
+ - All outputs are **PUBLIC** on Hugging Face Hub
604
+ - Datasets appear under `synthetic-data-universe` organization
605
+ - Perfect for research, training data, and open-source projects
606
  """)
607
+
608
+ with gr.Accordion("πŸ’‘ Examples & Use Cases", open=False):
609
+ gr.Markdown("""
610
+ **🎯 Popular Use Cases:**
611
+
612
+ **πŸ“š Educational Content Generation**
613
+ - Input: Questions dataset β†’ Output: Detailed explanations and answers
614
+ - Models: `Qwen/Qwen3-4B-Instruct-2507` or `microsoft/Phi-3.5-mini-instruct`
615
+ - Temperature: 0.3-0.5 for factual accuracy
616
+
617
+ **πŸ’¬ Conversational Data**
618
+ - Input: Conversation starters β†’ Output: Multi-turn dialogues
619
+ - Models: `meta-llama/Llama-3.2-3B-Instruct` or `mistralai/Mistral-7B-Instruct-v0.3`
620
+ - Temperature: 0.7-0.9 for natural variety
621
+
622
+ **πŸ”§ Code Generation**
623
+ - Input: Problem descriptions β†’ Output: Code solutions with explanations
624
+ - Models: `Qwen/Qwen2.5-Coder-3B-Instruct` or `deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct`
625
+ - Temperature: 0.1-0.3 for accurate code
626
+
627
+ **πŸ“– Creative Writing**
628
+ - Input: Story prompts β†’ Output: Creative narratives
629
+ - Models: `meta-llama/Llama-3.2-3B-Instruct` or `mistralai/Mistral-7B-Instruct-v0.3`
630
+ - Temperature: 0.8-1.2 for creativity
631
+
632
+ **πŸ“Š Example Dataset Names to Try:**
633
+ ```
634
+ simplescaling/s1K-1.1 # Simple Q&A pairs
635
+ HuggingFaceH4/ultrachat_200k # Conversations
636
+ iamtarun/python_code_instructions_18k_alpaca # Code tasks
637
+ ```
638
+ """)
639
+
640
  with gr.Tabs():
641
  with gr.TabItem("Generate Synthetic Data"):
642
  with gr.Row():
 
653
  )
654
  # model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
655
  with gr.Row():
656
+ system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.", info="Sets the AI's role/behavior. Leave empty for default model behavior.")
657
  gr.Markdown("### Generation Parameters")
658
  with gr.Row():
659
  with gr.Column():
660
  with gr.Row():
661
+ max_tokens = gr.Slider(label="Max Tokens", value=1024, minimum=256, maximum=MAX_TOKENS, step=256, info="Maximum tokens to generate per sample. Higher = longer responses.")
662
+ temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.1, info="Creativity level: 0.1=focused, 0.7=balanced, 1.0+=creative")
663
  with gr.Row():
664
+ top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5, info="Limits word choices to top K options. Lower = more focused.")
665
+ top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05, info="Nucleus sampling: 0.9=focused, 0.95=balanced diversity")
666
 
667
  with gr.Column():
668
  with gr.Group():
 
671
  user_limit_info = gr.Markdown(value="πŸ‘€ **Anonymous User**: You can generate up to 100 samples per request. Use the sign-in button above for PRO benefits (10,000 samples).", visible=True)
672
  with gr.Row():
673
  with gr.Column():
674
+ input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1", info="Public HF dataset with prompts to generate from")
675
  load_info_btn = gr.Button("πŸ“Š Load Dataset Info", size="sm", variant="secondary")
676
  load_info_status = gr.Markdown("", visible=True)
677