Commit 05dfa56
Parent(s): 1b57635
feat: add Groq provider models and show provider info in UI
- Add GPT-OSS-20B, GPT-OSS-120B, and Llama-4-Scout-17B models via Groq provider
- Update models_registry.py to support Groq provider with chat.completions.create
- Add provider information to result dictionary in evaluator.py
- Display provider info in both Evaluate and Global Leaderboard tabs
- Group leaderboard by model_name and provider for accurate comparison
- Enable comparison of same model across different providers
Files changed:
- app.py (+7, -4)
- config/models.yaml (+31, -1)
- src/evaluator.py (+1, -0)
- src/models_registry.py (+1, -1)
app.py CHANGED

@@ -66,8 +66,11 @@ class LeaderboardManager:
             # Group by model and calculate averages
             numeric_columns = ['composite_score', 'correctness_exact', 'result_match_f1', 'exec_success', 'latency_ms']
 
-            # Calculate averages for numeric columns
-            model_aggregated = self.leaderboard.groupby('model_name')[numeric_columns].mean().reset_index()
+            # Calculate averages for numeric columns, keeping provider info
+            model_aggregated = self.leaderboard.groupby(['model_name', 'provider'])[numeric_columns].mean().reset_index()
+
+            # Create combined model name with provider
+            model_aggregated['model_display'] = model_aggregated['model_name'] + ' (' + model_aggregated['provider'] + ')'
 
             # Sort by composite score (descending) to get proper ranking
             model_aggregated = model_aggregated.sort_values('composite_score', ascending=False).reset_index(drop=True)
@@ -82,7 +85,7 @@ class LeaderboardManager:
             leaderboard_config = config_loader.get_leaderboard_config()
             column_mapping = {
                 'Rank': 'rank',
-                'Model': 'model_name',
+                'Model': 'model_display',
                 'Composite Score': 'composite_score',
                 'Correctness': 'correctness_exact',
                 'Result F1': 'result_match_f1',
@@ -243,7 +246,7 @@ def run_evaluation(dataset_name: str, dialect: str, case_selection: str,
                 # Format for display using config
                 results.append([
                     len(results) + 1,  # Rank (1-based)
-                    model_name,
+                    f"{model_name} ({result['provider']})",  # Include provider in model name
                     formatting["composite_score"].format(result['composite_score']),
                     formatting["correctness_exact"].format(result['correctness_exact']),
                     formatting["result_match_f1"].format(result['result_match_f1']),
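For readers skimming the diff, here is a minimal standalone sketch (not part of the commit) of what the new two-key groupby does; the rows and scores below are made up, but the column names match the code above:

    # Sketch: same model from different providers stays separate after aggregation.
    import pandas as pd

    # Hypothetical leaderboard rows; column names follow the diff above.
    leaderboard = pd.DataFrame([
        {"model_name": "GPT-OSS-20B", "provider": "groq",   "composite_score": 0.81, "latency_ms": 420.0},
        {"model_name": "GPT-OSS-20B", "provider": "groq",   "composite_score": 0.79, "latency_ms": 450.0},
        {"model_name": "GPT-OSS-20B", "provider": "nebius", "composite_score": 0.80, "latency_ms": 910.0},
    ])

    numeric_columns = ["composite_score", "latency_ms"]
    agg = leaderboard.groupby(["model_name", "provider"])[numeric_columns].mean().reset_index()
    agg["model_display"] = agg["model_name"] + " (" + agg["provider"] + ")"
    print(agg)
    # "GPT-OSS-20B (groq)" and "GPT-OSS-20B (nebius)" come out as separate rows,
    # which is what enables comparing the same model across providers.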
    	
config/models.yaml CHANGED

@@ -37,4 +37,34 @@ models:
       max_new_tokens: 256
       temperature: 0.1
       top_p: 0.9
-    description: "DeepSeek-R1 - DeepSeek's reasoning model via Together AI"
+    description: "DeepSeek-R1 - DeepSeek's reasoning model via Together AI"
+
+  # GPT-OSS-20B with Groq Provider
+  - name: "GPT-OSS-20B (Groq)"
+    provider: "groq"
+    model_id: "openai/gpt-oss-20b"
+    params:
+      max_new_tokens: 256
+      temperature: 0.1
+      top_p: 0.9
+    description: "GPT-OSS-20B - OpenAI's 20B parameter model via Groq"
+
+  # GPT-OSS-120B with Groq Provider
+  - name: "GPT-OSS-120B (Groq)"
+    provider: "groq"
+    model_id: "openai/gpt-oss-120b"
+    params:
+      max_new_tokens: 256
+      temperature: 0.1
+      top_p: 0.9
+    description: "GPT-OSS-120B - OpenAI's 120B parameter model via Groq"
+
+  # Llama-4-Scout-17B with Groq Provider
+  - name: "Llama-4-Scout-17B (Groq)"
+    provider: "groq"
+    model_id: "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+    params:
+      max_new_tokens: 256
+      temperature: 0.1
+      top_p: 0.9
+    description: "Llama-4-Scout-17B - Meta's latest multimodal model via Groq"
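As a quick illustration (not part of the commit), the new entries can be read back with PyYAML; the loop below only assumes the models: list structure visible in the diff:

    # Sketch: list provider/model pairs from the registry config.
    import yaml

    with open("config/models.yaml") as f:
        config = yaml.safe_load(f)

    for model in config["models"]:
        print(model["name"], "->", model["provider"], model["model_id"])
    # e.g. GPT-OSS-20B (Groq) -> groq openai/gpt-oss-20b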
    	
src/evaluator.py CHANGED

@@ -359,6 +359,7 @@ class Evaluator:
 
         return {
             'model_name': model_name,
+            'provider': model_config.provider,
             'dataset_name': dataset_name,
             'case_id': case_id,
             'dialect': dialect,
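This one field is what ties the change together: every stored result row now carries the serving provider, which the groupby(['model_name', 'provider']) in app.py above depends on to keep the same model separate across providers.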
    	
src/models_registry.py CHANGED

@@ -86,7 +86,7 @@ class HuggingFaceInference:
                 )
 
                 # Use different methods based on provider capabilities
-                if provider == "nebius" or provider == "together":
+                if provider == "nebius" or provider == "together" or provider == "groq":
                     # Nebius provider only supports conversational tasks, use chat completion
                     completion = client.chat.completions.create(
                         model=model_id,
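For context, a minimal standalone sketch of the chat-completion path this branch now takes for Groq; it assumes the class wraps huggingface_hub.InferenceClient with inference-provider support (the commit shows only the dispatch line, not the client setup), and the prompt and env var name are illustrative:

    # Sketch (assumption: client is a huggingface_hub.InferenceClient; the
    # commit does not show its construction).
    import os
    from huggingface_hub import InferenceClient

    client = InferenceClient(provider="groq", api_key=os.environ["HF_TOKEN"])

    completion = client.chat.completions.create(
        model="openai/gpt-oss-20b",  # model_id from config/models.yaml
        messages=[{"role": "user", "content": "Write a SQL query that counts rows in t."}],
        max_tokens=256,              # mirrors max_new_tokens: 256 in the config
        temperature=0.1,
        top_p=0.9,
    )
    print(completion.choices[0].message.content)

Note that the unchanged context comment still says "Nebius provider only supports conversational tasks" even though the branch now also covers Together and Groq; a follow-up could generalize that comment.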