import torch
from datasets import load_dataset
import traceback
import time


def evaluate_tsac_sentiment(model, tokenizer, device, num_samples=10, batch_size=16):
    """Evaluate a model on the TSAC sentiment analysis task.

    Loads the ``test`` split of ``fbougares/tsac``, tokenizes its ``sentence``
    column, runs the model under ``torch.no_grad()`` and compares the arg-max
    of the logits against the ``target`` column.

    Args:
        model: Module exposing ``eval()``, ``parameters()`` and callable with
            ``input_ids`` / ``attention_mask`` keyword arguments. Its output
            may be a mapping holding ``logits`` or ``prediction_logits``, a
            tuple whose first element is the logits, or the logits tensor.
        tokenizer: Callable taking a list of sentences plus ``padding``,
            ``truncation``, ``max_length`` and ``return_tensors`` keyword
            arguments and returning at least ``input_ids`` and
            ``attention_mask``.
        device: Device the input tensors are moved to before the forward
            pass. The model itself is not moved by this function.
        num_samples: How many leading examples of the test split to evaluate.
            Clamped to the split size; ``None`` evaluates the whole split.
        batch_size: Number of examples per forward pass.

    Returns:
        A dict mapping ``"fbougares/tsac"`` to the accuracy as a float.

    Raises:
        ValueError: If no examples are selected, or if the model returns a
            mapping without a ``logits`` / ``prediction_logits`` entry.
        Exception: Any other failure is printed together with its traceback
            and then re-raised unchanged.
    """
    from torch.utils.data import DataLoader

    try:
        print("\n=== Starting TSAC sentiment evaluation ===")
        print(f"Current device: {device}")

        # Load and preprocess dataset
        print("\nLoading and preprocessing TSAC dataset...")
        # NOTE(security): trust_remote_code=True executes the dataset's
        # loading script fetched from the Hub; only keep it for trusted repos.
        dataset = load_dataset("fbougares/tsac", split="test", trust_remote_code=True)
        if num_samples is not None:
            # Clamp so a request larger than the split does not raise.
            dataset = dataset.select(range(min(num_samples, len(dataset))))
        if len(dataset) == 0:
            raise ValueError("No TSAC examples selected for evaluation")

        def preprocess(examples):
            return tokenizer(
                examples['sentence'],
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors=None,
            )

        print(dataset.column_names)
        # padding=True pads to the longest sequence of each *map* batch, so
        # the whole subset is tokenized as one batch; otherwise rows coming
        # from different map batches could have different lengths and
        # torch.stack in collate_fn would fail.
        dataset = dataset.map(preprocess, batched=True, batch_size=len(dataset))
        dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'target'])

        # Check first example
        first_example = dataset[0]
        print("\nFirst example details:")
        print(f"Input IDs shape: {first_example['input_ids'].shape}")
        print(f"Attention mask shape: {first_example['attention_mask'].shape}")
        print(f"Target: {first_example['target']}")

        model.eval()
        print(f"\nModel class: {model.__class__.__name__}")
        print(f"Model device: {next(model.parameters()).device}")

        def collate_fn(batch):
            # Stack the per-example tensors of each column into one tensor.
            return {
                key: torch.stack([sample[key] for sample in batch])
                for key in ('input_ids', 'attention_mask', 'target')
            }

        dataloader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=False,
            collate_fn=collate_fn,
        )

        predictions = []
        targets = []

        with torch.no_grad():
            for i, batch in enumerate(dataloader):
                # Only every 10th batch is logged to keep the output short.
                log_batch = i % 10 == 0
                if log_batch:
                    print(f"\nProcessing batch {i}...")
                    print(f"Batch keys: {list(batch.keys())}")
                    print(f"Target shape: {batch['target'].shape}")

                inputs = {k: v.to(device) for k, v in batch.items() if k != 'target'}
                target = batch['target'].to(device)
                outputs = model(**inputs)

                # Handle different model output formats
                if isinstance(outputs, dict):
                    if 'logits' in outputs:
                        logits = outputs['logits']
                    elif 'prediction_logits' in outputs:
                        logits = outputs['prediction_logits']
                    else:
                        raise ValueError(f"Unknown output format. Available keys: {list(outputs.keys())}")
                elif isinstance(outputs, tuple):
                    print(f"Output tuple length: {len(outputs)}")
                    logits = outputs[0]
                else:
                    logits = outputs

                # A 3-D tensor is treated as [batch, sequence, classes] and
                # reduced to the first position (presumably the [CLS] token —
                # confirm for the model in use).
                if len(logits.shape) == 3:
                    logits = logits[:, 0, :]

                batch_predictions = logits.argmax(dim=-1).cpu().tolist()
                batch_targets = target.cpu().tolist()

                predictions.extend(batch_predictions)
                targets.extend(batch_targets)

                if log_batch:
                    print(f"\nBatch {i} predictions:")
                    print(f"Predictions: {batch_predictions[:5]}")
                    print(f"Targets: {batch_targets[:5]}")

        print(f"\nTotal predictions: {len(predictions)}")
        print(f"Total targets: {len(targets)}")

        # Calculate accuracy
        correct = sum(p == t for p, t in zip(predictions, targets))
        total = len(predictions)
        accuracy = correct / total if total > 0 else 0.0

        print("\nEvaluation results:")
        print(f"Correct predictions: {correct}")
        print(f"Total predictions: {total}")
        print(f"Accuracy: {accuracy:.4f}")

        return {"fbougares/tsac": accuracy}
    except Exception as e:
        print(f"\n=== Error in TSAC evaluation: {str(e)} ===")
        print(f"Full traceback: {traceback.format_exc()}")
        # Bare raise re-raises the active exception with its traceback intact.
        raise