import streamlit as st
import pandas as pd
import numpy as np


def process_with_smolagents(dataset, operation, custom_code=None):
    """
    Process a dataset using smolagents for various operations.

    Args:
        dataset: Pandas DataFrame to process
        operation: Type of processing operation
        custom_code: Custom code to execute (for custom processing)

    Returns:
        Processed pandas DataFrame
    """
    if dataset is None:
        raise ValueError("No dataset provided")

    # Create a copy to avoid modifying the original
    processed_df = dataset.copy()

    try:
        if operation == "Data Cleaning":
            processed_df = clean_dataset(processed_df)
        elif operation == "Feature Engineering":
            processed_df = engineer_features(processed_df)
        elif operation == "Data Transformation":
            processed_df = transform_dataset(processed_df)
        elif operation == "Custom Processing" and custom_code:
            # Execute the custom code with the working frame bound to `df`.
            # Note: exec() on user-supplied code is a security risk in a real
            # application and should be replaced with a safer approach.
            local_vars = {"df": processed_df}
            exec(custom_code, {"pd": pd, "np": np}, local_vars)
            processed_df = local_vars["df"]
        else:
            raise ValueError(f"Unsupported operation: {operation}")
        return processed_df
    except Exception as e:
        st.error(f"Error during processing: {str(e)}")
        raise
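

# Usage sketch for "Custom Processing" (illustrative, not from the original
# app): the snippet runs with the working frame bound to `df` and the `pd`/`np`
# modules in scope, and must leave its result bound to `df`. The sample frame
# and snippet below are assumptions for demonstration:
#
#   sample = pd.DataFrame({"price": [10.0, None, 30.0]})
#   result = process_with_smolagents(
#       sample,
#       "Custom Processing",
#       custom_code="df = df.assign(price_cents=df['price'] * 100)",
#   )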


def clean_dataset(df):
    """
    Clean the dataset by handling missing values, duplicates, and outliers.

    Args:
        df: Pandas DataFrame to clean

    Returns:
        Cleaned pandas DataFrame
    """
    # Create a copy to avoid modifying the original
    cleaned_df = df.copy()

    # Remove duplicate rows
    cleaned_df = cleaned_df.drop_duplicates()

    # Handle missing values
    for col in cleaned_df.columns:
        # For numeric columns
        if pd.api.types.is_numeric_dtype(cleaned_df[col]):
            # If more than 20% missing, leave as is
            if cleaned_df[col].isna().mean() > 0.2:
                continue
            # Otherwise impute with the median
            cleaned_df[col] = cleaned_df[col].fillna(cleaned_df[col].median())
        # For categorical columns
        elif pd.api.types.is_object_dtype(cleaned_df[col]):
            # If more than 20% missing, leave as is
            if cleaned_df[col].isna().mean() > 0.2:
                continue
            # Otherwise impute with the mode
            mode_value = cleaned_df[col].mode()[0] if not cleaned_df[col].mode().empty else "Unknown"
            cleaned_df[col] = cleaned_df[col].fillna(mode_value)

    # Handle outliers in numeric columns
    for col in cleaned_df.select_dtypes(include=[np.number]).columns:
        # Skip if too many missing values
        if cleaned_df[col].isna().mean() > 0.1:
            continue
        # Calculate the IQR
        q1 = cleaned_df[col].quantile(0.25)
        q3 = cleaned_df[col].quantile(0.75)
        iqr = q3 - q1
        # Define bounds
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        # Cap outliers instead of removing them
        cleaned_df[col] = cleaned_df[col].clip(lower_bound, upper_bound)

    return cleaned_df
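

# Worked example on hypothetical data: for x = [1, 2, 3, 1000], pandas' default
# linear interpolation gives q1 = 1.75 and q3 = 252.25, so the 1000 is capped
# at q3 + 1.5 * IQR = 628.0 rather than the row being dropped:
#
#   clean_dataset(pd.DataFrame({"x": [1.0, 2.0, 3.0, 1000.0]}))["x"].max()
#   # -> 628.0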


def engineer_features(df):
    """
    Perform basic feature engineering on the dataset.

    Args:
        df: Pandas DataFrame to process

    Returns:
        DataFrame with engineered features
    """
    # Create a copy to avoid modifying the original
    engineered_df = df.copy()

    # Get numeric columns
    numeric_cols = engineered_df.select_dtypes(include=[np.number]).columns

    # Create interaction features only if there are at least 2 numeric columns
    if len(numeric_cols) >= 2:
        # Build pairwise interactions, limited to the first 5 columns
        # to avoid feature explosion
        for i, col1 in enumerate(numeric_cols[:5]):
            for col2 in numeric_cols[i + 1:5]:
                # Product interaction
                engineered_df[f"{col1}_{col2}_product"] = engineered_df[col1] * engineered_df[col2]
                # Ratio interaction (avoid division by zero)
                denominator = engineered_df[col2].replace(0, np.nan)
                engineered_df[f"{col1}_{col2}_ratio"] = engineered_df[col1] / denominator

    # Create binary features from categorical columns
    cat_cols = engineered_df.select_dtypes(include=['object', 'category']).columns
    for col in cat_cols:
        # Skip if too many unique values (>10)
        if engineered_df[col].nunique() > 10:
            continue
        # One-hot encode
        dummies = pd.get_dummies(engineered_df[col], prefix=col, drop_first=True)
        engineered_df = pd.concat([engineered_df, dummies], axis=1)

    # Create aggregated features
    if len(numeric_cols) >= 3:
        # Sum of all numeric features
        engineered_df['sum_numeric'] = engineered_df[numeric_cols].sum(axis=1)
        # Mean of all numeric features
        engineered_df['mean_numeric'] = engineered_df[numeric_cols].mean(axis=1)
        # Standard deviation of numeric features
        engineered_df['std_numeric'] = engineered_df[numeric_cols].std(axis=1)

    return engineered_df
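

# Naming sketch on a hypothetical frame: numeric columns "a" and "b" yield
# "a_b_product" and "a_b_ratio"; a two-valued categorical "c" with values
# {"x", "y"} yields a single dummy column "c_y" because drop_first=True:
#
#   engineer_features(pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": ["x", "y"]}))
#   # columns: a, b, c, a_b_product, a_b_ratio, c_y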


def transform_dataset(df):
    """
    Perform data transformations on the dataset.

    Args:
        df: Pandas DataFrame to transform

    Returns:
        Transformed pandas DataFrame
    """
    from sklearn.preprocessing import StandardScaler, MinMaxScaler

    # Create a copy to avoid modifying the original
    transformed_df = df.copy()

    # Get numeric columns
    numeric_cols = transformed_df.select_dtypes(include=[np.number]).columns

    if len(numeric_cols) > 0:
        # Create scaled versions of the numeric columns.
        # Note: scikit-learn scalers reject NaN values, so numeric columns
        # should be imputed first (e.g., via clean_dataset).

        # Standard scaling (z-score)
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(transformed_df[numeric_cols])
        scaled_df = pd.DataFrame(
            scaled_data,
            columns=[f"{col}_scaled" for col in numeric_cols],
            index=transformed_df.index
        )

        # Min-max scaling (0-1 range)
        minmax_scaler = MinMaxScaler()
        minmax_data = minmax_scaler.fit_transform(transformed_df[numeric_cols])
        minmax_df = pd.DataFrame(
            minmax_data,
            columns=[f"{col}_normalized" for col in numeric_cols],
            index=transformed_df.index
        )

        # Log transform (for strictly positive columns only)
        log_cols = []
        for col in numeric_cols:
            if (transformed_df[col] > 0).all():
                transformed_df[f"{col}_log"] = np.log(transformed_df[col])
                log_cols.append(f"{col}_log")

        # Combine all transformations
        transformed_df = pd.concat([transformed_df, scaled_df, minmax_df], axis=1)

    # One-hot encode categorical columns
    cat_cols = transformed_df.select_dtypes(include=['object', 'category']).columns
    if len(cat_cols) > 0:
        # One-hot encode all categorical columns
        transformed_df = pd.get_dummies(transformed_df, columns=cat_cols, drop_first=False)

    return transformed_df
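

if __name__ == "__main__":
    # Smoke test with a synthetic frame (an illustrative assumption, not part
    # of the original app). Cleaning runs first so the scikit-learn scalers in
    # transform_dataset see no NaNs, which they would otherwise reject.
    demo = pd.DataFrame({
        "age": [25, 32, 47, 51, 38],
        "income": [40_000.0, 52_000.0, 61_000.0, 1_000_000.0, None],
        "city": ["NY", "LA", "NY", "SF", None],
    })
    cleaned = process_with_smolagents(demo, "Data Cleaning")
    print(process_with_smolagents(cleaned, "Feature Engineering").head())
    print(process_with_smolagents(cleaned, "Data Transformation").head())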