File size: 7,573 Bytes
3d0e99a
6e7ce87
3d0e99a
 
 
 
 
 
 
 
 
 
 
6e7ce87
 
 
 
3d0e99a
 
6e7ce87
3d0e99a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6e7ce87
3d0e99a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6e7ce87
3d0e99a
 
 
 
 
 
 
6e7ce87
3d0e99a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
# space/utils/config.py
import os
import logging
from typing import Optional
from dataclasses import dataclass, field

# Module-level logger named after this module (``__name__``), used by the
# configuration code below for warnings and load-time status messages.
logger = logging.getLogger(__name__)


class ConfigError(Exception):
    """Signal that a configuration value is invalid or could not be loaded."""


@dataclass
class AppConfig:
    """
    Application configuration, typically built from environment variables.

    Instances are validated on construction (see ``_validate``): hard errors
    raise ``ConfigError``; missing-but-optional credentials only log a
    warning so that the application can still start with reduced features.
    Use ``from_env`` to build an instance from ``os.environ`` and
    ``validate_for_features`` to check whether a particular feature can run.
    """

    # SQL Backend Configuration
    sql_backend: str = "motherduck"  # "bigquery" or "motherduck"
    gcp_project: Optional[str] = None  # only needed for the BigQuery backend
    motherduck_token: Optional[str] = None  # only needed for MotherDuck
    motherduck_db: str = "workspace"

    # Model Configuration
    # The default is a placeholder; validate_for_features() rejects it for
    # the "predict" and "explain" features.
    hf_model_repo: str = "your-org/your-model"
    hf_token: Optional[str] = None

    # Tracing Configuration
    trace_enabled: bool = True
    trace_url: Optional[str] = None

    # Feature Flags
    enable_forecasting: bool = True
    enable_explanations: bool = True

    # Performance Settings
    max_workers: int = 4
    timeout_seconds: int = 300

    # Additional settings
    log_level: str = "INFO"

    def __post_init__(self):
        """Validate configuration immediately after dataclass initialization."""
        self._validate()

    def _validate(self):
        """
        Validate configuration values.

        Raises:
            ConfigError: If ``sql_backend`` or ``log_level`` is not one of the
                accepted values, or if ``max_workers`` / ``timeout_seconds``
                is less than 1.

        Missing backend credentials and an empty ``hf_model_repo`` are only
        logged as warnings; they do not raise.
        """
        # Validate SQL backend
        valid_backends = ["bigquery", "motherduck"]
        if self.sql_backend not in valid_backends:
            raise ConfigError(
                f"Invalid sql_backend: {self.sql_backend}. "
                f"Must be one of: {valid_backends}"
            )

        # Validate backend-specific requirements (warn only, do not fail)
        if self.sql_backend == "bigquery" and not self.gcp_project:
            logger.warning("BigQuery selected but gcp_project not set")

        if self.sql_backend == "motherduck" and not self.motherduck_token:
            logger.warning("MotherDuck selected but motherduck_token not set")

        # Validate model configuration
        if not self.hf_model_repo:
            logger.warning("hf_model_repo not set - predictions/explanations will fail")

        # Validate numeric settings
        if self.max_workers < 1:
            raise ConfigError(f"max_workers must be >= 1, got {self.max_workers}")

        if self.timeout_seconds < 1:
            raise ConfigError(f"timeout_seconds must be >= 1, got {self.timeout_seconds}")

        # Validate log level (comparison is case-insensitive; the stored
        # value is left exactly as provided)
        valid_levels = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]
        if self.log_level.upper() not in valid_levels:
            raise ConfigError(
                f"Invalid log_level: {self.log_level}. "
                f"Must be one of: {valid_levels}"
            )

    @classmethod
    def from_env(cls) -> "AppConfig":
        """
        Create configuration from environment variables.

        Boolean variables are enabled only when their value is the string
        "true" (case-insensitive); any other value disables the flag.

        Environment variables:
            SQL_BACKEND: "bigquery" or "motherduck" (default: "motherduck");
                lower-cased before validation
            GCP_PROJECT: GCP project ID for BigQuery
            GCP_SERVICE_ACCOUNT_JSON: Service account credentials for BigQuery
                (not read by this method; presumably consumed elsewhere)
            MOTHERDUCK_TOKEN: MotherDuck authentication token
            MOTHERDUCK_DB: MotherDuck database name (default: "workspace")
            HF_MODEL_REPO: HuggingFace model repository (default: the
                placeholder "your-org/your-model", which
                validate_for_features() rejects for predict/explain)
            HF_TOKEN: HuggingFace API token (optional, for private repos)
            TRACE_ENABLED: Enable tracing (default: "true")
            TRACE_URL: Custom trace URL
            ENABLE_FORECASTING: Enable forecasting features (default: "true")
            ENABLE_EXPLANATIONS: Enable SHAP explanations (default: "true")
            MAX_WORKERS: Max parallel workers (default: 4)
            TIMEOUT_SECONDS: Request timeout (default: 300)
            LOG_LEVEL: Logging level (default: "INFO"); upper-cased

        Returns:
            A validated ``AppConfig`` instance.

        Raises:
            ConfigError: If a numeric variable cannot be parsed as an int,
                if validation fails, or if loading fails for any other reason.
        """
        try:
            config = cls(
                sql_backend=os.getenv("SQL_BACKEND", "motherduck").lower(),
                gcp_project=os.getenv("GCP_PROJECT"),
                motherduck_token=os.getenv("MOTHERDUCK_TOKEN"),
                motherduck_db=os.getenv("MOTHERDUCK_DB", "workspace"),
                hf_model_repo=os.getenv("HF_MODEL_REPO", "your-org/your-model"),
                hf_token=os.getenv("HF_TOKEN"),
                trace_enabled=os.getenv("TRACE_ENABLED", "true").lower() == "true",
                trace_url=os.getenv("TRACE_URL"),
                enable_forecasting=os.getenv("ENABLE_FORECASTING", "true").lower() == "true",
                enable_explanations=os.getenv("ENABLE_EXPLANATIONS", "true").lower() == "true",
                max_workers=int(os.getenv("MAX_WORKERS", "4")),
                timeout_seconds=int(os.getenv("TIMEOUT_SECONDS", "300")),
                log_level=os.getenv("LOG_LEVEL", "INFO").upper(),
            )
        except ConfigError:
            # Validation errors raised by __post_init__ already carry a
            # precise message; re-raise them untouched rather than wrapping
            # them in a second, generic ConfigError.
            raise
        except ValueError as e:
            # int() failed on MAX_WORKERS or TIMEOUT_SECONDS.
            raise ConfigError(f"Invalid numeric configuration value: {e}") from e
        except Exception as e:
            raise ConfigError(f"Configuration loading failed: {e}") from e

        logger.info("Configuration loaded successfully")
        logger.info("SQL Backend: %s", config.sql_backend)
        logger.info("Model Repo: %s", config.hf_model_repo)
        logger.info(
            "Forecasting: %s",
            "enabled" if config.enable_forecasting else "disabled",
        )
        logger.info(
            "Explanations: %s",
            "enabled" if config.enable_explanations else "disabled",
        )

        return config

    def to_dict(self) -> dict:
        """
        Convert configuration to a dictionary (for logging/debugging).

        Token values are never included: ``hf_token`` is reported only as the
        boolean ``hf_token_set``, and ``motherduck_token`` and ``trace_url``
        are omitted entirely. An unset ``gcp_project`` is rendered as the
        string "not set".
        """
        return {
            "sql_backend": self.sql_backend,
            "gcp_project": self.gcp_project or "not set",
            "motherduck_db": self.motherduck_db,
            "hf_model_repo": self.hf_model_repo,
            "hf_token_set": bool(self.hf_token),
            "trace_enabled": self.trace_enabled,
            "enable_forecasting": self.enable_forecasting,
            "enable_explanations": self.enable_explanations,
            "max_workers": self.max_workers,
            "timeout_seconds": self.timeout_seconds,
            "log_level": self.log_level,
        }

    def validate_for_features(self, features: list) -> tuple[bool, list]:
        """
        Validate that the configuration supports the requested features.

        Recognized feature names are "predict", "explain", "forecast" and
        "sql"; any other name is ignored and produces no error.

        Args:
            features: List of feature names to check

        Returns:
            Tuple of (all_valid, list_of_errors). ``all_valid`` is True only
            when ``list_of_errors`` is empty.
        """
        errors = []

        for feature in features:
            # Model-backed features need a real (non-placeholder) repository.
            # This check is deliberately separate from the chain below so
            # that "explain" is also checked against its feature flag.
            if feature in ("predict", "explain"):
                if not self.hf_model_repo or self.hf_model_repo == "your-org/your-model":
                    errors.append(f"{feature} requires valid HF_MODEL_REPO")

            if feature == "forecast":
                if not self.enable_forecasting:
                    errors.append("forecasting is disabled (ENABLE_FORECASTING=false)")

            elif feature == "explain":
                if not self.enable_explanations:
                    errors.append("explanations are disabled (ENABLE_EXPLANATIONS=false)")

            elif feature == "sql":
                if self.sql_backend == "bigquery" and not self.gcp_project:
                    errors.append("BigQuery requires GCP_PROJECT")
                elif self.sql_backend == "motherduck" and not self.motherduck_token:
                    errors.append("MotherDuck requires MOTHERDUCK_TOKEN")

        return not errors, errors