File size: 16,637 Bytes
cfea739
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f89b28b
 
 
 
 
 
 
 
 
 
cfea739
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f89b28b
 
cfea739
 
f89b28b
 
 
 
cfea739
 
 
 
 
f89b28b
 
cfea739
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
610152e
cfea739
 
 
 
 
 
 
 
 
 
 
 
 
 
610152e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cfea739
 
 
 
610152e
cfea739
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
610152e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cfea739
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
# cams_downloader.py
# Download CAMS atmospheric composition data

import cdsapi
import zipfile
import os
from pathlib import Path
from datetime import datetime, timedelta
import pandas as pd

class CAMSDownloader:
    """
    Download, cache and unpack CAMS atmospheric composition forecast data.

    Archives are stored as ``<download_dir>/<YYYY-MM-DD>-cams.nc.zip`` and the
    NetCDF members are unpacked into ``<download_dir>/extracted``.
    """

    # Variables requested when the caller does not pass its own list.
    DEFAULT_VARIABLES = (
        # Meteorological surface-level variables
        "10m_u_component_of_wind",
        "10m_v_component_of_wind",
        "2m_temperature",
        "mean_sea_level_pressure",

        # Pollution surface-level variables
        "particulate_matter_1um",
        "particulate_matter_2.5um",
        "particulate_matter_10um",
        "total_column_carbon_monoxide",
        "total_column_nitrogen_monoxide",
        "total_column_nitrogen_dioxide",
        "total_column_ozone",
        "total_column_sulphur_dioxide",

        # Meteorological atmospheric variables
        "u_component_of_wind",
        "v_component_of_wind",
        "temperature",
        "geopotential",
        "specific_humidity",

        # Pollution atmospheric variables
        "carbon_monoxide",
        "nitrogen_dioxide",
        "nitrogen_monoxide",
        "ozone",
        "sulphur_dioxide",
    )

    # Pressure levels requested when the caller does not pass its own list.
    DEFAULT_PRESSURE_LEVELS = (
        "50", "100", "150", "200", "250", "300", "400",
        "500", "600", "700", "850", "925", "1000",
    )

    # Downloads smaller than this are inspected for an error payload.
    _MIN_DOWNLOAD_BYTES = 10000
    # Archives smaller than this are inspected for an error payload before extraction.
    _MIN_ARCHIVE_BYTES = 1000
    # Chunk size used when streaming members out of an archive.
    _COPY_CHUNK_BYTES = 1024 * 1024

    def __init__(self, download_dir="downloads"):
        """
        Initialize CAMS downloader

        Parameters:
        download_dir (str): Directory to store downloaded files
        """
        self.download_dir = Path(download_dir)
        # parents=True so that nested locations such as "data/cams" work too.
        self.download_dir.mkdir(parents=True, exist_ok=True)

        # Create subdirectories
        self.extracted_dir = self.download_dir / "extracted"
        self.extracted_dir.mkdir(parents=True, exist_ok=True)

        self.client = None
        self._init_client()

    def _init_client(self):
        """
        Initialize CDS API client

        Credentials are looked up in this order: the CDSAPI_URL / CDSAPI_KEY
        environment variables, a .cdsapirc file in the current directory, a
        .cdsapirc file in the home directory, and finally the cdsapi defaults.
        Any failure is reported on stdout and leaves ``self.client`` as None
        instead of raising.
        """
        try:
            # First, try environment variables (preferred for cloud deployments)
            cdsapi_url = os.getenv('CDSAPI_URL')
            cdsapi_key = os.getenv('CDSAPI_KEY')

            if cdsapi_url and cdsapi_key:
                self.client = cdsapi.Client(key=cdsapi_key, url=cdsapi_url)
                print("✅ CDS API client initialized from environment variables")
                return

            # Fallback: .cdsapirc in the current directory first, then the home directory
            cdsapirc_path = Path.cwd() / ".cdsapirc"
            if not cdsapirc_path.exists():
                cdsapirc_path = Path.home() / ".cdsapirc"

            if cdsapirc_path.exists():
                url, key = self._read_cdsapirc(cdsapirc_path)
                if not (url and key):
                    raise ValueError("Could not parse URL or key from .cdsapirc file")
                self.client = cdsapi.Client(key=key, url=url)
                print("✅ CDS API client initialized from .cdsapirc file")
                return

            # Last resort: Try default initialization
            self.client = cdsapi.Client()
            print("✅ CDS API client initialized with default settings")

        except Exception as e:
            # Deliberate best-effort: callers check is_client_ready() afterwards.
            print(f"⚠️  Warning: Could not initialize CDS API client: {str(e)}")
            print("Please ensure you have:")
            print("1. Created an account at https://cds.climate.copernicus.eu/")
            print("2. Set CDSAPI_URL and CDSAPI_KEY environment variables (recommended for cloud deployments)")
            print("3. Or created a .cdsapirc file in your home directory with your credentials")
            self.client = None

    @staticmethod
    def _read_cdsapirc(path):
        """
        Parse the ``url:`` and ``key:`` entries of a .cdsapirc file

        Parameters:
        path (str or Path): Location of the .cdsapirc file

        Returns:
        tuple: (url, key); an entry is None when it is missing from the file
        """
        url = None
        key = None
        with open(path, 'r') as f:
            for line in f:
                line = line.strip()
                # Split on the first colon only: URLs and keys contain colons themselves.
                if line.startswith('url:'):
                    url = line.split(':', 1)[1].strip()
                elif line.startswith('key:'):
                    key = line.split(':', 1)[1].strip()
        return url, key

    def is_client_ready(self):
        """Check if CDS API client is ready"""
        return self.client is not None

    def download_cams_data(self, date_str, variables=None, pressure_levels=None):
        """
        Download CAMS atmospheric composition data for a specific date

        The archive is first written to a ``.part`` file and only renamed to its
        final name once it passed the basic checks, so an interrupted download
        is never mistaken for a cached one on the next call.

        Parameters:
        date_str (str): Date in YYYY-MM-DD format
        variables (list): List of variables to download (default: DEFAULT_VARIABLES)
        pressure_levels (list): List of pressure levels (default: DEFAULT_PRESSURE_LEVELS)

        Returns:
        str: Path to downloaded ZIP file

        Raises:
        ValueError: If date_str cannot be parsed as a date
        Exception: If the client is not initialized or the download fails
        """
        if not self.is_client_ready():
            raise Exception("CDS API client not initialized. Please check your credentials.")

        # Validate and normalize the date
        try:
            date_str = pd.to_datetime(date_str).strftime('%Y-%m-%d')
        except Exception as err:
            raise ValueError(f"Invalid date format: {date_str}. Use YYYY-MM-DD format.") from err

        # Check if data already exists
        filename = f"{date_str}-cams.nc.zip"
        filepath = self.download_dir / filename

        if filepath.exists():
            print(f"✅ Data for {date_str} already exists: {filename}")
            return str(filepath)

        if variables is None:
            variables = list(self.DEFAULT_VARIABLES)

        if pressure_levels is None:
            pressure_levels = list(self.DEFAULT_PRESSURE_LEVELS)

        print(f"🔄 Downloading CAMS data for {date_str}...")
        print(f"Variables: {len(variables)} selected")
        print(f"Pressure levels: {len(pressure_levels)} levels")

        partial_path = filepath.with_name(filename + ".part")

        try:
            # Make the API request
            print("📡 Requesting data from CAMS API...")
            self.client.retrieve(
                "cams-global-atmospheric-composition-forecasts",
                {
                    "type": "forecast",
                    "leadtime_hour": "0",
                    "variable": variables,
                    "pressure_level": pressure_levels,
                    "date": date_str,
                    "time": ["00:00", "12:00"],  # Two time steps
                    "format": "netcdf_zip",
                },
                str(partial_path),
            )

            if not partial_path.exists():
                raise Exception("Download completed but file was not created")

            file_size = partial_path.stat().st_size
            print(f"📁 Downloaded file size: {file_size / 1024 / 1024:.2f} MB")

            # Basic validation - CAMS files should be reasonably large
            if file_size < self._MIN_DOWNLOAD_BYTES:
                print(f"⚠️  Warning: Downloaded file is very small ({file_size} bytes)")
                # Read first few bytes to check for error messages
                with open(partial_path, 'rb') as f:
                    header = f.read(200).lower()
                if b'error' in header or b'html' in header:
                    raise Exception("CAMS API returned an error response instead of data")

            # Publish the archive under its final name only after validation.
            partial_path.replace(filepath)

        except Exception as e:
            # Clean up partial download
            if partial_path.exists():
                print(f"🗑️  Cleaning up failed download: {partial_path}")
                partial_path.unlink()
            raise Exception(f"Error downloading CAMS data: {str(e)}") from e

        print(f"✅ Successfully downloaded: {filename}")
        return str(filepath)

    def _validate_archive(self, zip_path):
        """
        Check that zip_path is a readable ZIP archive and not an error payload

        Parameters:
        zip_path (Path): Path to the file to check

        Raises:
        Exception: If the file cannot be read, looks like an HTML/error
                   response, or is not a valid ZIP archive
        """
        try:
            file_size = zip_path.stat().st_size
            with open(zip_path, 'rb') as f:
                header = f.read(200)
        except OSError as err:
            raise Exception(f"Error validating ZIP file: {str(err)}") from err

        if file_size < self._MIN_ARCHIVE_BYTES:
            print(f"⚠️  Downloaded file is too small ({file_size} bytes), likely an error response")
            probe = header[:100].lower()
            if b'html' in probe or b'error' in probe:
                raise Exception("Downloaded file appears to be an HTML error page, not ZIP data")

        if not zipfile.is_zipfile(zip_path):
            print(f"❌ File is not a valid ZIP file: {zip_path}")
            # Show the start of the file to help diagnose what was downloaded
            preview = header.decode(errors='ignore')[:100]
            print(f"File contents preview: {preview}...")
            raise Exception(f"Downloaded file is not a valid ZIP archive. File size: {file_size} bytes")

    def _extract_member(self, zf, member, dest):
        """
        Stream one archive member to dest without loading it into memory

        The data is written to a ``.part`` file next to dest and renamed once
        complete, so a failed extraction never leaves a truncated dest behind.

        Parameters:
        zf (zipfile.ZipFile): Open archive
        member (str): Name of the member inside the archive
        dest (Path): Final output path
        """
        partial = dest.with_name(dest.name + ".part")
        try:
            with zf.open(member) as src, open(partial, "wb") as dst:
                for chunk in iter(lambda: src.read(self._COPY_CHUNK_BYTES), b""):
                    dst.write(chunk)
            partial.replace(dest)
        finally:
            # Only still present when the copy or the rename failed.
            if partial.exists():
                partial.unlink()

    def extract_cams_files(self, zip_path):
        """
        Extract surface and atmospheric data from CAMS ZIP file

        Members whose name contains "sfc" / "plev" are written to
        ``<date>-cams-surface.nc`` / ``<date>-cams-atmospheric.nc``. Files that
        already exist in the extraction directory are reused. If neither kind
        is found, every ``.nc`` member is extracted under its own base name.

        Parameters:
        zip_path (str): Path to CAMS ZIP file

        Returns:
        dict: Paths to extracted files, keyed by 'surface' / 'atmospheric',
              or by archive member name in the fallback case

        Raises:
        FileNotFoundError: If zip_path does not exist
        Exception: If the file is not a valid archive, extraction fails, or
                   the archive contains no NetCDF files
        """
        zip_path = Path(zip_path)
        if not zip_path.exists():
            raise FileNotFoundError(f"ZIP file not found: {zip_path}")

        self._validate_archive(zip_path)

        # Extract date from filename
        date_str = zip_path.stem.replace("-cams.nc", "")

        # label -> (marker searched in member names, output path)
        targets = {
            'surface': ('sfc', self.extracted_dir / f"{date_str}-cams-surface.nc"),
            'atmospheric': ('plev', self.extracted_dir / f"{date_str}-cams-atmospheric.nc"),
        }

        extracted_files = {}

        try:
            with zipfile.ZipFile(zip_path, "r") as zf:
                zip_contents = zf.namelist()

                for label, (marker, dest) in targets.items():
                    if dest.exists():
                        # Already extracted by an earlier call
                        extracted_files[label] = str(dest)
                        continue

                    member = next(
                        (name for name in zip_contents if marker in name.lower()),
                        None,
                    )
                    if member is not None:
                        self._extract_member(zf, member, dest)
                        print(f"✅ Extracted {label} data: {dest.name}")
                        extracted_files[label] = str(dest)

                # If no specific files found, extract all .nc files
                if not extracted_files:
                    for nc_file in zip_contents:
                        if not nc_file.endswith('.nc'):
                            continue
                        # Use the base name only: keeps the output inside
                        # extracted_dir even if the member name has directories.
                        output_path = self.extracted_dir / Path(nc_file).name
                        if not output_path.exists():
                            self._extract_member(zf, nc_file, output_path)
                        # Report the file even when it was extracted earlier.
                        extracted_files[nc_file] = str(output_path)

        except Exception as e:
            raise Exception(f"Error extracting ZIP file: {str(e)}") from e

        if not extracted_files:
            raise Exception("No NetCDF files found in ZIP archive")

        return extracted_files

    def get_available_dates(self, start_date=None, end_date=None):
        """
        Get list of dates for which CAMS data is typically available
        Note: This doesn't check actual availability, just generates reasonable date range

        Parameters:
        start_date (str): Start date (default: 30 days ago)
        end_date (str): End date (default: yesterday)

        Returns:
        list: List of date strings in YYYY-MM-DD format
        """
        if start_date is None:
            start_date = (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')

        if end_date is None:
            end_date = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')

        # Generate date range
        date_range = pd.date_range(start=start_date, end=end_date, freq='D')
        return [date.strftime('%Y-%m-%d') for date in date_range]

    def list_downloaded_files(self):
        """
        List all downloaded CAMS files

        Returns:
        list: One dict per archive with the keys 'date', 'zip_path', 'size_mb'
              and 'downloaded' (modification timestamp), newest date first
        """
        downloaded_files = []

        for zip_file in self.download_dir.glob("*-cams.nc.zip"):
            stats = zip_file.stat()
            downloaded_files.append({
                'date': zip_file.stem.replace("-cams.nc", ""),
                'zip_path': str(zip_file),
                'size_mb': stats.st_size / (1024 * 1024),
                'downloaded': stats.st_mtime,
            })

        # Sort by date (newest first)
        downloaded_files.sort(key=lambda x: x['date'], reverse=True)
        return downloaded_files

    def cleanup_old_files(self, days_old=30):
        """
        Clean up downloaded files older than specified days

        Parameters:
        days_old (int): Delete files older than this many days

        Returns:
        int: Number of deleted files; 0 if the cleanup failed
        """
        try:
            cutoff_date = datetime.now() - timedelta(days=days_old)

            deleted_count = 0
            for zip_file in self.download_dir.glob("*-cams.nc.zip"):
                if datetime.fromtimestamp(zip_file.stat().st_mtime) < cutoff_date:
                    zip_file.unlink()
                    deleted_count += 1

            # Also clean extracted files
            for nc_file in self.extracted_dir.glob("*.nc"):
                if datetime.fromtimestamp(nc_file.stat().st_mtime) < cutoff_date:
                    nc_file.unlink()
                    deleted_count += 1

            print(f"🧹 Cleaned up {deleted_count} old files")
            return deleted_count

        except Exception as e:
            # Best-effort maintenance: report the problem instead of raising.
            print(f"Error during cleanup: {str(e)}")
            return 0


def test_cams_downloader():
    """
    Smoke-test the CAMS downloader: download one day, extract it, list the cache

    Needs working CDS credentials; progress and failures are printed to stdout.

    Returns:
    bool: True when every step succeeded, False otherwise
    """
    print("Testing CAMS downloader...")

    downloader = CAMSDownloader()

    if not downloader.is_client_ready():
        print("❌ CDS API client not ready. Please check your credentials.")
        return False

    # Request the date 600 days before today
    test_date = (datetime.now() - timedelta(days=600)).strftime('%Y-%m-%d')

    print(f"Testing download for date: {test_date}")
    print("⚠️  This may take several minutes for the first download...")

    try:
        # Download data (will skip if already exists)
        zip_path = downloader.download_cams_data(test_date)
        print(f"✅ Download successful: {zip_path}")

        # Test extraction
        extracted_files = downloader.extract_cams_files(zip_path)
        print(f"✅ Extraction successful: {len(extracted_files)} files")

        # List downloaded files
        downloaded = downloader.list_downloaded_files()
        print(f"✅ Found {len(downloaded)} downloaded files")

        return True

    except Exception as e:
        # Top-level boundary of the smoke test: report and signal failure.
        print(f"❌ Test failed: {str(e)}")
        return False


if __name__ == "__main__":
    # Propagate the smoke-test outcome as the process exit status (0 = success)
    # so shells and CI jobs can detect a failed run.
    raise SystemExit(0 if test_cams_downloader() else 1)