AbhayVG committed on
Commit
65c5351
·
verified ·
1 Parent(s): 3205f8e
Files changed (1) hide show
  1. src.py +227 -399
src.py CHANGED
@@ -20,19 +20,23 @@ hf_token = os.getenv("HF_TOKEN")
20
  gemini_token = os.getenv("GEMINI_TOKEN")
21
 
22
  # Debug print (remove in production)
23
- print(f"Debug - Groq Token: {'Present' if Groq_Token else 'Missing'}")
24
- print(f"Debug - Groq Token Value: {Groq_Token[:10] + '...' if Groq_Token else 'None'}")
25
- print(f"Debug - Gemini Token: {'Present' if gemini_token else 'Missing'}")
26
 
27
  models = {
28
- "gpt-oss-20b": "openai/gpt-oss-20b",
29
  "gpt-oss-120b": "openai/gpt-oss-120b",
30
- "llama3.1": "llama-3.1-8b-instant",
 
31
  "llama3.3": "llama-3.3-70b-versatile",
32
  "deepseek-R1": "deepseek-r1-distill-llama-70b",
33
- "llama4 maverik":"meta-llama/llama-4-maverick-17b-128e-instruct",
34
- "llama4 scout":"meta-llama/llama-4-scout-17b-16e-instruct",
35
- "gemini-pro": "gemini-1.5-pro"
 
 
 
 
36
  }
37
 
38
  def log_interaction(user_query, model_name, response_content, generated_code, execution_time, error_message=None, is_image=False):
@@ -96,159 +100,157 @@ def preprocess_and_load_df(path: str) -> pd.DataFrame:
96
  raise Exception(f"Error loading dataframe: {e}")
97
 
98
 
99
-
100
  def get_from_user(prompt):
101
  """Format user prompt"""
102
  return {"role": "user", "content": prompt}
103
 
104
 
105
-
106
-
107
  def ask_question(model_name, question):
108
  """Ask question with comprehensive error handling and logging"""
109
  start_time = datetime.now()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  try:
111
- # Reload environment variables to get fresh values
112
- load_dotenv(override=True)
113
- fresh_groq_token = os.getenv("GROQ_API_KEY")
114
- fresh_gemini_token = os.getenv("GEMINI_TOKEN")
115
-
116
- print(f"ask_question - Fresh Groq Token: {'Present' if fresh_groq_token else 'Missing'}")
117
-
118
- # Check API availability with fresh tokens
119
- if model_name == "gemini-pro":
120
- if not fresh_gemini_token or fresh_gemini_token.strip() == "":
121
- execution_time = (datetime.now() - start_time).total_seconds()
122
- error_msg = "Missing or empty API token"
123
-
124
- # Log the failed interaction
125
- log_interaction(
126
- user_query=question,
127
- model_name=model_name,
128
- response_content="Gemini API token not available or empty",
129
- generated_code="",
130
- execution_time=execution_time,
131
- error_message=error_msg,
132
- is_image=False
133
- )
134
-
135
- return {
136
- "role": "assistant",
137
- "content": "Gemini API token not available or empty. Please set GEMINI_TOKEN in your environment variables.",
138
- "gen_code": "",
139
- "ex_code": "",
140
- "last_prompt": question,
141
- "error": error_msg
142
- }
143
- llm = ChatGoogleGenerativeAI(
144
- model=models[model_name],
145
- google_api_key=fresh_gemini_token,
146
- temperature=0
147
  )
148
- else:
149
- if not fresh_groq_token or fresh_groq_token.strip() == "":
150
- execution_time = (datetime.now() - start_time).total_seconds()
151
- error_msg = "Missing or empty API token"
152
-
153
- # Log the failed interaction
154
- log_interaction(
155
- user_query=question,
156
- model_name=model_name,
157
- response_content="Groq API token not available or empty",
158
- generated_code="",
159
- execution_time=execution_time,
160
- error_message=error_msg,
161
- is_image=False
162
- )
163
-
164
- return {
165
- "role": "assistant",
166
- "content": "Groq API token not available or empty. Please set GROQ_API_KEY in your environment variables and restart the application.",
167
- "gen_code": "",
168
- "ex_code": "",
169
- "last_prompt": question,
170
- "error": error_msg
171
- }
172
 
173
- # Test the API key by trying to create the client
174
  try:
175
- llm = ChatGroq(
176
- model=models[model_name],
177
- api_key=fresh_groq_token,
178
- temperature=0.1
179
  )
180
- # Test with a simple call to verify the API key works
181
- test_response = llm.invoke("Test")
182
- print("API key test successful")
183
  except Exception as api_error:
184
- execution_time = (datetime.now() - start_time).total_seconds()
185
- error_msg = str(api_error)
186
-
187
- if "organization_restricted" in error_msg.lower() or "unauthorized" in error_msg.lower():
188
- response_content = "API Key Error: Your Groq API key appears to be invalid, expired, or restricted. Please check your API key in the .env file."
189
- log_error_msg = f"API key validation failed: {error_msg}"
190
- else:
191
- response_content = f"API Connection Error: {error_msg}"
192
- log_error_msg = error_msg
193
-
194
- # Log the failed interaction
195
- log_interaction(
196
- user_query=question,
197
- model_name=model_name,
198
- response_content=response_content,
199
- generated_code="",
200
- execution_time=execution_time,
201
- error_message=log_error_msg,
202
- is_image=False
203
  )
204
-
205
- return {
206
- "role": "assistant",
207
- "content": response_content,
208
- "gen_code": "",
209
- "ex_code": "",
210
- "last_prompt": question,
211
- "error": log_error_msg
212
- }
213
 
214
- # Check if data file exists
215
- if not os.path.exists("Data.csv"):
216
- execution_time = (datetime.now() - start_time).total_seconds()
217
- error_msg = "Data file not found"
218
-
219
- # Log the failed interaction
220
- log_interaction(
221
- user_query=question,
222
- model_name=model_name,
223
- response_content="Data.csv file not found",
224
- generated_code="",
225
- execution_time=execution_time,
226
- error_message=error_msg,
227
- is_image=False
228
  )
229
-
230
- return {
231
- "role": "assistant",
232
- "content": "Data.csv file not found. Please ensure the data file is in the correct location.",
233
- "gen_code": "",
234
- "ex_code": "",
235
- "last_prompt": question,
236
- "error": error_msg
237
- }
238
-
239
- df_check = pd.read_csv("Data.csv")
240
- df_check["Timestamp"] = pd.to_datetime(df_check["Timestamp"])
241
- df_check = df_check.head(5)
242
 
243
- new_line = "\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
 
245
- template = f"""```python
 
 
 
 
 
 
 
246
  import pandas as pd
247
  import matplotlib.pyplot as plt
248
  import uuid
249
  import calendar
250
  import numpy as np
251
-
252
  # Set professional matplotlib styling
253
  plt.rcParams.update({{
254
  'font.size': 12,
@@ -272,285 +274,111 @@ plt.rcParams.update({{
272
  'figure.figsize': [12, 6],
273
  'axes.prop_cycle': plt.cycler('color', ['#3b82f6', '#ef4444', '#10b981', '#f59e0b', '#8b5cf6', '#06b6d4'])
274
  }})
275
-
276
- df = pd.read_csv("Data.csv")
277
  df["Timestamp"] = pd.to_datetime(df["Timestamp"])
278
-
279
- # Available columns and data types:
280
- {new_line.join(map(lambda x: '# '+x, str(df_check.dtypes).split(new_line)))}
281
-
 
 
 
 
282
  # Question: {question.strip()}
283
  # Generate code to answer the question and save result in 'answer' variable
284
  # If creating a plot, save it with a unique filename and store the filename in 'answer'
285
  # If returning text/numbers, store the result directly in 'answer'
286
  ```"""
287
 
288
- system_prompt = """Generate Python code to answer the user's question about air quality data.
289
-
290
- CRITICAL: Only generate Python code - no explanations, no thinking, just clean executable code.
291
-
292
- AVAILABLE LIBRARIES:
293
- You can use these pre-installed libraries:
294
- - pandas, numpy (data manipulation)
295
- - matplotlib, seaborn, plotly (visualization)
296
- - statsmodels (statistical modeling, trend analysis)
297
- - scikit-learn (machine learning, regression)
298
- - geopandas (geospatial analysis)
299
-
300
- LIBRARY USAGE RULES:
301
- - For trend analysis: Use numpy.polyfit(x, y, 1) for simple linear trends
302
- - For regression: Use sklearn.linear_model.LinearRegression() for robust regression
303
- - For statistical modeling: Use statsmodels only if needed, otherwise use numpy/sklearn
304
- - Always import libraries at the top: import numpy as np, from sklearn.linear_model import LinearRegression
305
- - Handle missing libraries gracefully with try-except around imports
306
-
307
- OUTPUT TYPE REQUIREMENTS:
308
- 1. PLOT GENERATION (for "plot", "chart", "visualize", "show trend", "graph"):
309
- - MUST create matplotlib figure with proper labels, title, legend
310
- - MUST save plot: filename = f"plot_{uuid.uuid4().hex[:8]}.png"
311
- - MUST call plt.savefig(filename, dpi=300, bbox_inches='tight')
312
- - MUST call plt.close() to prevent memory leaks
313
- - MUST store filename in 'answer' variable: answer = filename
314
- - Handle empty data gracefully before plotting
315
-
316
- 2. TEXT ANSWERS (for simple "Which", "What", single values):
317
- - Store direct string answer in 'answer' variable
318
- - Example: answer = "December had the highest pollution"
319
-
320
- 3. DATAFRAMES (for lists, rankings, comparisons, multiple results):
321
- - Create clean DataFrame with descriptive column names
322
- - Sort appropriately for readability
323
- - Store DataFrame in 'answer' variable: answer = result_df
324
-
325
- MANDATORY SAFETY & ROBUSTNESS RULES:
326
-
327
- DATA VALIDATION (ALWAYS CHECK):
328
- - Check if DataFrame exists and not empty: if df.empty: answer = "No data available"
329
- - Validate required columns exist: if 'PM2.5' not in df.columns: answer = "Required data not available"
330
- - Check for sufficient data: if len(df) < 10: answer = "Insufficient data for analysis"
331
- - Remove invalid/missing values: df = df.dropna(subset=['PM2.5', 'city', 'Timestamp'])
332
- - Use early exit pattern: if condition: answer = "error message"; else: continue with analysis
333
-
334
- OPERATION SAFETY (PREVENT CRASHES):
335
- - Wrap risky operations in try-except blocks
336
- - Check denominators before division: if denominator == 0: continue
337
- - Validate indexing bounds: if idx >= len(array): continue
338
- - Check for empty results after filtering: if result_df.empty: answer = "No data found"
339
- - Convert data types explicitly: pd.to_numeric(), .astype(int), .astype(str)
340
- - Handle timezone issues with datetime operations
341
- - NO return statements - this is script context, use if/else logic flow
342
-
343
- PLOT GENERATION (MANDATORY FOR PLOTS):
344
- - Check data exists before plotting: if plot_data.empty: answer = "No data to plot"
345
- - Always create new figure: plt.figure(figsize=(12, 8))
346
- - Add comprehensive labels: plt.title(), plt.xlabel(), plt.ylabel()
347
- - Handle long city names: plt.xticks(rotation=45, ha='right')
348
- - Use tight layout: plt.tight_layout()
349
- - CRITICAL PLOT SAVING SEQUENCE (no return statements):
350
- 1. filename = f"plot_{uuid.uuid4().hex[:8]}.png"
351
- 2. plt.savefig(filename, dpi=300, bbox_inches='tight')
352
- 3. plt.close()
353
- 4. answer = filename
354
- - Use if/else logic: if data_valid: create_plot(); answer = filename else: answer = "error"
355
-
356
- CRITICAL CODING PRACTICES:
357
-
358
- DATA VALIDATION & SAFETY:
359
- - Always check if DataFrames/Series are empty before operations: if df.empty: return
360
- - Use .dropna() to handle missing values or .fillna() with appropriate defaults
361
- - Validate column names exist before accessing: if 'column' in df.columns
362
- - Check data types before operations: df['col'].dtype, isinstance() checks
363
- - Handle edge cases: empty results, single row/column DataFrames, all NaN columns
364
- - Use .copy() when modifying DataFrames to avoid SettingWithCopyWarning
365
-
366
- VARIABLE & TYPE HANDLING:
367
- - Use descriptive variable names (avoid single letters in complex operations)
368
- - Ensure all variables are defined before use - initialize with defaults
369
- - Convert pandas/numpy objects to proper Python types before operations
370
- - Convert datetime/period objects appropriately: .astype(str), .dt.strftime(), int()
371
- - Always cast to appropriate types for indexing: int(), str(), list()
372
- - CRITICAL: Convert pandas/numpy values to int before list indexing: int(value) for calendar.month_name[int(month_value)]
373
- - Use explicit type conversions rather than relying on implicit casting
374
-
375
- PANDAS OPERATIONS:
376
- - Reference DataFrame properly: df['column'] not 'column' in operations
377
- - Use .loc/.iloc correctly for indexing - avoid chained indexing
378
- - Use .reset_index() after groupby operations when needed for clean DataFrames
379
- - Sort results for consistent output: .sort_values(), .sort_index()
380
- - Use .round() for numerical results to avoid excessive decimals
381
- - Chain operations carefully - split complex chains for readability
382
-
383
- MATPLOTLIB & PLOTTING:
384
- - Always call plt.close() after saving plots to prevent memory leaks
385
- - Use descriptive titles, axis labels, and legends
386
- - Handle cases where no data exists for plotting
387
- - Use proper figure sizing: plt.figure(figsize=(width, height))
388
- - Convert datetime indices to strings for plotting if needed
389
- - Use color palettes consistently
390
-
391
- ERROR PREVENTION:
392
- - Use try-except blocks for operations that might fail
393
- - Check denominators before division operations
394
- - Validate array/list lengths before indexing
395
- - Use .get() method for dictionary access with defaults
396
- - Handle timezone-aware vs naive datetime objects consistently
397
- - Use proper string formatting and encoding for text output
398
-
399
- TECHNICAL REQUIREMENTS:
400
- - Save final result in variable called 'answer'
401
- - For TEXT: Store the direct answer as a string in 'answer'
402
- - For PLOTS: Save with unique filename f"plot_{{uuid.uuid4().hex[:8]}}.png" and store filename in 'answer'
403
- - For DATAFRAMES: Store the pandas DataFrame directly in 'answer' (e.g., answer = result_df)
404
- - Always use .iloc or .loc properly for pandas indexing
405
- - Close matplotlib figures with plt.close() to prevent memory leaks
406
- - Use proper column name checks before accessing columns
407
- - For dataframes, ensure proper column names and sorting for readability
408
- """
409
-
410
- query = f"""{system_prompt}
411
-
412
- Complete the following code to answer the user's question:
413
-
414
- {template}
415
- """
416
-
417
- # Make API call
418
- if model_name == "gemini-pro":
419
- response = llm.invoke(query)
420
- answer = response.content
421
- else:
422
- response = llm.invoke(query)
423
- answer = response.content
424
-
425
- # Extract and execute code with enhanced error handling
426
- try:
427
- if "```python" in answer:
428
- code_part = answer.split("```python")[1].split("```")[0]
429
- else:
430
- code_part = answer
431
 
432
- full_code = f"""
 
 
 
 
433
  {template.split("```python")[1].split("```")[0]}
434
  {code_part}
435
  """
436
-
437
- # Execute code in a controlled environment with better error handling
438
- local_vars = {}
439
- global_vars = {
440
- 'pd': pd,
441
- 'plt': plt,
442
- 'os': os,
443
- 'uuid': __import__('uuid'),
444
- 'calendar': __import__('calendar'),
445
- 'np': __import__('numpy')
446
- }
447
-
448
- exec(full_code, global_vars, local_vars)
449
-
450
- # Get the answer
451
- if 'answer' in local_vars:
452
- answer_result = local_vars['answer']
453
- else:
454
- answer_result = "Code executed but no result was saved in 'answer' variable"
455
-
456
- execution_time = (datetime.now() - start_time).total_seconds()
457
-
458
- # Determine if output is an image
459
- is_image = isinstance(answer_result, str) and any(answer_result.endswith(ext) for ext in ['.png', '.jpg', '.jpeg'])
460
-
461
- # Log successful interaction
462
- log_interaction(
463
- user_query=question,
464
- model_name=model_name,
465
- response_content=str(answer_result),
466
- generated_code=full_code,
467
- execution_time=execution_time,
468
- error_message=None,
469
- is_image=is_image
470
- )
471
-
472
- return {
473
- "role": "assistant",
474
- "content": answer_result,
475
- "gen_code": full_code,
476
- "ex_code": full_code,
477
- "last_prompt": question,
478
- "error": None
479
- }
480
-
481
- except Exception as code_error:
482
- execution_time = (datetime.now() - start_time).total_seconds()
483
- error_msg = str(code_error)
484
-
485
- # Classify and provide user-friendly error messages
486
- user_friendly_msg = "I encountered an error while analyzing your data. "
487
-
488
- if "unmatched" in error_msg.lower() or "invalid syntax" in error_msg.lower():
489
- user_friendly_msg += "There was a syntax error in the generated code (missing brackets or quotes). Please try rephrasing your question or try again."
490
- elif "not defined" in error_msg.lower():
491
- user_friendly_msg += "There was a variable naming error in the generated code. Please try asking the question again."
492
- elif "has no attribute" in error_msg.lower():
493
- user_friendly_msg += "There was an issue accessing data properties. Please try a simpler version of your question."
494
- elif "division by zero" in error_msg.lower():
495
- user_friendly_msg += "The calculation involved division by zero, possibly due to missing data. Please try a different time period or location."
496
- elif "empty" in error_msg.lower() or "no data" in error_msg.lower():
497
- user_friendly_msg += "No relevant data was found for your query. Please try adjusting the time period, location, or criteria."
498
- else:
499
- user_friendly_msg += f"Technical error: {error_msg}"
500
-
501
- user_friendly_msg += "\n\n💡 **Suggestions:**\n- Try rephrasing your question\n- Use simpler terms\n- Check if the data exists for your specified criteria"
502
-
503
- # Log the failed code execution
504
- log_interaction(
505
- user_query=question,
506
- model_name=model_name,
507
- response_content=user_friendly_msg,
508
- generated_code=full_code if 'full_code' in locals() else "",
509
- execution_time=execution_time,
510
- error_message=error_msg,
511
- is_image=False
512
- )
513
-
514
- return {
515
- "role": "assistant",
516
- "content": user_friendly_msg,
517
- "gen_code": full_code if 'full_code' in locals() else "",
518
- "ex_code": full_code if 'full_code' in locals() else "",
519
- "last_prompt": question,
520
- "error": error_msg
521
- }
522
-
523
- except Exception as e:
524
- execution_time = (datetime.now() - start_time).total_seconds()
525
- error_msg = str(e)
526
-
527
- # Handle specific API errors
528
- if "organization_restricted" in error_msg:
529
- response_content = "API Organization Restricted: Your API key access has been restricted. Please check your Groq API key or try generating a new one."
530
- log_error_msg = "API access restricted"
531
- elif "rate_limit" in error_msg.lower():
532
- response_content = "Rate limit exceeded. Please wait a moment and try again."
533
- log_error_msg = "Rate limit exceeded"
534
  else:
535
- response_content = f"Error: {error_msg}"
536
- log_error_msg = error_msg
537
-
538
- # Log the failed interaction
539
  log_interaction(
540
  user_query=question,
541
  model_name=model_name,
542
- response_content=response_content,
543
- generated_code="",
544
  execution_time=execution_time,
545
- error_message=log_error_msg,
546
  is_image=False
547
  )
548
-
549
  return {
550
- "role": "assistant",
551
- "content": response_content,
552
- "gen_code": "",
553
- "ex_code": "",
554
  "last_prompt": question,
555
- "error": log_error_msg
556
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  gemini_token = os.getenv("GEMINI_TOKEN")
21
 
22
  # Debug print (remove in production)
23
+ # print(f"Debug - Groq Token: {'Present' if Groq_Token else 'Missing'}")
24
+ # print(f"Debug - Groq Token Value: {Groq_Token[:10] + '...' if Groq_Token else 'None'}")
25
+ # print(f"Debug - Gemini Token: {'Present' if gemini_token else 'Missing'}")
26
 
27
  models = {
 
28
  "gpt-oss-120b": "openai/gpt-oss-120b",
29
+ "gpt-oss-20b": "openai/gpt-oss-20b",
30
+ "llama4 maverik":"meta-llama/llama-4-maverick-17b-128e-instruct",
31
  "llama3.3": "llama-3.3-70b-versatile",
32
  "deepseek-R1": "deepseek-r1-distill-llama-70b",
33
+ "gemini-2.5-flash": "gemini-2.5-flash",
34
+ "gemini-2.5-pro": "gemini-2.5-pro",
35
+ "gemini-2.5-flash-lite": "gemini-2.5-flash-lite",
36
+ "gemini-2.0-flash": "gemini-2.0-flash",
37
+ "gemini-2.0-flash-lite": "gemini-2.0-flash-lite",
38
+ # "llama4 scout":"meta-llama/llama-4-scout-17b-16e-instruct"
39
+ # "llama3.1": "llama-3.1-8b-instant"
40
  }
41
 
42
  def log_interaction(user_query, model_name, response_content, generated_code, execution_time, error_message=None, is_image=False):
 
100
  raise Exception(f"Error loading dataframe: {e}")
101
 
102
 
 
103
  def get_from_user(prompt):
104
  """Format user prompt"""
105
  return {"role": "user", "content": prompt}
106
 
107
 
 
 
108
  def ask_question(model_name, question):
109
  """Ask question with comprehensive error handling and logging"""
110
  start_time = datetime.now()
111
+ # ------------------------
112
+ # Helper functions
113
+ # ------------------------
114
+ def make_error_response(msg, log_msg, content=None):
115
+ """Build error response + log it"""
116
+ execution_time = (datetime.now() - start_time).total_seconds()
117
+ log_interaction(
118
+ user_query=question,
119
+ model_name=model_name,
120
+ response_content=content or msg,
121
+ generated_code="",
122
+ execution_time=execution_time,
123
+ error_message=log_msg,
124
+ is_image=False
125
+ )
126
+ return {
127
+ "role": "assistant",
128
+ "content": content or msg,
129
+ "gen_code": "",
130
+ "ex_code": "",
131
+ "last_prompt": question,
132
+ "error": log_msg
133
+ }
134
+ def validate_api_token(token, token_name, msg_if_missing):
135
+ """Check for missing/empty API tokens"""
136
+ if not token or token.strip() == "":
137
+ return make_error_response(
138
+ msg="Missing or empty API token",
139
+ log_msg="Missing or empty API token",
140
+ content=msg_if_missing
141
+ )
142
+ return None # OK
143
+ def run_safe_exec(full_code, df=None, extra_globals=None):
144
+ """Safely execute generated code and handle errors"""
145
+ local_vars = {}
146
+ global_vars = {
147
+ 'pd': pd, 'plt': plt, 'os': os,
148
+ 'uuid': __import__('uuid'),
149
+ 'calendar': __import__('calendar'),
150
+ 'np': __import__('numpy'),
151
+ 'df': df # <-- pass your DataFrame here
152
+ }
153
+
154
+ # allow user to inject more globals (optional)
155
+ if extra_globals:
156
+ global_vars.update(extra_globals)
157
+
158
+ try:
159
+ exec(full_code, global_vars, local_vars)
160
+ return (
161
+ local_vars.get('answer', "Code executed but no result was saved in 'answer' variable"),
162
+ None
163
+ )
164
+ except Exception as code_error:
165
+ return None, str(code_error)
166
+
167
+ # ------------------------
168
+ # Step 1: Reload env vars
169
+ # ------------------------
170
+ load_dotenv(override=True)
171
+ fresh_groq_token = os.getenv("GROQ_API_KEY")
172
+ fresh_gemini_token = os.getenv("GEMINI_TOKEN")
173
+ # ------------------------
174
+ # Step 2: Init LLM
175
+ # ------------------------
176
  try:
177
+ if "gemini" in model_name:
178
+ token_error = validate_api_token(
179
+ fresh_gemini_token,
180
+ "GEMINI_TOKEN",
181
+ "Gemini API token not available or empty. Please set GEMINI_TOKEN in your environment variable."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  )
183
+ if token_error:
184
+ return token_error
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
 
 
186
  try:
187
+ llm = ChatGoogleGenerativeAI(
188
+ model=models[model_name],
189
+ google_api_key=fresh_gemini_token,
190
+ temperature=0
191
  )
192
+ # Smoke-test the Gemini credentials with a minimal synchronous call
193
+ llm.invoke("Test")
194
+ # print("Gemini API key test successful")
195
  except Exception as api_error:
196
+ return make_error_response(
197
+ msg="API Connection Error",
198
+ log_msg=str(api_error),
199
+ content="API Key Error: Your Gemini API key appears to be invalid, expired, or restricted. Please check your GEMINI_TOKEN in the .env file."
200
+ if "organization_restricted"in str(api_error).lower() or "unauthorized" in str(api_error).lower()
201
+ else f"API Connection Error: {api_error}"
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  )
 
 
 
 
 
 
 
 
 
203
 
204
+ else:
205
+ token_error = validate_api_token(
206
+ fresh_groq_token,
207
+ "GROQ_API_KEY",
208
+ "Groq API token not available or empty. Please set GROQ_API_KEY in your environment variables and restart the application."
 
 
 
 
 
 
 
 
 
209
  )
210
+ if token_error:
211
+ return token_error
 
 
 
 
 
 
 
 
 
 
 
212
 
213
+ try:
214
+ llm = ChatGroq(
215
+ model=models[model_name],
216
+ api_key=fresh_groq_token,
217
+ temperature=0
218
+ )
219
+ llm.invoke("Test") # test API key
220
+ # print("Groq API key test successful")
221
+ except Exception as api_error:
222
+ return make_error_response(
223
+ msg="API Connection Error",
224
+ log_msg=str(api_error),
225
+ content="API Key Error: Your Groq API key appears to be invalid, expired, or restricted. Please check your GROQ_API_KEY in the .env file."
226
+ if "organization_restricted"in str(api_error).lower() or "unauthorized" in str(api_error).lower()
227
+ else f"API Connection Error: {api_error}"
228
+ )
229
+ except Exception as e:
230
+ return make_error_response(str(e), str(e))
231
+ # ------------------------
232
+ # Step 3: Check AQ_met_data.csv
233
+ # ------------------------
234
+ if not os.path.exists("AQ_met_data.csv"):
235
+ return make_error_response(
236
+ msg="Data file not found",
237
+ log_msg="Data file not found",
238
+ content="AQ_met_data.csv file not found. Please ensure the data file is in the correct location."
239
+ )
240
 
241
+ df = pd.read_csv("AQ_met_data.csv")
242
+ df["Timestamp"] = pd.to_datetime(df["Timestamp"])
243
+ new_line = "\n"
244
+ states_df = pd.read_csv("states_data.csv")
245
+ ncap_df = pd.read_csv("ncap_funding_data.csv")
246
+
247
+ # Template for user query
248
+ template = f"""```python
249
  import pandas as pd
250
  import matplotlib.pyplot as plt
251
  import uuid
252
  import calendar
253
  import numpy as np
 
254
  # Set professional matplotlib styling
255
  plt.rcParams.update({{
256
  'font.size': 12,
 
274
  'figure.figsize': [12, 6],
275
  'axes.prop_cycle': plt.cycler('color', ['#3b82f6', '#ef4444', '#10b981', '#f59e0b', '#8b5cf6', '#06b6d4'])
276
  }})
277
+ df = pd.read_csv("AQ_met_data.csv")
 
278
  df["Timestamp"] = pd.to_datetime(df["Timestamp"])
279
+ states_df = pd.read_csv("states_data.csv")
280
+ ncap_df = pd.read_csv("ncap_funding_data.csv")
281
+ # df is pandas DataFrame with air quality data from India. Data frequency is daily from 2017 to 2024. The data has the following columns and data types:
282
+ {new_line.join(map(lambda x: '# '+x, str(df.dtypes).split(new_line)))}
283
+ # states_df is a pandas DataFrame of state-wise population, area and whether state is union territory or not of India.
284
+ {new_line.join(map(lambda x: '# '+x, str(states_df.dtypes).split(new_line)))}
285
+ # ncap_df is a pandas DataFrame of funding given to the cities of India from 2019-2022, under The National Clean Air Program (NCAP).
286
+ {new_line.join(map(lambda x: '# '+x, str(ncap_df.dtypes).split(new_line)))}
287
  # Question: {question.strip()}
288
  # Generate code to answer the question and save result in 'answer' variable
289
  # If creating a plot, save it with a unique filename and store the filename in 'answer'
290
  # If returning text/numbers, store the result directly in 'answer'
291
  ```"""
292
 
293
+ # Read system prompt from txt file
294
+ with open("new_system_prompt.txt", "r", encoding="utf-8") as f:
295
+ system_prompt = f.read().strip()
296
+
297
+ messages = [
298
+ {
299
+ "role": "system",
300
+ "content": system_prompt
301
+ },
302
+ {
303
+ "role": "user",
304
+ "content": f"""Complete the following code to answer the user's question:
305
+ {template}"""
306
+ }
307
+ ]
308
+
309
+ # ------------------------
310
+ # Step 4: Call model
311
+ # ------------------------
312
+ try:
313
+ response = llm.invoke(messages)
314
+ answer = response.content
315
+ except Exception as e:
316
+ return make_error_response(f"Error: {e}", str(e))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317
 
318
+ # ------------------------
319
+ # Step 5: Extract code
320
+ # ------------------------
321
+ code_part = answer.split("```python")[1].split("```")[0] if "```python" in answer else answer
322
+ full_code = f"""
323
  {template.split("```python")[1].split("```")[0]}
324
  {code_part}
325
  """
326
+ answer_result, code_error = run_safe_exec(full_code, df, extra_globals={'states_df': states_df, 'ncap_df': ncap_df})
327
+
328
+ execution_time = (datetime.now() - start_time).total_seconds()
329
+ if code_error:
330
+ # Friendly error messages
331
+ msg = "I encountered an error while analyzing your data. "
332
+ if "syntax" in code_error.lower():
333
+ msg += "There was a syntax error in the generated code. Please try rephrasing your question."
334
+ elif "not defined" in code_error.lower():
335
+ msg += "Variable naming error occurred. Please try asking the question again."
336
+ elif "division by zero" in code_error.lower():
337
+ msg += "Calculation involved division by zero, possibly due to missing data."
338
+ elif "no data" in code_error.lower() or "empty" in code_error.lower():
339
+ msg += "No relevant data was found for your query."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
340
  else:
341
+ msg += f"Technical error: {code_error}"
342
+
343
+ msg += "\n\n💡 **Suggestions:**\n- Try rephrasing your question\n- Use simpler terms\n- Check if the data exists for your specified criteria"
344
+
345
  log_interaction(
346
  user_query=question,
347
  model_name=model_name,
348
+ response_content=msg,
349
+ generated_code=full_code,
350
  execution_time=execution_time,
351
+ error_message=code_error,
352
  is_image=False
353
  )
 
354
  return {
355
+ "role": "assistant",
356
+ "content": msg,
357
+ "gen_code": full_code,
358
+ "ex_code": full_code,
359
  "last_prompt": question,
360
+ "error": code_error
361
+ }
362
+
363
+ # ------------------------
364
+ # Step 7: Success logging
365
+ # ------------------------
366
+ is_image = isinstance(answer_result, str) and answer_result.endswith(('.png', '.jpg', '.jpeg'))
367
+ log_interaction(
368
+ user_query=question,
369
+ model_name=model_name,
370
+ response_content=str(answer_result),
371
+ generated_code=full_code,
372
+ execution_time=execution_time,
373
+ error_message=None,
374
+ is_image=is_image
375
+ )
376
+
377
+ return {
378
+ "role": "assistant",
379
+ "content": answer_result,
380
+ "gen_code": full_code,
381
+ "ex_code": full_code,
382
+ "last_prompt": question,
383
+ "error": None
384
+ }