agent-course-gaia

Sleeping

File size: 5,789 Bytes

e77bace

import pandas as pd
from smolagents import tool
from typing import Union, Optional


@tool
def get_csv_data_summary(file_path: str) -> str:
    """
    Reads a CSV file from the given file path and returns a summary of its content.
    The summary includes the number of rows and columns, column names, and basic descriptive statistics.

    Args:
        file_path (str): The absolute local path to the CSV file. 
                         This path should be obtained from the 'File Information' section if the file was downloaded by the agent.

    Returns:
        str: A string containing the data summary (shape, columns, descriptive statistics) or an error message if processing fails.
    """
    try:
        df = pd.read_csv(file_path)

        summary = f"Successfully read CSV file: '{file_path}'\n"
        summary += f"Number of rows: {len(df)}\n"
        summary += f"Number of columns: {len(df.columns)}\n"
        summary += f"Column names: {', '.join(df.columns.astype(str))}\n\n"
        summary += "Descriptive statistics:\n"
        # include='all' for mixed types
        summary += df.describe(include='all').to_string()

        # For very wide dataframes, head might be more useful than full describe in limited contexts
        # if len(df.columns) > 15:
        #     summary += "\n\nFirst 5 rows (due to large number of columns):\n"
        #     summary += df.head().to_string()

        return summary
    except ImportError:
        return "Error: The 'pandas' library is required but not installed. Please ensure it is available in the agent's environment."
    except FileNotFoundError:
        return f"Error: The CSV file was not found at the specified path: '{file_path}'. Please verify the path."
    except pd.errors.EmptyDataError:
        return f"Error: The CSV file at '{file_path}' is empty."
    except Exception as e:
        return f"Error processing CSV file '{file_path}': {type(e).__name__} - {str(e)}"


@tool
def get_excel_data_summary(file_path: str, sheet_name: Optional[str] = "0") -> str:
    """
    Reads an Excel file (supports .xls and .xlsx) from the given file path and returns a summary of the specified sheet's content.
    The summary includes the number of rows and columns, column names, and basic descriptive statistics for that sheet.

    Args:
        file_path (str): The absolute local path to the Excel file.
                         This path should be obtained from the 'File Information' section if the file was downloaded by the agent.
        sheet_name (str | int | None, optional): The name of the sheet to read (e.g., "Sheet1") or its 0-indexed position (e.g., 0).
                                                 If None or 0, the first sheet is read. Defaults to 0 (the first sheet).

    Returns:
        str: A string containing the data summary from the specified sheet (shape, columns, descriptive statistics) or an error message.
    """
    try:
        # Determine engine based on file extension for clearer error messages if engine is missing
        engine = None
        if file_path.endswith('.xlsx'):
            engine = 'openpyxl'
        elif file_path.endswith('.xls'):
            # or 'openpyxl' if xlrd is not available and openpyxl can handle it.
            engine = 'xlrd'

        actual_sheet_name_for_pandas: Union[str, int, None]
        if sheet_name is None:
            actual_sheet_name_for_pandas = 0  # Default to first sheet
        elif sheet_name.isdigit():
            actual_sheet_name_for_pandas = int(sheet_name)
        else:
            actual_sheet_name_for_pandas = sheet_name

        df = pd.read_excel(
            file_path, sheet_name=actual_sheet_name_for_pandas, engine=engine)

        sheet_identifier = f"sheet '{sheet_name}'" if sheet_name is not None else "the first sheet"
        summary = f"Successfully read {sheet_identifier} from Excel file: '{file_path}'\n"
        summary += f"Number of rows: {len(df)}\n"
        summary += f"Number of columns: {len(df.columns)}\n"
        summary += f"Column names: {', '.join(df.columns.astype(str))}\n\n"
        summary += "Descriptive statistics:\n"
        summary += df.describe(include='all').to_string()

        return summary
    except ImportError:
        return ("Error: The 'pandas' library and an Excel engine ('openpyxl' for .xlsx, 'xlrd' for .xls) "
                "are required. Please ensure they are available in the agent's environment.")
    except FileNotFoundError:
        return f"Error: The Excel file was not found at the specified path: '{file_path}'. Please verify the path."
    except pd.errors.EmptyDataError:  # Though less common for Excel sheets than CSVs
        return f"Error: The specified sheet in Excel file '{file_path}' is empty or could not be parsed as data."
    except ValueError as ve:  # Catches incorrect sheet names/indices from pandas
        if "sheet_name" in str(ve).lower():
            return f"Error: Sheet '{sheet_name}' not found in Excel file '{file_path}'. Please check the sheet name or index."
        return f"Error processing Excel file '{file_path}': ValueError - {str(ve)}"
    except Exception as e:
        # Specific check for missing engines, as pandas might raise a general Exception or ValueError
        err_str = str(e).lower()
        if "openpyxl" in err_str and "install openpyxl" in err_str:
            return f"Error: Missing 'openpyxl' engine for Excel file '{file_path}'. Please install it."
        if "xlrd" in err_str and ("install xlrd" in err_str or "support for .xls files" in err_str):
            return f"Error: Missing 'xlrd' engine for .xls Excel file '{file_path}'. Please install it or try 'openpyxl' if compatible."
        return f"Error processing Excel file '{file_path}': {type(e).__name__} - {str(e)}"