Spaces:
Sleeping
Sleeping
Support XLS/XLSX and CSV files
Browse files- basic_agent.py +24 -22
- file_processing_tools.py +108 -0
- requirements.txt +4 -1
basic_agent.py
CHANGED
|
@@ -2,6 +2,7 @@ import os
|
|
| 2 |
# This assumes 'file_handler.py' is in the same directory or accessible via PYTHONPATH
|
| 3 |
from file_handler import get_task_file_path, DEFAULT_FILES_DIR
|
| 4 |
from youtube_tool import get_youtube_video_transcript
|
|
|
|
| 5 |
|
| 6 |
from smolagents import (
|
| 7 |
ToolCallingAgent,
|
|
@@ -74,11 +75,13 @@ class BasicAgent:
|
|
| 74 |
|
| 75 |
agent_tools = [
|
| 76 |
get_youtube_video_transcript,
|
|
|
|
|
|
|
| 77 |
DuckDuckGoSearchTool(),
|
| 78 |
VisitWebpageTool(),
|
| 79 |
WikipediaSearchTool(),
|
| 80 |
PythonInterpreterTool(authorized_imports=[
|
| 81 |
-
'statistics', 'unicodedata', 'collections', 'queue', 'time', 'pandas',
|
| 82 |
'stat', 'random', 'datetime', 're', 'math', 'itertools', 'os', 'sys',
|
| 83 |
'io', 'csv', 'json', 'pathlib', 'subprocess', 'base64'
|
| 84 |
])
|
|
@@ -87,7 +90,7 @@ class BasicAgent:
|
|
| 87 |
agent = ToolCallingAgent(
|
| 88 |
tools=agent_tools,
|
| 89 |
model=model,
|
| 90 |
-
max_steps=
|
| 91 |
name="TaskProcessorAgent",
|
| 92 |
description="An agent designed to answer questions by searching the web, processing local files (if a path is provided in 'File Information'), and executing Python code.",
|
| 93 |
verbosity_level=2
|
|
@@ -96,31 +99,30 @@ class BasicAgent:
|
|
| 96 |
# Simplified prompt template
|
| 97 |
prompt_template = f"""
|
| 98 |
Your primary goal is to accurately and concisely answer the provided question using your available tools and any supplied information.
|
|
|
|
| 99 |
|
| 100 |
Key Instructions:
|
| 101 |
-
1. **Understand the Task**: Carefully read the entire question.
|
| 102 |
2. **Strategize and Select Tools**:
|
| 103 |
-
* Choose the most appropriate tool(s) based on the question, file details, and
|
| 104 |
-
* For web research,
|
| 105 |
-
*
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
* For general calculations, data analysis not involving a mentioned local file, or other Python tasks,
|
| 109 |
-
3. **Information
|
| 110 |
-
* When
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
|
|
|
|
|
|
|
|
|
| 114 |
4. **Handling Tool Errors**:
|
| 115 |
-
* If a tool call
|
| 116 |
-
a. **Analyze the error message.**
|
| 117 |
-
b. **Do not immediately retry the exact same tool call with the exact same arguments.**
|
| 118 |
-
c. Instead, try to **correct the input to the tool** (e.g., fix the Python code, verify a URL or file path).
|
| 119 |
-
d. If correction isn't obvious or fails, consider if a different tool or a different approach is more suitable.
|
| 120 |
5. **Formulate Your Response**:
|
| 121 |
-
* Provide only the final, concise answer
|
| 122 |
-
*
|
| 123 |
-
* If, after a thorough investigation and attempts to use tools effectively (including handling errors), you cannot determine a definitive answer, respond with the exact phrase 'I don't know'.
|
| 124 |
|
| 125 |
--- Start of Question & File Information ---
|
| 126 |
{question_to_llm}
|
|
|
|
| 2 |
# This assumes 'file_handler.py' is in the same directory or accessible via PYTHONPATH
|
| 3 |
from file_handler import get_task_file_path, DEFAULT_FILES_DIR
|
| 4 |
from youtube_tool import get_youtube_video_transcript
|
| 5 |
+
from file_processing_tools import get_csv_data_summary, get_excel_data_summary
|
| 6 |
|
| 7 |
from smolagents import (
|
| 8 |
ToolCallingAgent,
|
|
|
|
| 75 |
|
| 76 |
agent_tools = [
|
| 77 |
get_youtube_video_transcript,
|
| 78 |
+
get_csv_data_summary,
|
| 79 |
+
get_excel_data_summary,
|
| 80 |
DuckDuckGoSearchTool(),
|
| 81 |
VisitWebpageTool(),
|
| 82 |
WikipediaSearchTool(),
|
| 83 |
PythonInterpreterTool(authorized_imports=[
|
| 84 |
+
'statistics', 'unicodedata', 'collections', 'queue', 'time', 'pandas', 'openpyxl',
|
| 85 |
'stat', 'random', 'datetime', 're', 'math', 'itertools', 'os', 'sys',
|
| 86 |
'io', 'csv', 'json', 'pathlib', 'subprocess', 'base64'
|
| 87 |
])
|
|
|
|
| 90 |
agent = ToolCallingAgent(
|
| 91 |
tools=agent_tools,
|
| 92 |
model=model,
|
| 93 |
+
max_steps=4,
|
| 94 |
name="TaskProcessorAgent",
|
| 95 |
description="An agent designed to answer questions by searching the web, processing local files (if a path is provided in 'File Information'), and executing Python code.",
|
| 96 |
verbosity_level=2
|
|
|
|
| 99 |
# Simplified prompt template
|
| 100 |
prompt_template = f"""
|
| 101 |
Your primary goal is to accurately and concisely answer the provided question using your available tools and any supplied information.
|
| 102 |
+
The framework will provide you with a list of available tools and their descriptions.
|
| 103 |
|
| 104 |
Key Instructions:
|
| 105 |
+
1. **Understand the Task**: Carefully read the entire question. Note any URLs, specific phrases to find, or file information (check the "File Information" section provided below the question; it states if a local file is available and its path).
|
| 106 |
2. **Strategize and Select Tools**:
|
| 107 |
+
* Choose the most appropriate tool(s) for the task based on the question, any file details, URLs provided, and the capabilities of your available tools.
|
| 108 |
+
* For general web research or finding information/URLs, consider tools like `web_search` or `wikipedia_search`.
|
| 109 |
+
* To get content from specific web page URLs, use a tool like `visit_webpage`.
|
| 110 |
+
* **For YouTube videos**: If the question is about the content of a YouTube video (e.g., "Examine the video at https://www.youtube.com/..."), consider using the `get_youtube_video_transcript` tool with the video URL or ID to get its transcript. **Once a transcript is retrieved, your primary focus should be to analyze this transcript to answer the question.**
|
| 111 |
+
* **Working with Local Files**: If the "File Information" section indicates a local file path is available *and that file's content is needed to answer the question*, the `PythonInterpreterTool` is the primary tool to access this local file. Generate Python code for this tool to open the file (using its full, exact local path from "File Information"), then read, process, or execute its content as required.
|
| 112 |
+
* For general calculations, data analysis not involving a mentioned local file, or other Python tasks, the `PythonInterpreterTool` is appropriate.
|
| 113 |
+
3. **Information Processing and Answer Extraction**:
|
| 114 |
+
* **Direct Tool Output First**: When a tool (like `get_youtube_video_transcript`, `visit_webpage`, or `PythonInterpreterTool` reading a file) provides specific content (e.g., a transcript, web page text, file content):
|
| 115 |
+
a. **Thoroughly analyze THIS content first to find the answer.**
|
| 116 |
+
b. If the question asks for specific dialogue or phrases (e.g., "What does X say in response to Y?"), search for the quoted phrases (Y) in the content and identify the subsequent statement (X's response).
|
| 117 |
+
c. Only consider using other tools (like `web_search`) to re-verify or find the same piece of information if the direct output is clearly nonsensical, an obvious error, or explicitly states it's incomplete for the *specific question asked*. Do not use web_search just because the formatting of the direct output isn't perfect or seems slightly unclear at first glance if the data is present.
|
| 118 |
+
* **Critical Evaluation**: Pay attention to details like speaker identification (if inferable), names, dates, roles (e.g., distinguish between a nominator and a promoter), and exact phrasing.
|
| 119 |
+
* **Synthesize if Necessary**: Combine information from multiple *different* pieces of evidence if the question requires it.
|
| 120 |
+
* **Fact-Based Answers**: Base your final answer *only* on confirmed facts from the information gathered.
|
| 121 |
4. **Handling Tool Errors**:
|
| 122 |
+
* If a tool call itself returns an error: analyze the error message, try to correct the input to the tool (e.g., fix code, verify URL). Do not immediately retry the exact same call. Consider if a different tool or approach is more suitable.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
5. **Formulate Your Response**:
|
| 124 |
+
* Provide only the final, concise answer. Do not include reasoning, apologies, or conversational filler.
|
| 125 |
+
* If, after thorough investigation (including careful analysis of direct tool outputs and appropriate error handling), you cannot determine a definitive answer, respond with 'I don't know'.
|
|
|
|
| 126 |
|
| 127 |
--- Start of Question & File Information ---
|
| 128 |
{question_to_llm}
|
file_processing_tools.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from smolagents import tool
|
| 3 |
+
from typing import Union, Optional
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
@tool
def get_csv_data_summary(file_path: str) -> str:
    """
    Reads a CSV file from the given file path and returns a plain-text summary of its content.
    The summary includes the number of rows and columns, the column names, and basic descriptive
    statistics computed over every column.

    Args:
        file_path (str): The absolute local path to the CSV file.
            This path should be obtained from the 'File Information' section if the file was downloaded by the agent.

    Returns:
        str: A string containing the data summary (shape, columns, descriptive statistics) or an error message if processing fails.
    """
    # Failures are reported as "Error: ..." strings instead of being raised,
    # so the calling agent can read the message and adjust its next tool call.
    try:
        df = pd.read_csv(file_path)

        # Assemble the report as a list of lines and join once; the empty
        # entry yields the blank line that separates the header block from
        # the statistics table.
        report_lines = [
            f"Successfully read CSV file: '{file_path}'",
            f"Number of rows: {len(df)}",
            f"Number of columns: {len(df.columns)}",
            f"Column names: {', '.join(df.columns.astype(str))}",
            "",
            "Descriptive statistics:",
            # include='all' so non-numeric columns are described as well.
            df.describe(include='all').to_string(),
        ]
        return "\n".join(report_lines)
    except ImportError as import_error:
        # pandas itself is imported at module level, so reaching this branch
        # means read_csv needed an optional dependency that is missing.
        # Surface the real error instead of claiming pandas is absent.
        return (f"Error: A library required to read the CSV file '{file_path}' is not available: "
                f"{import_error}. Please ensure it is installed in the agent's environment.")
    except FileNotFoundError:
        return f"Error: The CSV file was not found at the specified path: '{file_path}'. Please verify the path."
    except pd.errors.EmptyDataError:
        return f"Error: The CSV file at '{file_path}' is empty."
    except Exception as e:
        # Last-resort boundary: any other failure is still returned as text.
        return f"Error processing CSV file '{file_path}': {type(e).__name__} - {str(e)}"
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
@tool
|
| 47 |
+
def get_excel_data_summary(file_path: str, sheet_name: Optional[str] = "0") -> str:
|
| 48 |
+
"""
|
| 49 |
+
Reads an Excel file (supports .xls and .xlsx) from the given file path and returns a summary of the specified sheet's content.
|
| 50 |
+
The summary includes the number of rows and columns, column names, and basic descriptive statistics for that sheet.
|
| 51 |
+
|
| 52 |
+
Args:
|
| 53 |
+
file_path (str): The absolute local path to the Excel file.
|
| 54 |
+
This path should be obtained from the 'File Information' section if the file was downloaded by the agent.
|
| 55 |
+
sheet_name (str | int | None, optional): The name of the sheet to read (e.g., "Sheet1") or its 0-indexed position (e.g., 0).
|
| 56 |
+
If None or 0, the first sheet is read. Defaults to 0 (the first sheet).
|
| 57 |
+
|
| 58 |
+
Returns:
|
| 59 |
+
str: A string containing the data summary from the specified sheet (shape, columns, descriptive statistics) or an error message.
|
| 60 |
+
"""
|
| 61 |
+
try:
|
| 62 |
+
# Determine engine based on file extension for clearer error messages if engine is missing
|
| 63 |
+
engine = None
|
| 64 |
+
if file_path.endswith('.xlsx'):
|
| 65 |
+
engine = 'openpyxl'
|
| 66 |
+
elif file_path.endswith('.xls'):
|
| 67 |
+
# or 'openpyxl' if xlrd is not available and openpyxl can handle it.
|
| 68 |
+
engine = 'xlrd'
|
| 69 |
+
|
| 70 |
+
actual_sheet_name_for_pandas: Union[str, int, None]
|
| 71 |
+
if sheet_name is None:
|
| 72 |
+
actual_sheet_name_for_pandas = 0 # Default to first sheet
|
| 73 |
+
elif sheet_name.isdigit():
|
| 74 |
+
actual_sheet_name_for_pandas = int(sheet_name)
|
| 75 |
+
else:
|
| 76 |
+
actual_sheet_name_for_pandas = sheet_name
|
| 77 |
+
|
| 78 |
+
df = pd.read_excel(
|
| 79 |
+
file_path, sheet_name=actual_sheet_name_for_pandas, engine=engine)
|
| 80 |
+
|
| 81 |
+
sheet_identifier = f"sheet '{sheet_name}'" if sheet_name is not None else "the first sheet"
|
| 82 |
+
summary = f"Successfully read {sheet_identifier} from Excel file: '{file_path}'\n"
|
| 83 |
+
summary += f"Number of rows: {len(df)}\n"
|
| 84 |
+
summary += f"Number of columns: {len(df.columns)}\n"
|
| 85 |
+
summary += f"Column names: {', '.join(df.columns.astype(str))}\n\n"
|
| 86 |
+
summary += "Descriptive statistics:\n"
|
| 87 |
+
summary += df.describe(include='all').to_string()
|
| 88 |
+
|
| 89 |
+
return summary
|
| 90 |
+
except ImportError:
|
| 91 |
+
return ("Error: The 'pandas' library and an Excel engine ('openpyxl' for .xlsx, 'xlrd' for .xls) "
|
| 92 |
+
"are required. Please ensure they are available in the agent's environment.")
|
| 93 |
+
except FileNotFoundError:
|
| 94 |
+
return f"Error: The Excel file was not found at the specified path: '{file_path}'. Please verify the path."
|
| 95 |
+
except pd.errors.EmptyDataError: # Though less common for Excel sheets than CSVs
|
| 96 |
+
return f"Error: The specified sheet in Excel file '{file_path}' is empty or could not be parsed as data."
|
| 97 |
+
except ValueError as ve: # Catches incorrect sheet names/indices from pandas
|
| 98 |
+
if "sheet_name" in str(ve).lower():
|
| 99 |
+
return f"Error: Sheet '{sheet_name}' not found in Excel file '{file_path}'. Please check the sheet name or index."
|
| 100 |
+
return f"Error processing Excel file '{file_path}': ValueError - {str(ve)}"
|
| 101 |
+
except Exception as e:
|
| 102 |
+
# Specific check for missing engines, as pandas might raise a general Exception or ValueError
|
| 103 |
+
err_str = str(e).lower()
|
| 104 |
+
if "openpyxl" in err_str and "install openpyxl" in err_str:
|
| 105 |
+
return f"Error: Missing 'openpyxl' engine for Excel file '{file_path}'. Please install it."
|
| 106 |
+
if "xlrd" in err_str and ("install xlrd" in err_str or "support for .xls files" in err_str):
|
| 107 |
+
return f"Error: Missing 'xlrd' engine for .xls Excel file '{file_path}'. Please install it or try 'openpyxl' if compatible."
|
| 108 |
+
return f"Error processing Excel file '{file_path}': {type(e).__name__} - {str(e)}"
|
requirements.txt
CHANGED
|
@@ -4,4 +4,7 @@ smolagents
|
|
| 4 |
litellm
|
| 5 |
duckduckgo-search
|
| 6 |
wikipedia-api
|
| 7 |
-
youtube-transcript-api
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
litellm
|
| 5 |
duckduckgo-search
|
| 6 |
wikipedia-api
|
| 7 |
+
youtube-transcript-api
|
| 8 |
+
pandas
|
| 9 |
+
openpyxl
|
| 10 |
+
markdownify
|