Smol2Operator / app.py
import spaces
import re
from typing import Tuple, Optional
import gradio as gr
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from smolvlm_inference import TransformersModel
from prompt import OS_SYSTEM_PROMPT
# --- Configuration ---
MODEL_ID = "smolagents/SmolVLM2-2.2B-Instruct-Agentic-GUI"
# --- Model Loading (load once at startup) ---
print(f"Loading model for {MODEL_ID}...")
model = TransformersModel(
model_id=MODEL_ID,
to_device="cuda:0",
)
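# Note: the wrapper above exposes `model.generate(messages, max_new_tokens=...)`,
# which is how the model is queried in navigate() below; `messages` is the chat-style
# list of system/user turns built by get_navigation_prompt().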
title = "Smol2Operator Demo"
description = """
This is a demo of the Smol2Operator model designed to interact with graphical user interfaces (GUIs) and perform actions within them.
This proof-of-concept (POC) version, described in [blogpost], showcases the model’s core capabilities.
This compact release is intentionally scoped to fundamental tasks, with complex workflows planned for future iterations. :hugging_face:
"""
SYSTEM_PROMPT: str = OS_SYSTEM_PROMPT
def get_navigation_prompt(task, image, step=1):
    """
    Build the chat-style prompt for the navigation task.
    - task: the task to complete
    - image: the current UI screenshot
    - step: the current step of the task (not used yet)
    """
system_prompt = SYSTEM_PROMPT
return [
{
"role": "system",
"content": [
{"type": "text", "text": system_prompt},
],
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"Please generate the next move according to the UI screenshot, instruction and previous actions.\n\nInstruction: {task}\n\nPrevious actions:\nNone"},
],
},
]
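# Illustrative usage (hypothetical task and screenshot, for reference only):
#   messages = get_navigation_prompt("Find the most trending model.", screenshot_pil)
#   # -> [{"role": "system", ...}, {"role": "user", "content": [<image>, <text>]}]
# The returned message list is passed unchanged to model.generate() in navigate() below.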
def array_to_image(image_array: np.ndarray) -> Image.Image:
if image_array is None:
raise ValueError("No image provided. Please upload an image before submitting.")
# Convert numpy array to PIL Image
img = Image.fromarray(np.uint8(image_array))
return img
def parse_actions_from_response(response: str) -> list[str]:
"""Parse actions from model response using regex pattern."""
pattern = r"<code>\n(.*?)\n</code>"
matches = re.findall(pattern, response, re.DOTALL)
return matches
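# Example, matching the regex above: a model response containing
#   <code>
#   click(x=0.52, y=0.31)
#   </code>
# yields ["click(x=0.52, y=0.31)"]; text outside <code>...</code> blocks is ignored.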
def extract_coordinates_from_action(action_code: str) -> list[dict]:
"""Extract coordinates from action code for localization actions."""
localization_actions = []
# Patterns for different action types
patterns = {
        # Negative lookbehind so plain click() is not also matched inside double_click()
        'click': r'(?<!_)click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
'double_click': r'double_click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
'move_mouse': r'move_mouse\((?:self,\s*)?(?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))\)',
'drag': r'drag\(\[([0-9.]+),\s*([0-9.]+)\],\s*\[([0-9.]+),\s*([0-9.]+)\]\)'
}
for action_type, pattern in patterns.items():
matches = re.finditer(pattern, action_code)
for match in matches:
if action_type == 'drag':
# Drag has from and to coordinates
from_x, from_y, to_x, to_y = match.groups()
localization_actions.append({
'type': 'drag_from',
'x': float(from_x),
'y': float(from_y),
'action': action_type
})
localization_actions.append({
'type': 'drag_to',
'x': float(to_x),
'y': float(to_y),
'action': action_type
})
else:
# Single coordinate actions
x_val = match.group(1)
y_val = match.group(2) if match.group(2) else x_val # Handle single coordinate case
if x_val and y_val:
localization_actions.append({
'type': action_type,
'x': float(x_val),
'y': float(y_val),
'action': action_type
})
return localization_actions
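# Examples, based on the patterns above:
#   extract_coordinates_from_action("click(x=0.52, y=0.31)")
#   # -> [{'type': 'click', 'x': 0.52, 'y': 0.31, 'action': 'click'}]
#   extract_coordinates_from_action("drag([0.2, 0.3], [0.6, 0.7])")
#   # -> a 'drag_from' entry at (0.2, 0.3) followed by a 'drag_to' entry at (0.6, 0.7)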
def create_localized_image(original_image: Image.Image, coordinates: list[dict]) -> Optional[Image.Image]:
"""Create an image with localization markers drawn on it."""
if not coordinates:
return None
# Create a copy of the original image
img_copy = original_image.copy()
draw = ImageDraw.Draw(img_copy)
# Get image dimensions
width, height = img_copy.size
    # Use Pillow's built-in default font for the labels
    font = ImageFont.load_default()
# Color scheme for different actions
colors = {
'click': 'red',
'double_click': 'blue',
'move_mouse': 'green',
'drag_from': 'orange',
'drag_to': 'purple'
}
for i, coord in enumerate(coordinates):
# Convert normalized coordinates to pixel coordinates
pixel_x = int(coord['x'] * width)
pixel_y = int(coord['y'] * height)
# Get color for this action type
color = colors.get(coord['type'], 'red')
# Draw a circle at the coordinate
circle_radius = 8
draw.ellipse([
pixel_x - circle_radius, pixel_y - circle_radius,
pixel_x + circle_radius, pixel_y + circle_radius
], fill=color, outline='white', width=2)
        # Add a text label next to the marker
        label = f"{coord['type']}({coord['x']:.2f},{coord['y']:.2f})"
        draw.text((pixel_x + 10, pixel_y - 10), label, fill=color, font=font)
# For drag actions, draw an arrow
if coord['type'] == 'drag_from' and i + 1 < len(coordinates) and coordinates[i + 1]['type'] == 'drag_to':
next_coord = coordinates[i + 1]
end_x = int(next_coord['x'] * width)
end_y = int(next_coord['y'] * height)
# Draw arrow line
draw.line([pixel_x, pixel_y, end_x, end_y], fill='orange', width=3)
# Draw arrowhead
arrow_size = 10
dx = end_x - pixel_x
dy = end_y - pixel_y
length = (dx**2 + dy**2)**0.5
if length > 0:
dx_norm = dx / length
dy_norm = dy / length
# Arrowhead points
arrow_x1 = end_x - arrow_size * dx_norm + arrow_size * dy_norm * 0.5
arrow_y1 = end_y - arrow_size * dy_norm - arrow_size * dx_norm * 0.5
arrow_x2 = end_x - arrow_size * dx_norm - arrow_size * dy_norm * 0.5
arrow_y2 = end_y - arrow_size * dy_norm + arrow_size * dx_norm * 0.5
draw.polygon([end_x, end_y, arrow_x1, arrow_y1, arrow_x2, arrow_y2], fill='orange')
return img_copy
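# Coordinates are expected to be normalized to [0, 1] and are scaled by the image size
# before drawing: with a 1280x800 screenshot, a click at (0.50, 0.25) is marked at
# pixel (640, 200), and drag_from/drag_to pairs are connected by an orange arrow.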
# --- Gradio processing function ---
@spaces.GPU
def navigate(input_numpy_image: np.ndarray, task: str) -> Tuple[str, Optional[Image.Image]]:
input_pil_image = array_to_image(input_numpy_image)
assert isinstance(input_pil_image, Image.Image)
prompt = get_navigation_prompt(task, input_pil_image)
if model is None:
raise ValueError("Model not loaded")
navigation_str = model.generate(prompt, max_new_tokens=500)
print(f"Navigation string: {navigation_str}")
navigation_str = navigation_str.strip()
# Parse actions from the response
actions = parse_actions_from_response(navigation_str)
# Extract coordinates from all actions
all_coordinates = []
for action_code in actions:
coordinates = extract_coordinates_from_action(action_code)
all_coordinates.extend(coordinates)
# Create localized image if there are coordinates
localized_image = None
if all_coordinates:
localized_image = create_localized_image(input_pil_image, all_coordinates)
print(f"Found {len(all_coordinates)} localization actions")
return navigation_str, localized_image
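# Illustrative flow (hypothetical model output, for reference only): for the task
# "Find the most trending model.", the model may answer with a <code> block such as
# click(x=0.48, y=0.22); navigate() then returns the raw response string together with
# a copy of the screenshot annotated at that location (or None if no coordinates were found).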
# --- Load Example Data ---
example_1_image = Image.open("./assets/google.png")
example_1_task = "Search for the name of the current UK Prime Minister."
example_2_image = Image.open("./assets/huggingface.png")
example_2_task = "Find the most trending model."
with gr.Blocks(theme=gr.themes.Soft()) as demo:
gr.Markdown(f"<h1 style='text-align: center;'>{title}</h1>")
# gr.Markdown(description)
with gr.Row():
input_image_component = gr.Image(label="UI Image", height=500)
with gr.Row():
with gr.Column():
            task_component = gr.Textbox(
                label="Task",
                placeholder="e.g., Search for the name of the current UK Prime Minister.",
                info="Type the task you want the model to complete.",
            )
submit_button = gr.Button("Call Agent", variant="primary")
with gr.Column():
output_coords_component = gr.Textbox(label="Agent Output", lines=10)
submit_button.click(navigate, [input_image_component, task_component], [output_coords_component, input_image_component])
gr.Examples(
examples=[[example_1_image, example_1_task], [example_2_image, example_2_task]],
inputs=[input_image_component, task_component],
outputs=[output_coords_component, input_image_component],
fn=navigate,
cache_examples=True,
)
demo.queue(api_open=False)
demo.launch(debug=True)
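# To run locally (assuming the Space's dependencies and the ./assets example images
# are available):
#   python app.py
# demo.launch(debug=True) blocks the main process and surfaces errors in the console.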