Smol2Operator / app.py
import spaces
import re
from typing import Tuple, Optional
import gradio as gr
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from smolvlm_inference import TransformersModel
from prompt import OS_SYSTEM_PROMPT
# --- Configuration ---
MODEL_ID = "smolagents/SmolVLM2-2.2B-Instruct-Agentic-GUI"
# --- Model Loading (load once at startup) ---
print(f"Loading model for {MODEL_ID}...")
model = TransformersModel(
model_id=MODEL_ID,
to_device="cuda:0",
)
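# Note: the wrapper above exposes `model.generate(messages, max_new_tokens=...)`,
# which is how the model is queried in navigate() below; `messages` is the chat-style
# list of system/user turns built by get_navigation_prompt().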
title = "Smol2Operator Demo"
description = """
This is a demo of the Smol2Operator model designed to interact with graphical user interfaces (GUIs) and perform actions within them.
This proof-of-concept (POC) version, described in [blogpost], showcases the model’s core capabilities.
This compact release is intentionally scoped to fundamental tasks, with complex workflows planned for future iterations. :hugging_face:
"""
SYSTEM_PROMPT: str = OS_SYSTEM_PROMPT
def get_navigation_prompt(task, image, step=1):
    """
    Build the chat-style prompt for the navigation task.
    - task: the task to complete
    - image: the current UI screenshot
    - step: the current step of the task (not used yet)
    """
system_prompt = SYSTEM_PROMPT
return [
{
"role": "system",
"content": [
{"type": "text", "text": system_prompt},
],
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"Please generate the next move according to the UI screenshot, instruction and previous actions.\n\nInstruction: {task}\n\nPrevious actions:\nNone"},
],
},
]
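# Illustrative usage (hypothetical task and screenshot, for reference only):
#   messages = get_navigation_prompt("Find the most trending model.", screenshot_pil)
#   # -> [{"role": "system", ...}, {"role": "user", "content": [<image>, <text>]}]
# The returned message list is passed unchanged to model.generate() in navigate() below.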
def array_to_image(image_array: np.ndarray) -> Image.Image:
if image_array is None:
raise ValueError("No image provided. Please upload an image before submitting.")
# Convert numpy array to PIL Image
img = Image.fromarray(np.uint8(image_array))
return img
def parse_actions_from_response(response: str) -> list[str]:
"""Parse actions from model response using regex pattern."""
pattern = r"<code>\n(.*?)\n</code>"
matches = re.findall(pattern, response, re.DOTALL)
return matches
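# Example, matching the regex above: a model response containing
#   <code>
#   click(x=0.52, y=0.31)
#   </code>
# yields ["click(x=0.52, y=0.31)"]; text outside <code>...</code> blocks is ignored.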
def extract_coordinates_from_action(action_code: str) -> list[dict]:
"""Extract coordinates from action code for localization actions."""
localization_actions = []
# Patterns for different action types
patterns = {
        # Negative lookbehind so plain click() is not also matched inside double_click()
        'click': r'(?<!_)click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
'double_click': r'double_click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
'move_mouse': r'move_mouse\((?:self,\s*)?(?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))\)',
'drag': r'drag\(\[([0-9.]+),\s*([0-9.]+)\],\s*\[([0-9.]+),\s*([0-9.]+)\]\)'
}
for action_type, pattern in patterns.items():
matches = re.finditer(pattern, action_code)
for match in matches:
if action_type == 'drag':
# Drag has from and to coordinates
from_x, from_y, to_x, to_y = match.groups()
localization_actions.append({
'type': 'drag_from',
'x': float(from_x),
'y': float(from_y),
'action': action_type
})
localization_actions.append({
'type': 'drag_to',
'x': float(to_x),
'y': float(to_y),
'action': action_type
})
else:
# Single coordinate actions
x_val = match.group(1)
y_val = match.group(2) if match.group(2) else x_val # Handle single coordinate case
if x_val and y_val:
localization_actions.append({
'type': action_type,
'x': float(x_val),
'y': float(y_val),
'action': action_type
})
return localization_actions
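# Examples, based on the patterns above:
#   extract_coordinates_from_action("click(x=0.52, y=0.31)")
#   # -> [{'type': 'click', 'x': 0.52, 'y': 0.31, 'action': 'click'}]
#   extract_coordinates_from_action("drag([0.2, 0.3], [0.6, 0.7])")
#   # -> a 'drag_from' entry at (0.2, 0.3) followed by a 'drag_to' entry at (0.6, 0.7)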
def create_localized_image(original_image: Image.Image, coordinates: list[dict]) -> Optional[Image.Image]:
"""Create an image with localization markers drawn on it."""
if not coordinates:
return None
# Create a copy of the original image
img_copy = original_image.copy()
draw = ImageDraw.Draw(img_copy)
# Get image dimensions
width, height = img_copy.size
    # Use Pillow's built-in default font for the labels
    font = ImageFont.load_default()
# Color scheme for different actions
colors = {
'click': 'red',
'double_click': 'blue',
'move_mouse': 'green',
'drag_from': 'orange',
'drag_to': 'purple'
}
for i, coord in enumerate(coordinates):
# Convert normalized coordinates to pixel coordinates
pixel_x = int(coord['x'] * width)
pixel_y = int(coord['y'] * height)
# Get color for this action type
color = colors.get(coord['type'], 'red')
# Draw a circle at the coordinate
circle_radius = 8
draw.ellipse([
pixel_x - circle_radius, pixel_y - circle_radius,
pixel_x + circle_radius, pixel_y + circle_radius
], fill=color, outline='white', width=2)
        # Add a text label next to the marker
        label = f"{coord['type']}({coord['x']:.2f},{coord['y']:.2f})"
        draw.text((pixel_x + 10, pixel_y - 10), label, fill=color, font=font)
# For drag actions, draw an arrow
if coord['type'] == 'drag_from' and i + 1 < len(coordinates) and coordinates[i + 1]['type'] == 'drag_to':
next_coord = coordinates[i + 1]
end_x = int(next_coord['x'] * width)
end_y = int(next_coord['y'] * height)
# Draw arrow line
draw.line([pixel_x, pixel_y, end_x, end_y], fill='orange', width=3)
# Draw arrowhead
arrow_size = 10
dx = end_x - pixel_x
dy = end_y - pixel_y
length = (dx**2 + dy**2)**0.5
if length > 0:
dx_norm = dx / length
dy_norm = dy / length
# Arrowhead points
arrow_x1 = end_x - arrow_size * dx_norm + arrow_size * dy_norm * 0.5
arrow_y1 = end_y - arrow_size * dy_norm - arrow_size * dx_norm * 0.5
arrow_x2 = end_x - arrow_size * dx_norm - arrow_size * dy_norm * 0.5
arrow_y2 = end_y - arrow_size * dy_norm + arrow_size * dx_norm * 0.5
draw.polygon([end_x, end_y, arrow_x1, arrow_y1, arrow_x2, arrow_y2], fill='orange')
return img_copy
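# Coordinates are expected to be normalized to [0, 1] and are scaled by the image size
# before drawing: with a 1280x800 screenshot, a click at (0.50, 0.25) is marked at
# pixel (640, 200), and drag_from/drag_to pairs are connected by an orange arrow.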
# --- Gradio processing function ---
@spaces.GPU
def navigate(input_numpy_image: np.ndarray, task: str) -> Tuple[str, Optional[Image.Image]]:
input_pil_image = array_to_image(input_numpy_image)
assert isinstance(input_pil_image, Image.Image)
prompt = get_navigation_prompt(task, input_pil_image)
if model is None:
raise ValueError("Model not loaded")
navigation_str = model.generate(prompt, max_new_tokens=500)
print(f"Navigation string: {navigation_str}")
navigation_str = navigation_str.strip()
# Parse actions from the response
actions = parse_actions_from_response(navigation_str)
# Extract coordinates from all actions
all_coordinates = []
for action_code in actions:
coordinates = extract_coordinates_from_action(action_code)
all_coordinates.extend(coordinates)
# Create localized image if there are coordinates
localized_image = None
if all_coordinates:
localized_image = create_localized_image(input_pil_image, all_coordinates)
print(f"Found {len(all_coordinates)} localization actions")
return navigation_str, localized_image
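# Illustrative flow (hypothetical model output, for reference only): for the task
# "Find the most trending model.", the model may answer with a <code> block such as
# click(x=0.48, y=0.22); navigate() then returns the raw response string together with
# a copy of the screenshot annotated at that location (or None if no coordinates were found).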
# --- Load Example Data ---
example_1_image = Image.open("./assets/google.png")
example_1_task = "Search for the name of the current UK Prime Minister."
example_2_image = Image.open("./assets/huggingface.png")
example_2_task = "Find the most trending model."
with gr.Blocks(theme=gr.themes.Soft()) as demo:
gr.Markdown(f"<h1 style='text-align: center;'>{title}</h1>")
# gr.Markdown(description)
with gr.Row():
input_image_component = gr.Image(label="UI Image", height=500)
with gr.Row():
with gr.Column():
            task_component = gr.Textbox(
                label="Task",
                placeholder="e.g., Search for the name of the current UK Prime Minister.",
                info="Type the task you want the model to complete.",
            )
submit_button = gr.Button("Call Agent", variant="primary")
with gr.Column():
output_coords_component = gr.Textbox(label="Agent Output", lines=10)
submit_button.click(navigate, [input_image_component, task_component], [output_coords_component, input_image_component])
gr.Examples(
examples=[[example_1_image, example_1_task], [example_2_image, example_2_task]],
inputs=[input_image_component, task_component],
outputs=[output_coords_component, input_image_component],
fn=navigate,
cache_examples=True,
)
demo.queue(api_open=False)
demo.launch(debug=True)
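# To run locally (assuming the Space's dependencies and the ./assets example images
# are available):
#   python app.py
# demo.launch(debug=True) blocks the main process and surfaces errors in the console.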