diff --git a/main_workarena_debug.py b/main_workarena_debug.py new file mode 100644 index 00000000..53077ac4 --- /dev/null +++ b/main_workarena_debug.py @@ -0,0 +1,77 @@ +""" +Note: This script is a convenience script to launch experiments instead of using +the command line. + +Copy this script and modify at will, but don't push your changes to the +repository. +""" + +import logging +from copy import deepcopy + +import bgym + +from agentlab.agents.tool_use_agent.tool_use_agent import ( + DEFAULT_PROMPT_CONFIG, + GPT_4_1, + ToolUseAgentArgs, +) +from agentlab.experiments.study import Study + +logging.getLogger().setLevel(logging.INFO) + +config = deepcopy(DEFAULT_PROMPT_CONFIG) +# config.keep_last_n_obs = 1 +config.obs.use_som = True + + +agent_configs = [ + ToolUseAgentArgs( + model_args=GPT_4_1, + config=config, + ), + # ToolUseAgentArgs( + # model_args=GPT_4_1, + # config=config, + # ), +] + +for agent_config in agent_configs: + agent_config.config.action_subsets = ("workarena",) # use the workarena action set + + +# ## select the benchmark to run on +# benchmark = "miniwob_tiny_test" +benchmark = "workarena_l1" + + +benchmark = bgym.DEFAULT_BENCHMARKS[benchmark](n_repeats=4) # type: bgym.Benchmark +benchmark = benchmark.subset_from_glob("task_name", "*create*") + +# for env_args in benchmark.env_args_list: +# print(env_args.task_name) +# env_args.max_steps = 15 + +relaunch = False + +## Number of parallel jobs +n_jobs = 10 # Make sure to use 1 job when debugging in VSCode +parallel_backend = "ray" +# parallel_backend = "sequential" # activate sequential backend for debugging in VSCode + +if __name__ == "__main__": # necessary for dask backend + + if relaunch: + # relaunch an existing study + study = Study.load_most_recent(contains=None) + study.find_incomplete(include_errors=True) + + else: + study = Study(agent_configs, benchmark, logging_level_stdout=logging.WARNING) + + study.run( + n_jobs=n_jobs, + parallel_backend=parallel_backend, # "ray", "joblib" or "sequential" + strict_reproducibility=False, + n_relaunch=3, + ) diff --git a/src/agentlab/agents/tool_use_agent/tool_use_agent.py b/src/agentlab/agents/tool_use_agent/tool_use_agent.py index 28b97e82..3d506bdb 100644 --- a/src/agentlab/agents/tool_use_agent/tool_use_agent.py +++ b/src/agentlab/agents/tool_use_agent/tool_use_agent.py @@ -127,8 +127,10 @@ class Goal(Block): goal_as_system_msg: bool = True - def apply(self, llm, discussion: StructuredDiscussion, obs: dict) -> dict: - system_message = llm.msg.system().add_text(SYS_MSG) + def apply( + self, llm, discussion: StructuredDiscussion, obs: dict, sys_msg: str = SYS_MSG + ) -> dict: + system_message = llm.msg.system().add_text(sys_msg) discussion.append(system_message) if self.goal_as_system_msg: @@ -441,7 +443,13 @@ def get_action(self, obs: Any) -> float: self.llm.reset_stats() if not self.discussion.is_goal_set(): self.discussion.new_group("goal") - self.config.goal.apply(self.llm, self.discussion, obs) + + if self.config.multiaction: + sys_msg = SYS_MSG + "\nYou can take multiple actions in a single step, if needed." + else: + sys_msg = SYS_MSG + "\nYou can only take one action at a time." + self.config.goal.apply(self.llm, self.discussion, obs, sys_msg) + self.config.summarizer.apply_init(self.llm, self.discussion) self.config.general_hints.apply(self.llm, self.discussion) self.task_hint.apply(self.llm, self.discussion, self.task_name) @@ -489,7 +497,7 @@ def get_action(self, obs: Any) -> float: return action, agent_info -OPENAI_MODEL_CONFIG = OpenAIResponseModelArgs( +GPT_4_1 = OpenAIResponseModelArgs( model_name="gpt-4.1", max_total_tokens=200_000, max_input_tokens=200_000, diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index ea5371b9..bd1c6ad4 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -1164,7 +1164,7 @@ def get_directory_contents(results_dir: Path): most_recent_summary = max(summary_files, key=os.path.getctime) summary_df = pd.read_csv(most_recent_summary) - if len(summary_df) == 0 or summary_df["avg_reward"].isna().all(): + if len(summary_df) == 0: continue # skip if all avg_reward are NaN # get row with max avg_reward diff --git a/src/agentlab/analyze/overlay_utils.py b/src/agentlab/analyze/overlay_utils.py index 4649a962..51ff61c3 100644 --- a/src/agentlab/analyze/overlay_utils.py +++ b/src/agentlab/analyze/overlay_utils.py @@ -1,9 +1,11 @@ import ast import inspect +import math from dataclasses import dataclass from typing import Any, Union import matplotlib.pyplot as plt +import PIL from browsergym.core.action.highlevel import ACTION_SUBSETS from PIL import Image, ImageDraw @@ -289,17 +291,54 @@ def overlay_rectangle( bbox: tuple[float, float, float, float], color: Union[str, tuple[int, int, int]] = "red", width: int = 1, + dashed: bool = True, ) -> Image.Image: draw = ImageDraw.Draw(img) x, y, w, h = bbox - # Draw rectangle outline - draw.rectangle([x, y, x + w, y + h], outline=color, width=width) + if dashed: + # Draw dashed rectangle + print("Drawing dashed rectangle") + linedashed(draw, x, y, x + w, y, color, width) + linedashed(draw, x + w, y, x + w, y + h, color, width) + linedashed(draw, x + w, y + h, x, y + h, color, width) + linedashed(draw, x, y + h, x, y, color, width) + else: + draw.rectangle([x, y, x + w, y + h], outline=color, width=width) return img +# Adapted from https://stackoverflow.com/questions/51908563/dotted-or-dashed-line-with-python-pillow/58885306#58885306 +def linedashed( + draw: PIL.ImageDraw.Draw, x0, y0, x1, y1, fill, width, dash_length=4, nodash_length=8 +): + line_dx = x1 - x0 # delta x (can be negative) + line_dy = y1 - y0 # delta y (can be negative) + line_length = math.hypot(line_dx, line_dy) # line length (positive) + if line_length == 0: + return # Avoid division by zero in case the line length is 0 + pixel_dx = line_dx / line_length # x add for 1px line length + pixel_dy = line_dy / line_length # y add for 1px line length + dash_start = 0 + while dash_start < line_length: + dash_end = dash_start + dash_length + if dash_end > line_length: + dash_end = line_length + draw.line( + ( + round(x0 + pixel_dx * dash_start), + round(y0 + pixel_dy * dash_start), + round(x0 + pixel_dx * dash_end), + round(y0 + pixel_dy * dash_end), + ), + fill=fill, + width=width, + ) + dash_start += dash_length + nodash_length + + def annotate_action( img: Image.Image, action_string: str, properties: dict[str, tuple], colormap: str = "tab10" ) -> str: diff --git a/src/agentlab/llm/response_api.py b/src/agentlab/llm/response_api.py index 06f346a2..8e6f3695 100644 --- a/src/agentlab/llm/response_api.py +++ b/src/agentlab/llm/response_api.py @@ -313,7 +313,6 @@ def __init__( **kwargs, ): self.tools = kwargs.pop("tools", None) - self.tool_choice = kwargs.pop("tool_choice", None) super().__init__( model_name=model_name, api_key=api_key, @@ -324,7 +323,9 @@ def __init__( ) self.client = OpenAI(api_key=api_key) - def _call_api(self, messages: list[Any | MessageBuilder], **kwargs) -> dict: + def _call_api( + self, messages: list[Any | MessageBuilder], tool_choice: str = "auto", **kwargs + ) -> dict: input = [] for msg in messages: input.extend(msg.prepare_message() if isinstance(msg, MessageBuilder) else [msg]) @@ -339,8 +340,10 @@ def _call_api(self, messages: list[Any | MessageBuilder], **kwargs) -> dict: if self.tools is not None: api_params["tools"] = self.tools - if self.tool_choice is not None: - api_params["tool_choice"] = self.tool_choice + if tool_choice in ("any", "required"): + tool_choice = "required" + + api_params["tool_choice"] = tool_choice # api_params |= kwargs # Merge any additional parameters passed response = call_openai_api_with_retries( @@ -388,7 +391,6 @@ def __init__( ): self.tools = self.format_tools_for_chat_completion(kwargs.pop("tools", None)) - self.tool_choice = kwargs.pop("tool_choice", None) super().__init__( model_name=model_name, @@ -403,7 +405,9 @@ def __init__( **client_args ) # Ensures client_args is a dict or defaults to an empty dict - def _call_api(self, messages: list[dict | MessageBuilder]) -> openai.types.chat.ChatCompletion: + def _call_api( + self, messages: list[dict | MessageBuilder], tool_choice: str = "auto" + ) -> openai.types.chat.ChatCompletion: input = [] for msg in messages: input.extend(msg.prepare_message() if isinstance(msg, MessageBuilder) else [msg]) @@ -416,8 +420,10 @@ def _call_api(self, messages: list[dict | MessageBuilder]) -> openai.types.chat. } if self.tools is not None: api_params["tools"] = self.tools - if self.tool_choice is not None: - api_params["tool_choice"] = self.tool_choice + + if tool_choice in ("any", "required"): + tool_choice = "required" + api_params["tool_choice"] = tool_choice response = call_openai_api_with_retries(self.client.chat.completions.create, api_params) @@ -517,7 +523,6 @@ def __init__( **kwargs, ): self.tools = kwargs.pop("tools", None) - self.tool_choice = kwargs.pop("tool_choice", None) super().__init__( model_name=model_name, @@ -543,6 +548,9 @@ def _call_api( temp = self.apply_cache_breakpoints(msg, temp) input.extend(temp) + if tool_choice in ("any", "required"): + tool_choice = "any" # Claude API expects "any" and gpt expects "required" + api_params: Dict[str, Any] = { "model": self.model_name, "messages": input,