From d7259d23233efeef900becccff78221ea6763e42 Mon Sep 17 00:00:00 2001 From: recursix Date: Thu, 10 Jul 2025 13:47:17 -0400 Subject: [PATCH 1/9] script to debug workarena --- main_workarena_debug.py | 69 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 main_workarena_debug.py diff --git a/main_workarena_debug.py b/main_workarena_debug.py new file mode 100644 index 00000000..c86dd8cb --- /dev/null +++ b/main_workarena_debug.py @@ -0,0 +1,69 @@ +""" +Note: This script is a convenience script to launch experiments instead of using +the command line. + +Copy this script and modify at will, but don't push your changes to the +repository. +""" + +import logging + +import bgym +from bgym import DEFAULT_BENCHMARKS + +from agentlab.agents.tool_use_agent.tool_use_agent import ( + DEFAULT_PROMPT_CONFIG, + GPT_4_1_MINI, + OPENAI_MODEL_CONFIG, + ToolUseAgentArgs, +) +from agentlab.experiments.study import Study + +logging.getLogger().setLevel(logging.INFO) + +agent_config = ToolUseAgentArgs( + model_args=OPENAI_MODEL_CONFIG, + config=GPT_4_1_MINI, +) + + +agent_config.config.action_subsets = ("workarena",) # use the workarena action set + +agent_args = [agent_config] + + +# ## select the benchmark to run on +# benchmark = "miniwob_tiny_test" +benchmark = "workarena_l1" + + +benchmark = DEFAULT_BENCHMARKS[benchmark](n_seeds=1) # type: bgym.Benchmark +benchmark = benchmark.subset_from_glob("task_name", "*create*") + +# for env_args in benchmark.env_args_list: +# print(env_args.task_name) +# env_args.max_steps = 15 + +relaunch = False + +## Number of parallel jobs +n_jobs = 10 # Make sure to use 1 job when debugging in VSCode +parallel_backend = "ray" +parallel_backend = "sequential" + +if __name__ == "__main__": # necessary for dask backend + + if relaunch: + # relaunch an existing study + study = Study.load_most_recent(contains=None) + study.find_incomplete(include_errors=True) + + else: + study = Study(agent_args, benchmark, logging_level_stdout=logging.WARNING) + + study.run( + n_jobs=n_jobs, + parallel_backend=parallel_backend, # "ray", "joblib" or "sequential" + strict_reproducibility=False, + n_relaunch=3, + ) From 64f34f3e52fddb197b82dd707d8952cd77326a9a Mon Sep 17 00:00:00 2001 From: recursix Date: Thu, 10 Jul 2025 13:48:02 -0400 Subject: [PATCH 2/9] Remove commented sequential backend option for clarity in parallel job configuration --- main_workarena_debug.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main_workarena_debug.py b/main_workarena_debug.py index c86dd8cb..036a2ad4 100644 --- a/main_workarena_debug.py +++ b/main_workarena_debug.py @@ -49,7 +49,7 @@ ## Number of parallel jobs n_jobs = 10 # Make sure to use 1 job when debugging in VSCode parallel_backend = "ray" -parallel_backend = "sequential" +# parallel_backend = "sequential" # activate sequential backend for debugging in VSCode if __name__ == "__main__": # necessary for dask backend From bacade9f61c56854a639e91b429f7a7f4270f562 Mon Sep 17 00:00:00 2001 From: recursix Date: Thu, 10 Jul 2025 13:50:39 -0400 Subject: [PATCH 3/9] some cleanup --- main_workarena_debug.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/main_workarena_debug.py b/main_workarena_debug.py index 036a2ad4..2042bbd8 100644 --- a/main_workarena_debug.py +++ b/main_workarena_debug.py @@ -9,7 +9,6 @@ import logging import bgym -from bgym import DEFAULT_BENCHMARKS from agentlab.agents.tool_use_agent.tool_use_agent import ( DEFAULT_PROMPT_CONFIG, @@ -22,8 +21,8 @@ 
logging.getLogger().setLevel(logging.INFO) agent_config = ToolUseAgentArgs( - model_args=OPENAI_MODEL_CONFIG, - config=GPT_4_1_MINI, + model_args=GPT_4_1_MINI, + config=DEFAULT_PROMPT_CONFIG, ) @@ -37,7 +36,7 @@ benchmark = "workarena_l1" -benchmark = DEFAULT_BENCHMARKS[benchmark](n_seeds=1) # type: bgym.Benchmark +benchmark = bgym.DEFAULT_BENCHMARKS[benchmark]() # type: bgym.Benchmark benchmark = benchmark.subset_from_glob("task_name", "*create*") # for env_args in benchmark.env_args_list: From 16b525115727d78b8d56c2124a54b44bc689a52b Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 11 Jul 2025 13:52:16 -0400 Subject: [PATCH 4/9] multi_action_hint --- .../agents/tool_use_agent/tool_use_agent.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/agentlab/agents/tool_use_agent/tool_use_agent.py b/src/agentlab/agents/tool_use_agent/tool_use_agent.py index 28b97e82..3d506bdb 100644 --- a/src/agentlab/agents/tool_use_agent/tool_use_agent.py +++ b/src/agentlab/agents/tool_use_agent/tool_use_agent.py @@ -127,8 +127,10 @@ class Goal(Block): goal_as_system_msg: bool = True - def apply(self, llm, discussion: StructuredDiscussion, obs: dict) -> dict: - system_message = llm.msg.system().add_text(SYS_MSG) + def apply( + self, llm, discussion: StructuredDiscussion, obs: dict, sys_msg: str = SYS_MSG + ) -> dict: + system_message = llm.msg.system().add_text(sys_msg) discussion.append(system_message) if self.goal_as_system_msg: @@ -441,7 +443,13 @@ def get_action(self, obs: Any) -> float: self.llm.reset_stats() if not self.discussion.is_goal_set(): self.discussion.new_group("goal") - self.config.goal.apply(self.llm, self.discussion, obs) + + if self.config.multiaction: + sys_msg = SYS_MSG + "\nYou can take multiple actions in a single step, if needed." + else: + sys_msg = SYS_MSG + "\nYou can only take one action at a time." 
+ self.config.goal.apply(self.llm, self.discussion, obs, sys_msg) + self.config.summarizer.apply_init(self.llm, self.discussion) self.config.general_hints.apply(self.llm, self.discussion) self.task_hint.apply(self.llm, self.discussion, self.task_name) @@ -489,7 +497,7 @@ def get_action(self, obs: Any) -> float: return action, agent_info -OPENAI_MODEL_CONFIG = OpenAIResponseModelArgs( +GPT_4_1 = OpenAIResponseModelArgs( model_name="gpt-4.1", max_total_tokens=200_000, max_input_tokens=200_000, From 0fc17328c48f9d6a276914ce612e87d9dd9332ce Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 11 Jul 2025 13:53:19 -0400 Subject: [PATCH 5/9] adding dash_line to action overlay --- src/agentlab/analyze/overlay_utils.py | 43 +++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/src/agentlab/analyze/overlay_utils.py b/src/agentlab/analyze/overlay_utils.py index 4649a962..51ff61c3 100644 --- a/src/agentlab/analyze/overlay_utils.py +++ b/src/agentlab/analyze/overlay_utils.py @@ -1,9 +1,11 @@ import ast import inspect +import math from dataclasses import dataclass from typing import Any, Union import matplotlib.pyplot as plt +import PIL from browsergym.core.action.highlevel import ACTION_SUBSETS from PIL import Image, ImageDraw @@ -289,17 +291,54 @@ def overlay_rectangle( bbox: tuple[float, float, float, float], color: Union[str, tuple[int, int, int]] = "red", width: int = 1, + dashed: bool = True, ) -> Image.Image: draw = ImageDraw.Draw(img) x, y, w, h = bbox - # Draw rectangle outline - draw.rectangle([x, y, x + w, y + h], outline=color, width=width) + if dashed: + # Draw dashed rectangle + print("Drawing dashed rectangle") + linedashed(draw, x, y, x + w, y, color, width) + linedashed(draw, x + w, y, x + w, y + h, color, width) + linedashed(draw, x + w, y + h, x, y + h, color, width) + linedashed(draw, x, y + h, x, y, color, width) + else: + draw.rectangle([x, y, x + w, y + h], outline=color, width=width) return img +# Adapted from https://stackoverflow.com/questions/51908563/dotted-or-dashed-line-with-python-pillow/58885306#58885306 +def linedashed( + draw: PIL.ImageDraw.Draw, x0, y0, x1, y1, fill, width, dash_length=4, nodash_length=8 +): + line_dx = x1 - x0 # delta x (can be negative) + line_dy = y1 - y0 # delta y (can be negative) + line_length = math.hypot(line_dx, line_dy) # line length (positive) + if line_length == 0: + return # Avoid division by zero in case the line length is 0 + pixel_dx = line_dx / line_length # x add for 1px line length + pixel_dy = line_dy / line_length # y add for 1px line length + dash_start = 0 + while dash_start < line_length: + dash_end = dash_start + dash_length + if dash_end > line_length: + dash_end = line_length + draw.line( + ( + round(x0 + pixel_dx * dash_start), + round(y0 + pixel_dy * dash_start), + round(x0 + pixel_dx * dash_end), + round(y0 + pixel_dy * dash_end), + ), + fill=fill, + width=width, + ) + dash_start += dash_length + nodash_length + + def annotate_action( img: Image.Image, action_string: str, properties: dict[str, tuple], colormap: str = "tab10" ) -> str: From 1f76e35b8ff6796906a22af23bfdd3738804bcc9 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 11 Jul 2025 13:54:45 -0400 Subject: [PATCH 6/9] minor convenient updates --- _sandbox | 35 +++++++++++++++++++++++++++++++++++ main_workarena_debug.py | 29 +++++++++++++++++++---------- 2 files changed, 54 insertions(+), 10 deletions(-) create mode 100644 _sandbox diff --git a/_sandbox b/_sandbox new file mode 100644 index 00000000..6036d436 --- /dev/null +++ 
b/_sandbox @@ -0,0 +1,35 @@ +# import gradio as gr +# import pandas as pd + +# df = pd.DataFrame({"A": [14, 4, 5, 4, 1], "B": [5, 2, 54, 3, 2], "C": [20, 20, 7, 3, 8]}) + + +# # Highlight entire rows based on conditions +# def highlight_rows(row): +# if row["A"] > 4: +# return ["background-color: darkblue"] * len(row) +# else: +# return [""] * len(row) + + +# styler = df.style.apply(highlight_rows, axis=1) + +# with gr.Blocks() as demo: +# gr.Dataframe(styler) +# demo.launch() + + +def format_function_call(func_name, kwargs=None): + """Format a function name and kwargs dict into a Python function call string.""" + if kwargs is None: + kwargs = {} + + if not kwargs: + return f"{func_name}()" + + args_str = ", ".join(f"{key}={repr(value)}" for key, value in kwargs.items()) + return f"{func_name}({args_str})" + + +print(format_function_call("my_function", {"arg1": 42, "arg2": """Marc's 17" laptop"""})) +print(format_function_call("my_function", {"arg1": 42, "arg2": "17' pole"})) diff --git a/main_workarena_debug.py b/main_workarena_debug.py index 2042bbd8..53077ac4 100644 --- a/main_workarena_debug.py +++ b/main_workarena_debug.py @@ -7,28 +7,37 @@ """ import logging +from copy import deepcopy import bgym from agentlab.agents.tool_use_agent.tool_use_agent import ( DEFAULT_PROMPT_CONFIG, - GPT_4_1_MINI, - OPENAI_MODEL_CONFIG, + GPT_4_1, ToolUseAgentArgs, ) from agentlab.experiments.study import Study logging.getLogger().setLevel(logging.INFO) -agent_config = ToolUseAgentArgs( - model_args=GPT_4_1_MINI, - config=DEFAULT_PROMPT_CONFIG, -) +config = deepcopy(DEFAULT_PROMPT_CONFIG) +# config.keep_last_n_obs = 1 +config.obs.use_som = True -agent_config.config.action_subsets = ("workarena",) # use the workarena action set +agent_configs = [ + ToolUseAgentArgs( + model_args=GPT_4_1, + config=config, + ), + # ToolUseAgentArgs( + # model_args=GPT_4_1, + # config=config, + # ), +] -agent_args = [agent_config] +for agent_config in agent_configs: + agent_config.config.action_subsets = ("workarena",) # use the workarena action set # ## select the benchmark to run on @@ -36,7 +45,7 @@ benchmark = "workarena_l1" -benchmark = bgym.DEFAULT_BENCHMARKS[benchmark]() # type: bgym.Benchmark +benchmark = bgym.DEFAULT_BENCHMARKS[benchmark](n_repeats=4) # type: bgym.Benchmark benchmark = benchmark.subset_from_glob("task_name", "*create*") # for env_args in benchmark.env_args_list: @@ -58,7 +67,7 @@ study.find_incomplete(include_errors=True) else: - study = Study(agent_args, benchmark, logging_level_stdout=logging.WARNING) + study = Study(agent_configs, benchmark, logging_level_stdout=logging.WARNING) study.run( n_jobs=n_jobs, From c4d3c4a3e99511c7d70ece9193a776321f9f17e4 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 11 Jul 2025 16:40:05 -0400 Subject: [PATCH 7/9] Fix openai returning no tool calls Refactor OpenAIResponseModel and related classes to remove tool_choice attribute and streamline API calls --- src/agentlab/llm/response_api.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/src/agentlab/llm/response_api.py b/src/agentlab/llm/response_api.py index 06f346a2..8e6f3695 100644 --- a/src/agentlab/llm/response_api.py +++ b/src/agentlab/llm/response_api.py @@ -313,7 +313,6 @@ def __init__( **kwargs, ): self.tools = kwargs.pop("tools", None) - self.tool_choice = kwargs.pop("tool_choice", None) super().__init__( model_name=model_name, api_key=api_key, @@ -324,7 +323,9 @@ def __init__( ) self.client = OpenAI(api_key=api_key) - def _call_api(self, messages: list[Any | 
MessageBuilder], **kwargs) -> dict: + def _call_api( + self, messages: list[Any | MessageBuilder], tool_choice: str = "auto", **kwargs + ) -> dict: input = [] for msg in messages: input.extend(msg.prepare_message() if isinstance(msg, MessageBuilder) else [msg]) @@ -339,8 +340,10 @@ def _call_api(self, messages: list[Any | MessageBuilder], **kwargs) -> dict: if self.tools is not None: api_params["tools"] = self.tools - if self.tool_choice is not None: - api_params["tool_choice"] = self.tool_choice + if tool_choice in ("any", "required"): + tool_choice = "required" + + api_params["tool_choice"] = tool_choice # api_params |= kwargs # Merge any additional parameters passed response = call_openai_api_with_retries( @@ -388,7 +391,6 @@ def __init__( ): self.tools = self.format_tools_for_chat_completion(kwargs.pop("tools", None)) - self.tool_choice = kwargs.pop("tool_choice", None) super().__init__( model_name=model_name, @@ -403,7 +405,9 @@ def __init__( **client_args ) # Ensures client_args is a dict or defaults to an empty dict - def _call_api(self, messages: list[dict | MessageBuilder]) -> openai.types.chat.ChatCompletion: + def _call_api( + self, messages: list[dict | MessageBuilder], tool_choice: str = "auto" + ) -> openai.types.chat.ChatCompletion: input = [] for msg in messages: input.extend(msg.prepare_message() if isinstance(msg, MessageBuilder) else [msg]) @@ -416,8 +420,10 @@ def _call_api(self, messages: list[dict | MessageBuilder]) -> openai.types.chat. } if self.tools is not None: api_params["tools"] = self.tools - if self.tool_choice is not None: - api_params["tool_choice"] = self.tool_choice + + if tool_choice in ("any", "required"): + tool_choice = "required" + api_params["tool_choice"] = tool_choice response = call_openai_api_with_retries(self.client.chat.completions.create, api_params) @@ -517,7 +523,6 @@ def __init__( **kwargs, ): self.tools = kwargs.pop("tools", None) - self.tool_choice = kwargs.pop("tool_choice", None) super().__init__( model_name=model_name, @@ -543,6 +548,9 @@ def _call_api( temp = self.apply_cache_breakpoints(msg, temp) input.extend(temp) + if tool_choice in ("any", "required"): + tool_choice = "any" # Claude API expects "any" and gpt expects "required" + api_params: Dict[str, Any] = { "model": self.model_name, "messages": input, From 73c64ce351880b6710cb9805bc353c814f1afff8 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 11 Jul 2025 16:40:55 -0400 Subject: [PATCH 8/9] minor for loading weird studies --- src/agentlab/analyze/agent_xray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index ea5371b9..bd1c6ad4 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -1164,7 +1164,7 @@ def get_directory_contents(results_dir: Path): most_recent_summary = max(summary_files, key=os.path.getctime) summary_df = pd.read_csv(most_recent_summary) - if len(summary_df) == 0 or summary_df["avg_reward"].isna().all(): + if len(summary_df) == 0: continue # skip if all avg_reward are NaN # get row with max avg_reward From 6a4c808b164fcc839e918d7fa2ac5530c1bc90de Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 11 Jul 2025 16:43:09 -0400 Subject: [PATCH 9/9] removing something that should not be ther --- _sandbox | 35 ----------------------------------- 1 file changed, 35 deletions(-) delete mode 100644 _sandbox diff --git a/_sandbox b/_sandbox deleted file mode 100644 index 6036d436..00000000 --- a/_sandbox +++ /dev/null @@ -1,35 +0,0 @@ -# 
import gradio as gr -# import pandas as pd - -# df = pd.DataFrame({"A": [14, 4, 5, 4, 1], "B": [5, 2, 54, 3, 2], "C": [20, 20, 7, 3, 8]}) - - -# # Highlight entire rows based on conditions -# def highlight_rows(row): -# if row["A"] > 4: -# return ["background-color: darkblue"] * len(row) -# else: -# return [""] * len(row) - - -# styler = df.style.apply(highlight_rows, axis=1) - -# with gr.Blocks() as demo: -# gr.Dataframe(styler) -# demo.launch() - - -def format_function_call(func_name, kwargs=None): - """Format a function name and kwargs dict into a Python function call string.""" - if kwargs is None: - kwargs = {} - - if not kwargs: - return f"{func_name}()" - - args_str = ", ".join(f"{key}={repr(value)}" for key, value in kwargs.items()) - return f"{func_name}({args_str})" - - -print(format_function_call("my_function", {"arg1": 42, "arg2": """Marc's 17" laptop"""})) -print(format_function_call("my_function", {"arg1": 42, "arg2": "17' pole"}))
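
A minimal usage sketch (not part of the patches above) for the dashed overlay introduced in PATCH 5/9. It assumes an agentlab checkout with that patch applied so that agentlab.analyze.overlay_utils is importable; the screenshot path and bounding box below are placeholders for illustration only.

from PIL import Image

from agentlab.analyze.overlay_utils import overlay_rectangle

# Load a screenshot and draw a dashed red outline around a hypothetical
# element bounding box given as (x, y, width, height).
img = Image.open("screenshot.png")  # placeholder path
img = overlay_rectangle(img, bbox=(100, 200, 150, 40), color="red", width=2, dashed=True)
img.save("screenshot_annotated.png")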
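
A standalone sketch of the tool_choice handling that PATCH 7/9 applies inside each _call_api: callers may pass "any" or "required" interchangeably, and the patch normalizes to "required" on the OpenAI paths and to "any" on the Claude path. The helper name and provider argument below are illustrative assumptions, not code from the patch.

def normalize_tool_choice(tool_choice: str, provider: str) -> str:
    """Map the interchangeable 'any'/'required' values to what each backend expects."""
    if tool_choice in ("any", "required"):
        # Per the patch comment: the Claude API expects "any", the OpenAI APIs expect "required".
        return "any" if provider == "anthropic" else "required"
    # Other values such as "auto" pass through unchanged.
    return tool_choice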