From d7259d23233efeef900becccff78221ea6763e42 Mon Sep 17 00:00:00 2001 From: recursix Date: Thu, 10 Jul 2025 13:47:17 -0400 Subject: [PATCH 1/9] script to debug workarena --- main_workarena_debug.py | 69 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 main_workarena_debug.py diff --git a/main_workarena_debug.py b/main_workarena_debug.py new file mode 100644 index 00000000..c86dd8cb --- /dev/null +++ b/main_workarena_debug.py @@ -0,0 +1,69 @@ +""" +Note: This script is a convenience script to launch experiments instead of using +the command line. + +Copy this script and modify at will, but don't push your changes to the +repository. +""" + +import logging + +import bgym +from bgym import DEFAULT_BENCHMARKS + +from agentlab.agents.tool_use_agent.tool_use_agent import ( + DEFAULT_PROMPT_CONFIG, + GPT_4_1_MINI, + OPENAI_MODEL_CONFIG, + ToolUseAgentArgs, +) +from agentlab.experiments.study import Study + +logging.getLogger().setLevel(logging.INFO) + +agent_config = ToolUseAgentArgs( + model_args=OPENAI_MODEL_CONFIG, + config=GPT_4_1_MINI, +) + + +agent_config.config.action_subsets = ("workarena",) # use the workarena action set + +agent_args = [agent_config] + + +# ## select the benchmark to run on +# benchmark = "miniwob_tiny_test" +benchmark = "workarena_l1" + + +benchmark = DEFAULT_BENCHMARKS[benchmark](n_seeds=1) # type: bgym.Benchmark +benchmark = benchmark.subset_from_glob("task_name", "*create*") + +# for env_args in benchmark.env_args_list: +# print(env_args.task_name) +# env_args.max_steps = 15 + +relaunch = False + +## Number of parallel jobs +n_jobs = 10 # Make sure to use 1 job when debugging in VSCode +parallel_backend = "ray" +parallel_backend = "sequential" + +if __name__ == "__main__": # necessary for dask backend + + if relaunch: + # relaunch an existing study + study = Study.load_most_recent(contains=None) + study.find_incomplete(include_errors=True) + + else: + study = Study(agent_args, benchmark, logging_level_stdout=logging.WARNING) + + study.run( + n_jobs=n_jobs, + parallel_backend=parallel_backend, # "ray", "joblib" or "sequential" + strict_reproducibility=False, + n_relaunch=3, + ) From 64f34f3e52fddb197b82dd707d8952cd77326a9a Mon Sep 17 00:00:00 2001 From: recursix Date: Thu, 10 Jul 2025 13:48:02 -0400 Subject: [PATCH 2/9] Remove commented sequential backend option for clarity in parallel job configuration --- main_workarena_debug.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main_workarena_debug.py b/main_workarena_debug.py index c86dd8cb..036a2ad4 100644 --- a/main_workarena_debug.py +++ b/main_workarena_debug.py @@ -49,7 +49,7 @@ ## Number of parallel jobs n_jobs = 10 # Make sure to use 1 job when debugging in VSCode parallel_backend = "ray" -parallel_backend = "sequential" +# parallel_backend = "sequential" # activate sequential backend for debugging in VSCode if __name__ == "__main__": # necessary for dask backend From bacade9f61c56854a639e91b429f7a7f4270f562 Mon Sep 17 00:00:00 2001 From: recursix Date: Thu, 10 Jul 2025 13:50:39 -0400 Subject: [PATCH 3/9] some cleanup --- main_workarena_debug.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/main_workarena_debug.py b/main_workarena_debug.py index 036a2ad4..2042bbd8 100644 --- a/main_workarena_debug.py +++ b/main_workarena_debug.py @@ -9,7 +9,6 @@ import logging import bgym -from bgym import DEFAULT_BENCHMARKS from agentlab.agents.tool_use_agent.tool_use_agent import ( DEFAULT_PROMPT_CONFIG, @@ -22,8 +21,8 @@ 
logging.getLogger().setLevel(logging.INFO) agent_config = ToolUseAgentArgs( - model_args=OPENAI_MODEL_CONFIG, - config=GPT_4_1_MINI, + model_args=GPT_4_1_MINI, + config=DEFAULT_PROMPT_CONFIG, ) @@ -37,7 +36,7 @@ benchmark = "workarena_l1" -benchmark = DEFAULT_BENCHMARKS[benchmark](n_seeds=1) # type: bgym.Benchmark +benchmark = bgym.DEFAULT_BENCHMARKS[benchmark]() # type: bgym.Benchmark benchmark = benchmark.subset_from_glob("task_name", "*create*") # for env_args in benchmark.env_args_list: From 16b525115727d78b8d56c2124a54b44bc689a52b Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 11 Jul 2025 13:52:16 -0400 Subject: [PATCH 4/9] multi_action_hint --- .../agents/tool_use_agent/tool_use_agent.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/agentlab/agents/tool_use_agent/tool_use_agent.py b/src/agentlab/agents/tool_use_agent/tool_use_agent.py index 28b97e82..3d506bdb 100644 --- a/src/agentlab/agents/tool_use_agent/tool_use_agent.py +++ b/src/agentlab/agents/tool_use_agent/tool_use_agent.py @@ -127,8 +127,10 @@ class Goal(Block): goal_as_system_msg: bool = True - def apply(self, llm, discussion: StructuredDiscussion, obs: dict) -> dict: - system_message = llm.msg.system().add_text(SYS_MSG) + def apply( + self, llm, discussion: StructuredDiscussion, obs: dict, sys_msg: str = SYS_MSG + ) -> dict: + system_message = llm.msg.system().add_text(sys_msg) discussion.append(system_message) if self.goal_as_system_msg: @@ -441,7 +443,13 @@ def get_action(self, obs: Any) -> float: self.llm.reset_stats() if not self.discussion.is_goal_set(): self.discussion.new_group("goal") - self.config.goal.apply(self.llm, self.discussion, obs) + + if self.config.multiaction: + sys_msg = SYS_MSG + "\nYou can take multiple actions in a single step, if needed." + else: + sys_msg = SYS_MSG + "\nYou can only take one action at a time." 
+ self.config.goal.apply(self.llm, self.discussion, obs, sys_msg) + self.config.summarizer.apply_init(self.llm, self.discussion) self.config.general_hints.apply(self.llm, self.discussion) self.task_hint.apply(self.llm, self.discussion, self.task_name) @@ -489,7 +497,7 @@ def get_action(self, obs: Any) -> float: return action, agent_info -OPENAI_MODEL_CONFIG = OpenAIResponseModelArgs( +GPT_4_1 = OpenAIResponseModelArgs( model_name="gpt-4.1", max_total_tokens=200_000, max_input_tokens=200_000, From 0fc17328c48f9d6a276914ce612e87d9dd9332ce Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 11 Jul 2025 13:53:19 -0400 Subject: [PATCH 5/9] adding dash_line to action overlay --- src/agentlab/analyze/overlay_utils.py | 43 +++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/src/agentlab/analyze/overlay_utils.py b/src/agentlab/analyze/overlay_utils.py index 4649a962..51ff61c3 100644 --- a/src/agentlab/analyze/overlay_utils.py +++ b/src/agentlab/analyze/overlay_utils.py @@ -1,9 +1,11 @@ import ast import inspect +import math from dataclasses import dataclass from typing import Any, Union import matplotlib.pyplot as plt +import PIL from browsergym.core.action.highlevel import ACTION_SUBSETS from PIL import Image, ImageDraw @@ -289,17 +291,54 @@ def overlay_rectangle( bbox: tuple[float, float, float, float], color: Union[str, tuple[int, int, int]] = "red", width: int = 1, + dashed: bool = True, ) -> Image.Image: draw = ImageDraw.Draw(img) x, y, w, h = bbox - # Draw rectangle outline - draw.rectangle([x, y, x + w, y + h], outline=color, width=width) + if dashed: + # Draw dashed rectangle + print("Drawing dashed rectangle") + linedashed(draw, x, y, x + w, y, color, width) + linedashed(draw, x + w, y, x + w, y + h, color, width) + linedashed(draw, x + w, y + h, x, y + h, color, width) + linedashed(draw, x, y + h, x, y, color, width) + else: + draw.rectangle([x, y, x + w, y + h], outline=color, width=width) return img +# Adapted from https://stackoverflow.com/questions/51908563/dotted-or-dashed-line-with-python-pillow/58885306#58885306 +def linedashed( + draw: PIL.ImageDraw.Draw, x0, y0, x1, y1, fill, width, dash_length=4, nodash_length=8 +): + line_dx = x1 - x0 # delta x (can be negative) + line_dy = y1 - y0 # delta y (can be negative) + line_length = math.hypot(line_dx, line_dy) # line length (positive) + if line_length == 0: + return # Avoid division by zero in case the line length is 0 + pixel_dx = line_dx / line_length # x add for 1px line length + pixel_dy = line_dy / line_length # y add for 1px line length + dash_start = 0 + while dash_start < line_length: + dash_end = dash_start + dash_length + if dash_end > line_length: + dash_end = line_length + draw.line( + ( + round(x0 + pixel_dx * dash_start), + round(y0 + pixel_dy * dash_start), + round(x0 + pixel_dx * dash_end), + round(y0 + pixel_dy * dash_end), + ), + fill=fill, + width=width, + ) + dash_start += dash_length + nodash_length + + def annotate_action( img: Image.Image, action_string: str, properties: dict[str, tuple], colormap: str = "tab10" ) -> str: From 1f76e35b8ff6796906a22af23bfdd3738804bcc9 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 11 Jul 2025 13:54:45 -0400 Subject: [PATCH 6/9] minor convenient updates --- _sandbox | 35 +++++++++++++++++++++++++++++++++++ main_workarena_debug.py | 29 +++++++++++++++++++---------- 2 files changed, 54 insertions(+), 10 deletions(-) create mode 100644 _sandbox diff --git a/_sandbox b/_sandbox new file mode 100644 index 00000000..6036d436 --- /dev/null +++ 
b/_sandbox @@ -0,0 +1,35 @@ +# import gradio as gr +# import pandas as pd + +# df = pd.DataFrame({"A": [14, 4, 5, 4, 1], "B": [5, 2, 54, 3, 2], "C": [20, 20, 7, 3, 8]}) + + +# # Highlight entire rows based on conditions +# def highlight_rows(row): +# if row["A"] > 4: +# return ["background-color: darkblue"] * len(row) +# else: +# return [""] * len(row) + + +# styler = df.style.apply(highlight_rows, axis=1) + +# with gr.Blocks() as demo: +# gr.Dataframe(styler) +# demo.launch() + + +def format_function_call(func_name, kwargs=None): + """Format a function name and kwargs dict into a Python function call string.""" + if kwargs is None: + kwargs = {} + + if not kwargs: + return f"{func_name}()" + + args_str = ", ".join(f"{key}={repr(value)}" for key, value in kwargs.items()) + return f"{func_name}({args_str})" + + +print(format_function_call("my_function", {"arg1": 42, "arg2": """Marc's 17" laptop"""})) +print(format_function_call("my_function", {"arg1": 42, "arg2": "17' pole"})) diff --git a/main_workarena_debug.py b/main_workarena_debug.py index 2042bbd8..53077ac4 100644 --- a/main_workarena_debug.py +++ b/main_workarena_debug.py @@ -7,28 +7,37 @@ """ import logging +from copy import deepcopy import bgym from agentlab.agents.tool_use_agent.tool_use_agent import ( DEFAULT_PROMPT_CONFIG, - GPT_4_1_MINI, - OPENAI_MODEL_CONFIG, + GPT_4_1, ToolUseAgentArgs, ) from agentlab.experiments.study import Study logging.getLogger().setLevel(logging.INFO) -agent_config = ToolUseAgentArgs( - model_args=GPT_4_1_MINI, - config=DEFAULT_PROMPT_CONFIG, -) +config = deepcopy(DEFAULT_PROMPT_CONFIG) +# config.keep_last_n_obs = 1 +config.obs.use_som = True -agent_config.config.action_subsets = ("workarena",) # use the workarena action set +agent_configs = [ + ToolUseAgentArgs( + model_args=GPT_4_1, + config=config, + ), + # ToolUseAgentArgs( + # model_args=GPT_4_1, + # config=config, + # ), +] -agent_args = [agent_config] +for agent_config in agent_configs: + agent_config.config.action_subsets = ("workarena",) # use the workarena action set # ## select the benchmark to run on @@ -36,7 +45,7 @@ benchmark = "workarena_l1" -benchmark = bgym.DEFAULT_BENCHMARKS[benchmark]() # type: bgym.Benchmark +benchmark = bgym.DEFAULT_BENCHMARKS[benchmark](n_repeats=4) # type: bgym.Benchmark benchmark = benchmark.subset_from_glob("task_name", "*create*") # for env_args in benchmark.env_args_list: @@ -58,7 +67,7 @@ study.find_incomplete(include_errors=True) else: - study = Study(agent_args, benchmark, logging_level_stdout=logging.WARNING) + study = Study(agent_configs, benchmark, logging_level_stdout=logging.WARNING) study.run( n_jobs=n_jobs, From c4d3c4a3e99511c7d70ece9193a776321f9f17e4 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 11 Jul 2025 16:40:05 -0400 Subject: [PATCH 7/9] Fix openai returning no tool calls Refactor OpenAIResponseModel and related classes to remove tool_choice attribute and streamline API calls --- src/agentlab/llm/response_api.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/src/agentlab/llm/response_api.py b/src/agentlab/llm/response_api.py index 06f346a2..8e6f3695 100644 --- a/src/agentlab/llm/response_api.py +++ b/src/agentlab/llm/response_api.py @@ -313,7 +313,6 @@ def __init__( **kwargs, ): self.tools = kwargs.pop("tools", None) - self.tool_choice = kwargs.pop("tool_choice", None) super().__init__( model_name=model_name, api_key=api_key, @@ -324,7 +323,9 @@ def __init__( ) self.client = OpenAI(api_key=api_key) - def _call_api(self, messages: list[Any | 
MessageBuilder], **kwargs) -> dict: + def _call_api( + self, messages: list[Any | MessageBuilder], tool_choice: str = "auto", **kwargs + ) -> dict: input = [] for msg in messages: input.extend(msg.prepare_message() if isinstance(msg, MessageBuilder) else [msg]) @@ -339,8 +340,10 @@ def _call_api(self, messages: list[Any | MessageBuilder], **kwargs) -> dict: if self.tools is not None: api_params["tools"] = self.tools - if self.tool_choice is not None: - api_params["tool_choice"] = self.tool_choice + if tool_choice in ("any", "required"): + tool_choice = "required" + + api_params["tool_choice"] = tool_choice # api_params |= kwargs # Merge any additional parameters passed response = call_openai_api_with_retries( @@ -388,7 +391,6 @@ def __init__( ): self.tools = self.format_tools_for_chat_completion(kwargs.pop("tools", None)) - self.tool_choice = kwargs.pop("tool_choice", None) super().__init__( model_name=model_name, @@ -403,7 +405,9 @@ def __init__( **client_args ) # Ensures client_args is a dict or defaults to an empty dict - def _call_api(self, messages: list[dict | MessageBuilder]) -> openai.types.chat.ChatCompletion: + def _call_api( + self, messages: list[dict | MessageBuilder], tool_choice: str = "auto" + ) -> openai.types.chat.ChatCompletion: input = [] for msg in messages: input.extend(msg.prepare_message() if isinstance(msg, MessageBuilder) else [msg]) @@ -416,8 +420,10 @@ def _call_api(self, messages: list[dict | MessageBuilder]) -> openai.types.chat. } if self.tools is not None: api_params["tools"] = self.tools - if self.tool_choice is not None: - api_params["tool_choice"] = self.tool_choice + + if tool_choice in ("any", "required"): + tool_choice = "required" + api_params["tool_choice"] = tool_choice response = call_openai_api_with_retries(self.client.chat.completions.create, api_params) @@ -517,7 +523,6 @@ def __init__( **kwargs, ): self.tools = kwargs.pop("tools", None) - self.tool_choice = kwargs.pop("tool_choice", None) super().__init__( model_name=model_name, @@ -543,6 +548,9 @@ def _call_api( temp = self.apply_cache_breakpoints(msg, temp) input.extend(temp) + if tool_choice in ("any", "required"): + tool_choice = "any" # Claude API expects "any" and gpt expects "required" + api_params: Dict[str, Any] = { "model": self.model_name, "messages": input, From 73c64ce351880b6710cb9805bc353c814f1afff8 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 11 Jul 2025 16:40:55 -0400 Subject: [PATCH 8/9] minor for loading weird studies --- src/agentlab/analyze/agent_xray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index ea5371b9..bd1c6ad4 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -1164,7 +1164,7 @@ def get_directory_contents(results_dir: Path): most_recent_summary = max(summary_files, key=os.path.getctime) summary_df = pd.read_csv(most_recent_summary) - if len(summary_df) == 0 or summary_df["avg_reward"].isna().all(): + if len(summary_df) == 0: continue # skip if all avg_reward are NaN # get row with max avg_reward From 6a4c808b164fcc839e918d7fa2ac5530c1bc90de Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 11 Jul 2025 16:43:09 -0400 Subject: [PATCH 9/9] removing something that should not be ther --- _sandbox | 35 ----------------------------------- 1 file changed, 35 deletions(-) delete mode 100644 _sandbox diff --git a/_sandbox b/_sandbox deleted file mode 100644 index 6036d436..00000000 --- a/_sandbox +++ /dev/null @@ -1,35 +0,0 @@ -# 
import gradio as gr -# import pandas as pd - -# df = pd.DataFrame({"A": [14, 4, 5, 4, 1], "B": [5, 2, 54, 3, 2], "C": [20, 20, 7, 3, 8]}) - - -# # Highlight entire rows based on conditions -# def highlight_rows(row): -# if row["A"] > 4: -# return ["background-color: darkblue"] * len(row) -# else: -# return [""] * len(row) - - -# styler = df.style.apply(highlight_rows, axis=1) - -# with gr.Blocks() as demo: -# gr.Dataframe(styler) -# demo.launch() - - -def format_function_call(func_name, kwargs=None): - """Format a function name and kwargs dict into a Python function call string.""" - if kwargs is None: - kwargs = {} - - if not kwargs: - return f"{func_name}()" - - args_str = ", ".join(f"{key}={repr(value)}" for key, value in kwargs.items()) - return f"{func_name}({args_str})" - - -print(format_function_call("my_function", {"arg1": 42, "arg2": """Marc's 17" laptop"""})) -print(format_function_call("my_function", {"arg1": 42, "arg2": "17' pole"}))
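
A minimal usage sketch (not part of the patches above) for the dashed overlay introduced in PATCH 5/9. It assumes an agentlab checkout with that patch applied so that agentlab.analyze.overlay_utils is importable; the screenshot path and bounding box below are placeholders for illustration only.

from PIL import Image

from agentlab.analyze.overlay_utils import overlay_rectangle

# Load a screenshot and draw a dashed red outline around a hypothetical
# element bounding box given as (x, y, width, height).
img = Image.open("screenshot.png")  # placeholder path
img = overlay_rectangle(img, bbox=(100, 200, 150, 40), color="red", width=2, dashed=True)
img.save("screenshot_annotated.png")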
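
A standalone sketch of the tool_choice handling that PATCH 7/9 applies inside each _call_api: callers may pass "any" or "required" interchangeably, and the patch normalizes to "required" on the OpenAI paths and to "any" on the Claude path. The helper name and provider argument below are illustrative assumptions, not code from the patch.

def normalize_tool_choice(tool_choice: str, provider: str) -> str:
    """Map the interchangeable 'any'/'required' values to what each backend expects."""
    if tool_choice in ("any", "required"):
        # Per the patch comment: the Claude API expects "any", the OpenAI APIs expect "required".
        return "any" if provider == "anthropic" else "required"
    # Other values such as "auto" pass through unchanged.
    return tool_choice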