77 changes: 77 additions & 0 deletions main_workarena_debug.py
@@ -0,0 +1,77 @@
"""
Note: This script is a convenience script to launch experiments instead of using
the command line.

Copy this script and modify at will, but don't push your changes to the
repository.
"""

import logging
from copy import deepcopy

import bgym

from agentlab.agents.tool_use_agent.tool_use_agent import (
    DEFAULT_PROMPT_CONFIG,
    GPT_4_1,
    ToolUseAgentArgs,
)
from agentlab.experiments.study import Study

logging.getLogger().setLevel(logging.INFO)

Korbit review: Basic root logger configuration (category: Logging)

What is the issue?

The root logger is used without proper configuration (no handler, formatter, or logger name).

Why this matters

Without proper logger configuration, logs may lack contextual information like timestamps and source, making debugging production issues difficult.

Suggested change:
import logging

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)
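
With this configuration, a call such as logger.info("launching study") from this script would emit a line of the form:

2025-06-01 12:00:00 - __main__ - INFO - launching study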


config = deepcopy(DEFAULT_PROMPT_CONFIG)
# config.keep_last_n_obs = 1
config.obs.use_som = True


agent_configs = [
    ToolUseAgentArgs(
        model_args=GPT_4_1,
        config=config,
    ),
    # ToolUseAgentArgs(
    #     model_args=GPT_4_1,
    #     config=config,
    # ),
]

for agent_config in agent_configs:
    agent_config.config.action_subsets = ("workarena",)  # use the workarena action set
Comment on lines +28 to +40

Korbit review: Agent Configuration Should Be Separated (category: Design)

What is the issue?

Agent configuration is mixed with experiment setup code, violating the Single Responsibility Principle.

Why this matters

Mixing configuration with setup code makes it harder to maintain different agent configurations and reduces code reusability across different experiments.

Suggested change:

Extract agent configuration into a separate factory or builder class:

class AgentConfigFactory:
    @staticmethod
    def create_workarena_config(base_config):
        config = deepcopy(base_config)
        config.action_subsets = ("workarena",)
        return ToolUseAgentArgs(model_args=GPT_4_1, config=config)

agent_configs = [AgentConfigFactory.create_workarena_config(DEFAULT_PROMPT_CONFIG)]
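
If adopted, this factory would also subsume the loop further down that sets action_subsets on each agent_config.config after construction, since the subset is applied to the config copy before ToolUseAgentArgs is built.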



# ## select the benchmark to run on
# benchmark = "miniwob_tiny_test"
benchmark = "workarena_l1"


benchmark = bgym.DEFAULT_BENCHMARKS[benchmark](n_repeats=4) # type: bgym.Benchmark
benchmark = benchmark.subset_from_glob("task_name", "*create*")

# for env_args in benchmark.env_args_list:
#     print(env_args.task_name)
#     env_args.max_steps = 15

relaunch = False

## Number of parallel jobs
n_jobs = 10 # Make sure to use 1 job when debugging in VSCode
parallel_backend = "ray"
Comment on lines +58 to +59

Korbit review: Static Parallel Job Configuration (category: Performance)

What is the issue?

Hard-coding the number of parallel jobs without considering system resources (CPU cores, memory) can lead to suboptimal performance.

Why this matters

Setting a fixed number of jobs might either underutilize available resources or overload the system, causing overhead from context switching and memory pressure.

Suggested change:

Use system information to determine optimal job count:

from multiprocessing import cpu_count
n_jobs = min(cpu_count(), 10)  # Cap at 10 but adjust based on available cores
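
One caveat to this suggestion: each parallel job here drives its own browser-based environment, so memory rather than CPU count may be the binding constraint; the fixed cap of 10 may well be deliberate for that reason.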

# parallel_backend = "sequential" # activate sequential backend for debugging in VSCode

if __name__ == "__main__":  # necessary for dask backend

    if relaunch:
        # relaunch an existing study
        study = Study.load_most_recent(contains=None)
        study.find_incomplete(include_errors=True)

    else:
        study = Study(agent_configs, benchmark, logging_level_stdout=logging.WARNING)

    study.run(
        n_jobs=n_jobs,
        parallel_backend=parallel_backend,  # "ray", "joblib" or "sequential"
        strict_reproducibility=False,
        n_relaunch=3,
    )
16 changes: 12 additions & 4 deletions src/agentlab/agents/tool_use_agent/tool_use_agent.py
@@ -127,8 +127,10 @@ class Goal(Block):

     goal_as_system_msg: bool = True
 
-    def apply(self, llm, discussion: StructuredDiscussion, obs: dict) -> dict:
-        system_message = llm.msg.system().add_text(SYS_MSG)
+    def apply(
+        self, llm, discussion: StructuredDiscussion, obs: dict, sys_msg: str = SYS_MSG
+    ) -> dict:
+        system_message = llm.msg.system().add_text(sys_msg)
         discussion.append(system_message)
 
         if self.goal_as_system_msg:
@@ -441,7 +443,13 @@ def get_action(self, obs: Any) -> float:
         self.llm.reset_stats()
         if not self.discussion.is_goal_set():
             self.discussion.new_group("goal")
-            self.config.goal.apply(self.llm, self.discussion, obs)
+
+            if self.config.multiaction:
+                sys_msg = SYS_MSG + "\nYou can take multiple actions in a single step, if needed."
+            else:
+                sys_msg = SYS_MSG + "\nYou can only take one action at a time."
+            self.config.goal.apply(self.llm, self.discussion, obs, sys_msg)
+
             self.config.summarizer.apply_init(self.llm, self.discussion)
             self.config.general_hints.apply(self.llm, self.discussion)
             self.task_hint.apply(self.llm, self.discussion, self.task_name)
@@ -489,7 +497,7 @@ def get_action(self, obs: Any) -> float:
         return action, agent_info
 
 
-OPENAI_MODEL_CONFIG = OpenAIResponseModelArgs(
+GPT_4_1 = OpenAIResponseModelArgs(
     model_name="gpt-4.1",
     max_total_tokens=200_000,
     max_input_tokens=200_000,
2 changes: 1 addition & 1 deletion src/agentlab/analyze/agent_xray.py
@@ -1164,7 +1164,7 @@ def get_directory_contents(results_dir: Path):
         most_recent_summary = max(summary_files, key=os.path.getctime)
         summary_df = pd.read_csv(most_recent_summary)
 
-        if len(summary_df) == 0 or summary_df["avg_reward"].isna().all():
+        if len(summary_df) == 0:
             continue  # skip if all avg_reward are NaN
 
         # get row with max avg_reward
43 changes: 41 additions & 2 deletions src/agentlab/analyze/overlay_utils.py
@@ -1,9 +1,11 @@
 import ast
 import inspect
+import math
 from dataclasses import dataclass
 from typing import Any, Union
 
 import matplotlib.pyplot as plt
+import PIL
 from browsergym.core.action.highlevel import ACTION_SUBSETS
 from PIL import Image, ImageDraw
 
@@ -289,17 +291,54 @@ def overlay_rectangle(
     bbox: tuple[float, float, float, float],
     color: Union[str, tuple[int, int, int]] = "red",
     width: int = 1,
+    dashed: bool = True,
 ) -> Image.Image:
     draw = ImageDraw.Draw(img)
 
     x, y, w, h = bbox
 
-    # Draw rectangle outline
-    draw.rectangle([x, y, x + w, y + h], outline=color, width=width)
+    if dashed:
+        # Draw dashed rectangle
+        print("Drawing dashed rectangle")
+        linedashed(draw, x, y, x + w, y, color, width)
+        linedashed(draw, x + w, y, x + w, y + h, color, width)
+        linedashed(draw, x + w, y + h, x, y + h, color, width)
+        linedashed(draw, x, y + h, x, y, color, width)
+    else:
+        draw.rectangle([x, y, x + w, y + h], outline=color, width=width)
 
     return img


+# Adapted from https://stackoverflow.com/questions/51908563/dotted-or-dashed-line-with-python-pillow/58885306#58885306
+def linedashed(
+    draw: PIL.ImageDraw.Draw, x0, y0, x1, y1, fill, width, dash_length=4, nodash_length=8
+):
+    line_dx = x1 - x0  # delta x (can be negative)
+    line_dy = y1 - y0  # delta y (can be negative)
+    line_length = math.hypot(line_dx, line_dy)  # line length (positive)
+    if line_length == 0:
+        return  # Avoid division by zero in case the line length is 0
+    pixel_dx = line_dx / line_length  # x add for 1px line length
+    pixel_dy = line_dy / line_length  # y add for 1px line length
+    dash_start = 0
+    while dash_start < line_length:
+        dash_end = dash_start + dash_length
+        if dash_end > line_length:
+            dash_end = line_length
+        draw.line(
+            (
+                round(x0 + pixel_dx * dash_start),
+                round(y0 + pixel_dy * dash_start),
+                round(x0 + pixel_dx * dash_end),
+                round(y0 + pixel_dy * dash_end),
+            ),
+            fill=fill,
+            width=width,
+        )
+        dash_start += dash_length + nodash_length


 def annotate_action(
     img: Image.Image, action_string: str, properties: dict[str, tuple], colormap: str = "tab10"
 ) -> str:
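
For reference, a minimal usage sketch of the new dashed overlay (hypothetical file names, not part of the PR):

from PIL import Image

from agentlab.analyze.overlay_utils import overlay_rectangle

img = Image.open("screenshot.png")  # any RGB screenshot
# dashed red box around the region at (40, 60) with size 120x30
img = overlay_rectangle(img, bbox=(40, 60, 120, 30), color="red", width=2, dashed=True)
img.save("screenshot_annotated.png")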
26 changes: 17 additions & 9 deletions src/agentlab/llm/response_api.py
@@ -313,7 +313,6 @@ def __init__(
         **kwargs,
     ):
         self.tools = kwargs.pop("tools", None)
-        self.tool_choice = kwargs.pop("tool_choice", None)
         super().__init__(
             model_name=model_name,
             api_key=api_key,
@@ -324,7 +323,9 @@ def __init__(
         )
         self.client = OpenAI(api_key=api_key)
 
-    def _call_api(self, messages: list[Any | MessageBuilder], **kwargs) -> dict:
+    def _call_api(
+        self, messages: list[Any | MessageBuilder], tool_choice: str = "auto", **kwargs
+    ) -> dict:
         input = []
         for msg in messages:
             input.extend(msg.prepare_message() if isinstance(msg, MessageBuilder) else [msg])
@@ -339,8 +340,10 @@ def _call_api(self, messages: list[Any | MessageBuilder], **kwargs) -> dict:

         if self.tools is not None:
             api_params["tools"] = self.tools
-        if self.tool_choice is not None:
-            api_params["tool_choice"] = self.tool_choice
+        if tool_choice in ("any", "required"):
+            tool_choice = "required"
+
+        api_params["tool_choice"] = tool_choice
 
         # api_params |= kwargs  # Merge any additional parameters passed
         response = call_openai_api_with_retries(
@@ -388,7 +391,6 @@ def __init__(
     ):
 
         self.tools = self.format_tools_for_chat_completion(kwargs.pop("tools", None))
-        self.tool_choice = kwargs.pop("tool_choice", None)
 
         super().__init__(
             model_name=model_name,
@@ -403,7 +405,9 @@ def __init__(
             **client_args
         )  # Ensures client_args is a dict or defaults to an empty dict
 
-    def _call_api(self, messages: list[dict | MessageBuilder]) -> openai.types.chat.ChatCompletion:
+    def _call_api(
+        self, messages: list[dict | MessageBuilder], tool_choice: str = "auto"
+    ) -> openai.types.chat.ChatCompletion:
         input = []
         for msg in messages:
             input.extend(msg.prepare_message() if isinstance(msg, MessageBuilder) else [msg])
@@ -416,8 +420,10 @@ def _call_api(self, messages: list[dict | MessageBuilder]) -> openai.types.chat.ChatCompletion:
         }
         if self.tools is not None:
             api_params["tools"] = self.tools
-        if self.tool_choice is not None:
-            api_params["tool_choice"] = self.tool_choice
+
+        if tool_choice in ("any", "required"):
+            tool_choice = "required"
+        api_params["tool_choice"] = tool_choice
 
         response = call_openai_api_with_retries(self.client.chat.completions.create, api_params)
 
@@ -517,7 +523,6 @@ def __init__(
         **kwargs,
     ):
         self.tools = kwargs.pop("tools", None)
-        self.tool_choice = kwargs.pop("tool_choice", None)
 
         super().__init__(
             model_name=model_name,
@@ -543,6 +548,9 @@ def _call_api(
             temp = self.apply_cache_breakpoints(msg, temp)
             input.extend(temp)
 
+        if tool_choice in ("any", "required"):
+            tool_choice = "any"  # Claude API expects "any" and gpt expects "required"
+
         api_params: Dict[str, Any] = {
             "model": self.model_name,
             "messages": input,