diff --git a/experiments/run_osworld.py b/experiments/run_osworld.py index 36db0878..c75c7e57 100644 --- a/experiments/run_osworld.py +++ b/experiments/run_osworld.py @@ -28,7 +28,7 @@ def get_task_ids() -> set[str]: def main(): n_jobs = 4 use_vmware = True - relaunch = True + relaunch = False agent_args = [ OSWORLD_CLAUDE, # OSWORLD_OAI # performs poorly. diff --git a/src/agentlab/agents/tool_use_agent/tool_use_agent.py b/src/agentlab/agents/tool_use_agent/tool_use_agent.py index d39bdcc0..b7494693 100644 --- a/src/agentlab/agents/tool_use_agent/tool_use_agent.py +++ b/src/agentlab/agents/tool_use_agent/tool_use_agent.py @@ -7,8 +7,8 @@ from typing import Any import bgym -import numpy as np import pandas as pd +from bgym import Benchmark as BgymBenchmark from browsergym.core.observation import extract_screenshot from browsergym.utils.obs import ( flatten_axtree_to_str, @@ -16,12 +16,9 @@ overlay_som, prune_html, ) -from PIL import Image -from agentlab.agents import agent_utils -from agentlab.benchmarks.abstract_env import AbstractBenchmark as AgentLabBenchmark -from bgym import Benchmark as BgymBenchmark from agentlab.agents.agent_args import AgentArgs +from agentlab.benchmarks.abstract_env import AbstractBenchmark as AgentLabBenchmark from agentlab.benchmarks.osworld import OSWorldActionSet from agentlab.llm.base_api import BaseModelArgs from agentlab.llm.llm_utils import image_to_png_base64_url @@ -629,7 +626,7 @@ def get_action(self, obs: Any) -> float: ) OSWORLD_OAI = ToolUseAgentArgs( - model_args=OPENAI_MODEL_CONFIG, + model_args=GPT_4_1_MINI, config=PromptConfig( tag_screenshot=True, goal=Goal(goal_as_system_msg=True),