diff --git a/vec_inf/client/_slurm_templates.py b/vec_inf/client/_slurm_templates.py index 52d5998..1a7bf49 100644 --- a/vec_inf/client/_slurm_templates.py +++ b/vec_inf/client/_slurm_templates.py @@ -186,10 +186,15 @@ class SlurmScriptTemplate(TypedDict): ], "write_to_json": [ '\njson_path="{log_dir}/{model_name}.$SLURM_JOB_ID/{model_name}.$SLURM_JOB_ID.json"', - 'jq --arg server_addr "$server_address" \\', - " '. + {{\"server_address\": $server_addr}}' \\", - ' "$json_path" > temp.json \\', - ' && mv temp.json "$json_path"', + 'tmp_json="${{json_path}}.tmp.$$"', + "for _attempt in 1 2 3 4 5; do", + ' jq --arg server_addr "$server_address" \\', + " '. + {{\"server_address\": $server_addr}}' \\", + ' "$json_path" > "$tmp_json" \\', + ' && mv "$tmp_json" "$json_path" \\', + " && break", + " sleep 2", + "done", ], "launch_cmd": { "vllm": [ @@ -303,10 +308,15 @@ class BatchModelLaunchScriptTemplate(TypedDict): "write_to_json": [ "het_job_id=$(($SLURM_JOB_ID+{het_group_id}))", 'json_path="{log_dir}/{slurm_job_name}.$het_job_id/{model_name}.$het_job_id.json"', - 'jq --arg server_addr "$server_address" \\', - " '. + {{\"server_address\": $server_addr}}' \\", - ' "$json_path" > temp_{model_name}.json \\', - ' && mv temp_{model_name}.json "$json_path"\n', + 'tmp_json="${{json_path}}.tmp.$$"', + "for _attempt in 1 2 3 4 5; do", + ' jq --arg server_addr "$server_address" \\', + " '. + {{\"server_address\": $server_addr}}' \\", + ' "$json_path" > "$tmp_json" \\', + ' && mv "$tmp_json" "$json_path" \\', + " && break", + " sleep 2", + "done\n", ], "container_command": f"{CONTAINER_MODULE_NAME} exec --nv --containall {{image_path}} \\", "launch_cmd": {