Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/workflows/build-image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,12 @@ jobs:
# platforms: linux/amd64
# runner: [self-hosted, linux/amd64, rocm]
# build_args: "NUM_MAKE_JOBS=16"
- name: rocm6.3
dockerfile: rocm6.3.x
tags: superbench/main:rocm6.3
platforms: linux/amd64
runner: [self-hosted, linux/amd64, rocm]
build_args: "NUM_MAKE_JOBS=16"
steps:
- name: Checkout
uses: actions/checkout@v2
Expand Down
147 changes: 147 additions & 0 deletions dockerfile/rocm6.3.x.dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
ARG BASE_IMAGE=rocm/pytorch-training:v25.6

FROM ${BASE_IMAGE}

# Base image: rocm/pytorch-training:v25.6
# Pre-installed by base image:
# - Ubuntu: 22.04
# - Python: 3.10
# - ROCm: 6.3.4
# - openmpi: 4.0.7rc2
# - torch: 2.8.0a0+git7d205b2
# - rccl: 2.21.5.60304-76
# - hipblaslt: 0.15.0-8c69191d
# - transformer_engine: 1.14.0+2f85f5f2
# - flash_attention: 3.0.0.post1
# - cmake: 3.18.5
# - rocm-cmake: 0.14.0.60304-76
# - amd-smi: 25.1.0+8dc45db
# Added by this Dockerfile:
# - Docker Client: 27.5.1
# - mlc: v3.12
# - OFED: 24.10-1.1.4.0 LTS

# Fix base image botocore/urllib3 incompatibility:
# Base image ships botocore 1.22.12 (expects urllib3 1.x) with urllib3 2.6.3,
# causing "cannot import name 'DEFAULT_CIPHERS' from 'urllib3.util.ssl_'".
# Upgrading botocore/boto3 to versions compatible with urllib3 2.x.
RUN python3 -m pip install --upgrade botocore boto3
Comment on lines +27 to +28

LABEL maintainer="SuperBench"

ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && \
apt-get -q install -y --no-install-recommends \
autoconf \
automake \
bc \
build-essential \
curl \
dmidecode \
git \
hipify-clang \
iproute2 \
jq \
libaio-dev \
libboost-program-options-dev \
libcap2 \
libcurl4-openssl-dev \
libnuma-dev \
libpci-dev \
libssl-dev \
libtinfo5 \
libtool \
lshw \
net-tools \
numactl \
openssh-client \
openssh-server \
pciutils \
python3-mpi4py \
rsync \
sudo \
util-linux \
vim \
wget \
&& \
rm -rf /tmp/*

ARG NUM_MAKE_JOBS=64

Comment on lines +69 to +70
# Install Docker
ENV DOCKER_VERSION=27.5.1
RUN cd /tmp && \
wget -q https://download.docker.com/linux/static/stable/x86_64/docker-${DOCKER_VERSION}.tgz -O docker.tgz && \
tar --extract --file docker.tgz --strip-components 1 --directory /usr/local/bin/ && \
rm docker.tgz

# Update system config
RUN mkdir -p /root/.ssh && \
touch /root/.ssh/authorized_keys && \
mkdir -p /var/run/sshd && \
sed -i "s/[# ]*PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \
sed -i "s/[# ]*PermitUserEnvironment no/PermitUserEnvironment yes/" /etc/ssh/sshd_config && \
sed -i "s/[# ]*Port.*/Port 22/" /etc/ssh/sshd_config && \
echo "* soft nofile 1048576\n* hard nofile 1048576" >> /etc/security/limits.conf && \
echo "root soft nofile 1048576\nroot hard nofile 1048576" >> /etc/security/limits.conf
Comment on lines +82 to +86


# Get Ubuntu version and set as an environment variable
RUN echo "Ubuntu version: $(lsb_release -r -s)"
ARG UBUNTU_VERSION=22.04

# Install OFED
ENV OFED_VERSION=24.10-1.1.4.0
# Check if ofed_info is present and has a version
RUN if ! command -v ofed_info >/dev/null 2>&1; then \
echo "OFED not found. Installing OFED..."; \
Comment on lines +89 to +97
cd /tmp && \
wget -q http://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu${UBUNTU_VERSION}-x86_64.tgz && \
Comment on lines +94 to +99
tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu${UBUNTU_VERSION}-x86_64.tgz && \
PATH=/usr/bin:${PATH} MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu${UBUNTU_VERSION}-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force --all && \
rm -rf MLNX_OFED_LINUX-${OFED_VERSION}* ; \
fi

ENV ROCM_PATH=/opt/rocm

# Target GPU architectures for ROCm builds (space-separated)
ENV AMDGPU_TARGETS="gfx908 gfx90a gfx942"

# Use pre-installed OpenMPI from base image at /opt/ompi
ENV MPI_HOME=/opt/ompi

# Install Intel MLC
RUN cd /tmp && \
wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
tar xzf mlc.tgz Linux/mlc && \
cp ./Linux/mlc /usr/local/bin/ && \
rm -rf ./Linux mlc.tgz

ENV PATH="/opt/ompi/bin:/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
LD_LIBRARY_PATH="/opt/ompi/lib:/usr/lib/x86_64-linux-gnu/:/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
SB_HOME=/opt/superbench \
SB_MICRO_PATH=/opt/superbench \
ANSIBLE_DEPRECATION_WARNINGS=FALSE \
ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections

RUN echo PATH="$PATH" > /etc/environment && \
echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \
echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment

RUN python3 -m pip install --upgrade pip wheel setuptools==65.7 && \
python3 -c "import pkg_resources" || python3 -m pip install setuptools
Comment on lines +131 to +132

Comment on lines +131 to +133
WORKDIR ${SB_HOME}

ADD third_party third_party
# perftest_rocm6.patch skipped — upstream perftest already includes the equivalent changes
RUN make RCCL_HOME=/opt/rocm ROCBLAS_BRANCH=release-staging/rocm-rel-6.3 HIPBLASLT_BRANCH=release-staging/rocm-rel-6.3 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm -o rocm_megatron_lm
RUN cd third_party/Megatron/Megatron-DeepSpeed && \
git apply ../megatron_deepspeed_rocm6.patch

ADD . .
ENV USE_HIP_DATATYPE=1
ENV USE_HIPBLAS_COMPUTETYPE=1
RUN python3 -m pip install --no-build-isolation .[amdworker] && \
CXX=/opt/rocm/bin/hipcc make cppbuild && \
make postinstall
Comment on lines +142 to +147
9 changes: 9 additions & 0 deletions third_party/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -134,9 +134,17 @@ ifneq (,$(wildcard fio/Makefile))
endif

# Build rccl-tests from commit 46375b1 of default branch.
# If AMDGPU_TARGETS env var is set (space-separated, e.g. "gfx908 gfx90a gfx942"),
# explicit --offload-arch flags and include paths are used.
# Otherwise, the original build command is used (relies on hipcc auto-detection).
ROCM_OFFLOAD_ARCH_FLAGS := $(foreach arch,$(AMDGPU_TARGETS),--offload-arch=$(arch))
rocm_rccl_tests: sb_micro_path
ifneq (, $(wildcard rccl-tests/Makefile))
ifdef AMDGPU_TARGETS
cd ./rccl-tests && make MPI=1 MPI_HOME=$(MPI_HOME) HIPCUFLAGS="-std=c++14 -O3 $(ROCM_OFFLOAD_ARCH_FLAGS) -I$(ROCM_PATH)/include -I$(ROCM_PATH)/include/rccl -I$(ROCM_PATH)/include/hip -DMPI_SUPPORT -I$(MPI_HOME)/include -I$(MPI_HOME)/include/mpi" -j
else
cd ./rccl-tests && make MPI=1 MPI_HOME=$(MPI_HOME) -j
endif
cp -v -r ./rccl-tests/build/* $(SB_MICRO_PATH)/bin/
endif

Expand Down Expand Up @@ -168,6 +176,7 @@ rocm_hipblaslt: sb_micro_path
# Build hipBusBandwidth.
# HIP is released with rocm, like rocm-4.2.0 and so on.
# The version we use is the released tag which is consistent with the rocm version in the environment or docker.

rocm_bandwidthTest: sb_micro_path
git clone -b ${ROCM_VER} https://github.com/ROCm-Developer-Tools/HIP.git
cd ./HIP/samples/1_Utils/hipBusBandwidth/ && mkdir -p build && cd build && cmake .. && make
Expand Down
Loading