# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
# ################################################################################
#
# This example demonstrates how to use cuda.core to compile and launch
# kernels on multiple GPUs. Requires at least 2 GPUs.
#
# ################################################################################
# /// script
# dependencies = ["cuda_bindings", "cuda_core", "cupy-cuda13x"]
# ///
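
# The "# /// script" block above is inline script metadata (PEP 723), so a
# runner that understands it (uv, for example) can install the listed
# dependencies and execute this file directly.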
import sys

import cupy as cp
from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch, system

dtype = cp.float32
size = 50000


# This adaptor ensures that any foreign stream (ex: from CuPy) that has not yet
# supported the __cuda_stream__ protocol can still be recognized by cuda.core.
class StreamAdaptor:
    def __init__(self, obj):
        self.obj = obj

    def __cuda_stream__(self):
        # Note: CuPy streams have a .ptr attribute
        return (0, self.obj.ptr)
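

# Note: The tuple returned by __cuda_stream__ is (protocol version, stream
# handle address). A stream object that already implements __cuda_stream__
# itself could be passed to cuda.core directly, without this adaptor.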


def main():
    if system.get_num_devices() < 2:
        print("this example requires at least 2 GPUs", file=sys.stderr)
        sys.exit(1)

    # Set GPU 0
    dev0 = Device(0)
    dev0.set_current()
    stream0 = dev0.create_stream()
    stream1 = None
    cp_stream0 = None
    cp_stream1 = None

    try:
        # Compile a kernel targeting GPU 0 to compute c = a + b
        code_add = """
        extern "C"
        __global__ void vector_add(const float* A,
                                   const float* B,
                                   float* C,
                                   size_t N) {
            const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
            for (size_t i = tid; i < N; i += gridDim.x * blockDim.x) {
                C[i] = A[i] + B[i];
            }
        }
        """
        prog_add = Program(code_add, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev0.arch}"))
        mod_add = prog_add.compile("cubin")
        add_kernel = mod_add.get_kernel("vector_add")
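
        # Note: Program compiles the C++ source for the exact architecture
        # given in ProgramOptions; "cubin" asks for ready-to-load device code
        # rather than PTX.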

        # Set GPU 1
        dev1 = Device(1)
        dev1.set_current()
        stream1 = dev1.create_stream()

        # Compile a kernel targeting GPU 1 to compute c = a - b
        code_sub = """
        extern "C"
        __global__ void vector_sub(const float* A,
                                   const float* B,
                                   float* C,
                                   size_t N) {
            const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
            for (size_t i = tid; i < N; i += gridDim.x * blockDim.x) {
                C[i] = A[i] - B[i];
            }
        }
        """
        prog_sub = Program(code_sub, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev1.arch}"))
        mod_sub = prog_sub.compile("cubin")
        sub_kernel = mod_sub.get_kernel("vector_sub")

        # Create launch configs for each kernel that will be executed on the
        # respective CUDA streams.
        block = 256
        grid = (size + block - 1) // block
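        # With this ceil-division grid every element is covered by a distinct
        # thread, so the grid-stride loops above run at most one iteration per
        # thread; the loop form only matters if a smaller grid were used.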
        config0 = LaunchConfig(grid=grid, block=block)
        config1 = LaunchConfig(grid=grid, block=block)

        # Allocate memory on GPU 0
        # Note: This runs on CuPy's current stream for GPU 0
        dev0.set_current()
        rng = cp.random.default_rng()
        a = rng.random(size, dtype=dtype)
        b = rng.random(size, dtype=dtype)
        c = cp.empty_like(a)
        cp_stream0 = dev0.create_stream(StreamAdaptor(cp.cuda.get_current_stream()))

        # Establish a stream order to ensure that memory has been initialized
        # before it is accessed by the kernel.
        stream0.wait(cp_stream0)
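        # Note: wait() records the dependency on the device side, so the host
        # thread is not blocked here; work enqueued on stream0 afterwards
        # simply will not start until cp_stream0's pending work has finished.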

        # Launch the add kernel on GPU 0 / stream 0
        launch(stream0, config0, add_kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))

        # Allocate memory on GPU 1
        # Note: This runs on CuPy's current stream for GPU 1.
        dev1.set_current()
        rng = cp.random.default_rng()
        x = rng.random(size, dtype=dtype)
        y = rng.random(size, dtype=dtype)
        z = cp.empty_like(x)
        cp_stream1 = dev1.create_stream(StreamAdaptor(cp.cuda.get_current_stream()))

        # Establish a stream order
        stream1.wait(cp_stream1)

        # Launch the subtract kernel on GPU 1 / stream 1
        launch(stream1, config1, sub_kernel, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size))
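
        # Both kernels have now been launched asynchronously and may be
        # executing concurrently, one on each GPU.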

        # Synchronize both GPUs and validate the results
        dev0.set_current()
        stream0.sync()
        assert cp.allclose(c, a + b)
        dev1.set_current()
        stream1.sync()
        assert cp.allclose(z, x - y)

        print("done")
    finally:
        if cp_stream1 is not None:
            cp_stream1.close()
        if cp_stream0 is not None:
            cp_stream0.close()
        if stream1 is not None:
            stream1.close()
        stream0.close()


if __name__ == "__main__":
    main()