# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
# ################################################################################
#
# This example demonstrates how to use cuda.core to compile and launch
# kernels on multiple GPUs. Requires at least 2 GPUs.
#
# ################################################################################
# /// script
# dependencies = ["cuda_bindings", "cuda_core", "cupy-cuda13x"]
# ///
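
# The "# /// script" block above is inline script metadata (PEP 723), so a
# runner that understands it (uv, for example) can install the listed
# dependencies and execute this file directly.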
import sys

import cupy as cp
from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch, system

dtype = cp.float32
size = 50000


# This adaptor ensures that any foreign stream (ex: from CuPy) that has not yet
# supported the __cuda_stream__ protocol can still be recognized by cuda.core.
class StreamAdaptor:
    def __init__(self, obj):
        self.obj = obj

    def __cuda_stream__(self):
        # Note: CuPy streams have a .ptr attribute
        return (0, self.obj.ptr)
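

# Note: The tuple returned by __cuda_stream__ is (protocol version, stream
# handle address). A stream object that already implements __cuda_stream__
# itself could be passed to cuda.core directly, without this adaptor.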


def main():
    if system.get_num_devices() < 2:
        print("this example requires at least 2 GPUs", file=sys.stderr)
        sys.exit(1)

    # Set GPU 0
    dev0 = Device(0)
    dev0.set_current()
    stream0 = dev0.create_stream()
    stream1 = None
    cp_stream0 = None
    cp_stream1 = None

    try:
        # Compile a kernel targeting GPU 0 to compute c = a + b
        code_add = """
        extern "C"
        __global__ void vector_add(const float* A,
                                   const float* B,
                                   float* C,
                                   size_t N) {
            const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
            for (size_t i = tid; i < N; i += gridDim.x * blockDim.x) {
                C[i] = A[i] + B[i];
            }
        }
        """
        prog_add = Program(code_add, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev0.arch}"))
        mod_add = prog_add.compile("cubin")
        add_kernel = mod_add.get_kernel("vector_add")
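
        # Note: Program compiles the C++ source for the exact architecture
        # given in ProgramOptions; "cubin" asks for ready-to-load device code
        # rather than PTX.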

        # Set GPU 1
        dev1 = Device(1)
        dev1.set_current()
        stream1 = dev1.create_stream()

        # Compile a kernel targeting GPU 1 to compute c = a - b
        code_sub = """
        extern "C"
        __global__ void vector_sub(const float* A,
                                   const float* B,
                                   float* C,
                                   size_t N) {
            const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
            for (size_t i = tid; i < N; i += gridDim.x * blockDim.x) {
                C[i] = A[i] - B[i];
            }
        }
        """
        prog_sub = Program(code_sub, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev1.arch}"))
        mod_sub = prog_sub.compile("cubin")
        sub_kernel = mod_sub.get_kernel("vector_sub")

        # Create launch configs for each kernel that will be executed on the
        # respective CUDA streams.
        block = 256
        grid = (size + block - 1) // block
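        # With this ceil-division grid every element is covered by a distinct
        # thread, so the grid-stride loops above run at most one iteration per
        # thread; the loop form only matters if a smaller grid were used.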
        config0 = LaunchConfig(grid=grid, block=block)
        config1 = LaunchConfig(grid=grid, block=block)

        # Allocate memory on GPU 0
        # Note: This runs on CuPy's current stream for GPU 0
        dev0.set_current()
        rng = cp.random.default_rng()
        a = rng.random(size, dtype=dtype)
        b = rng.random(size, dtype=dtype)
        c = cp.empty_like(a)
        cp_stream0 = dev0.create_stream(StreamAdaptor(cp.cuda.get_current_stream()))

        # Establish a stream order to ensure that memory has been initialized
        # before it is accessed by the kernel.
        stream0.wait(cp_stream0)
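        # Note: wait() records the dependency on the device side, so the host
        # thread is not blocked here; work enqueued on stream0 afterwards
        # simply will not start until cp_stream0's pending work has finished.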

        # Launch the add kernel on GPU 0 / stream 0
        launch(stream0, config0, add_kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))

        # Allocate memory on GPU 1
        # Note: This runs on CuPy's current stream for GPU 1.
        dev1.set_current()
        rng = cp.random.default_rng()
        x = rng.random(size, dtype=dtype)
        y = rng.random(size, dtype=dtype)
        z = cp.empty_like(x)
        cp_stream1 = dev1.create_stream(StreamAdaptor(cp.cuda.get_current_stream()))

        # Establish a stream order
        stream1.wait(cp_stream1)

        # Launch the subtract kernel on GPU 1 / stream 1
        launch(stream1, config1, sub_kernel, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size))
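
        # Both kernels have now been launched asynchronously and may be
        # executing concurrently, one on each GPU.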

        # Synchronize both GPUs and validate the results
        dev0.set_current()
        stream0.sync()
        assert cp.allclose(c, a + b)
        dev1.set_current()
        stream1.sync()
        assert cp.allclose(z, x - y)

        print("done")
    finally:
        if cp_stream1 is not None:
            cp_stream1.close()
        if cp_stream0 is not None:
            cp_stream0.close()
        if stream1 is not None:
            stream1.close()
        stream0.close()


if __name__ == "__main__":
    main()