# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
# ################################################################################
#
# This example demonstrates memory resources for allocation and management,
# copying data between device and pinned memory, and DLPack interop. Requires
# NumPy 2.1.0+.
#
# ################################################################################
# /// script
# dependencies = ["cuda_bindings", "cuda_core", "nvidia-cuda-nvrtc", "cupy-cuda13x"]
# ///
import sys
import cupy as cp
import numpy as np
from cuda.core import (
    Device,
    LaunchConfig,
    LegacyPinnedMemoryResource,
    Program,
    ProgramOptions,
    launch,
)
# Kernel for memory operations
code = """
extern "C"
__global__ void memory_ops(float* device_data,
                           float* pinned_data,
                           size_t N) {
    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        // Access device memory
        device_data[tid] = device_data[tid] + 1.0f;

        // Access pinned memory (zero-copy from GPU)
        pinned_data[tid] = pinned_data[tid] * 3.0f;
    }
}
"""
def main():
    # Use NumpyVersion for the check; comparing raw version strings
    # lexicographically can misorder releases.
    if np.lib.NumpyVersion(np.__version__) < "2.1.0":
        print("This example requires NumPy 2.1.0 or later", file=sys.stderr)
        sys.exit(1)
    dev = Device()
    dev.set_current()
    stream = dev.create_stream()
    # Tell CuPy to use our stream as the current stream:
    cp.cuda.ExternalStream(int(stream.handle)).use()

    device_buffer = None
    pinned_buffer = None
    new_device_buffer = None
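    # The buffers start as None so the finally block below can close only
    # what was actually allocated, even if an earlier step raises.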
    try:
        # Compile kernel
        program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
        prog = Program(code, code_type="c++", options=program_options)
        mod = prog.compile("cubin")
        kernel = mod.get_kernel("memory_ops")
        # Create different memory resources
        device_mr = dev.memory_resource
        pinned_mr = LegacyPinnedMemoryResource()

        # Allocate different types of memory
        size = 1024
        dtype = cp.float32
        element_size = dtype().itemsize
        total_size = size * element_size
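        # Passing stream=stream keeps the device allocations stream-ordered
        # (no device-wide sync); for the pinned host allocation the stream
        # argument chiefly orders its eventual deallocation.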
        # 1. Device Memory (GPU-only)
        device_buffer = device_mr.allocate(total_size, stream=stream)
        device_array = cp.from_dlpack(device_buffer).view(dtype=dtype)

        # 2. Pinned Memory (CPU memory, GPU accessible)
        pinned_buffer = pinned_mr.allocate(total_size, stream=stream)
        pinned_array = np.from_dlpack(pinned_buffer).view(dtype=dtype)
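        # Both from_dlpack() calls are zero-copy: the arrays alias the
        # buffers' bytes, and .view(dtype) merely reinterprets those bytes as
        # float32, so the kernel's writes are visible through these views.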
        # Initialize data
        rng = cp.random.default_rng()
        device_array[:] = rng.random(size, dtype=dtype)
        pinned_array[:] = rng.random(size, dtype=dtype).get()

        # Store original values for verification
        device_original = device_array.copy()
        pinned_original = pinned_array.copy()

        # Sync before kernel launch (conservative: the launch below uses the
        # same stream, so it is already ordered after the work above)
        stream.sync()
        # Launch kernel
        block = 256
        grid = (size + block - 1) // block
        config = LaunchConfig(grid=grid, block=block)
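        # launch() forwards the Buffer objects as raw device pointers, and
        # cp.uint64(size) matches the kernel's 64-bit size_t parameter.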
        launch(stream, config, kernel, device_buffer, pinned_buffer, cp.uint64(size))
        stream.sync()
        # Verify kernel operations
        assert cp.allclose(device_array, device_original + 1.0), "Device memory operation failed"
        assert cp.allclose(pinned_array, pinned_original * 3.0), "Pinned memory operation failed"
        # Copy data between different memory types
        print("\nCopying data between memory types...", file=sys.stderr)

        # Copy from device to pinned memory
        device_buffer.copy_to(pinned_buffer, stream=stream)
        stream.sync()

        # Verify the copy operation
        assert cp.allclose(pinned_array, device_array), "Device to pinned copy failed"
        # Create a new device buffer and copy from pinned
        new_device_buffer = device_mr.allocate(total_size, stream=stream)
        new_device_array = cp.from_dlpack(new_device_buffer).view(dtype=dtype)
        pinned_buffer.copy_to(new_device_buffer, stream=stream)
        stream.sync()

        # Verify the copy operation
        assert cp.allclose(new_device_array, pinned_array), "Pinned to device copy failed"

        print("Memory management example completed!")
    finally:
        # Clean up resources even if verification fails. Each close(stream)
        # is ordered on the stream, so the frees cannot race work queued above.
        if new_device_buffer is not None:
            new_device_buffer.close(stream)
            assert new_device_buffer.handle == 0, "New device buffer should be closed"
        if pinned_buffer is not None:
            pinned_buffer.close(stream)
            assert pinned_buffer.handle == 0, "Pinned buffer should be closed"
        if device_buffer is not None:
            device_buffer.close(stream)
            assert device_buffer.handle == 0, "Device buffer should be closed"
        stream.close()
        cp.cuda.Stream.null.use()  # reset CuPy's current stream to the null stream
if __name__ == "__main__":
    main()