# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
# ################################################################################
#
# This example demonstrates memory resources for allocation and management,
# copying data between device and pinned memory, and DLPack interop. Requires
# NumPy 2.1.0+.
#
# ################################################################################
# /// script
# dependencies = ["cuda_bindings", "cuda_core", "nvidia-cuda-nvrtc", "cupy-cuda13x"]
# ///
import sys
import cupy as cp
import numpy as np
from cuda.core import (
    Device,
    LaunchConfig,
    LegacyPinnedMemoryResource,
    Program,
    ProgramOptions,
    launch,
)
# Kernel for memory operations
code = """
extern "C"
__global__ void memory_ops(float* device_data,
                           float* pinned_data,
                           size_t N) {
    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        // Access device memory
        device_data[tid] = device_data[tid] + 1.0f;

        // Access pinned memory (zero-copy from GPU)
        pinned_data[tid] = pinned_data[tid] * 3.0f;
    }
}
"""
def main():
    # Use NumpyVersion for the check; comparing raw version strings
    # lexicographically can misorder releases.
    if np.lib.NumpyVersion(np.__version__) < "2.1.0":
        print("This example requires NumPy 2.1.0 or later", file=sys.stderr)
        sys.exit(1)
    dev = Device()
    dev.set_current()
    stream = dev.create_stream()
    # Tell CuPy to use our stream as the current stream:
    cp.cuda.ExternalStream(int(stream.handle)).use()

    device_buffer = None
    pinned_buffer = None
    new_device_buffer = None
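    # The buffers start as None so the finally block below can close only
    # what was actually allocated, even if an earlier step raises.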
    try:
        # Compile kernel
        program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
        prog = Program(code, code_type="c++", options=program_options)
        mod = prog.compile("cubin")
        kernel = mod.get_kernel("memory_ops")
        # Create different memory resources
        device_mr = dev.memory_resource
        pinned_mr = LegacyPinnedMemoryResource()

        # Allocate different types of memory
        size = 1024
        dtype = cp.float32
        element_size = dtype().itemsize
        total_size = size * element_size
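        # Passing stream=stream keeps the device allocations stream-ordered
        # (no device-wide sync); for the pinned host allocation the stream
        # argument chiefly orders its eventual deallocation.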
        # 1. Device Memory (GPU-only)
        device_buffer = device_mr.allocate(total_size, stream=stream)
        device_array = cp.from_dlpack(device_buffer).view(dtype=dtype)

        # 2. Pinned Memory (CPU memory, GPU accessible)
        pinned_buffer = pinned_mr.allocate(total_size, stream=stream)
        pinned_array = np.from_dlpack(pinned_buffer).view(dtype=dtype)
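        # Both from_dlpack() calls are zero-copy: the arrays alias the
        # buffers' bytes, and .view(dtype) merely reinterprets those bytes as
        # float32, so the kernel's writes are visible through these views.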
        # Initialize data
        rng = cp.random.default_rng()
        device_array[:] = rng.random(size, dtype=dtype)
        pinned_array[:] = rng.random(size, dtype=dtype).get()

        # Store original values for verification
        device_original = device_array.copy()
        pinned_original = pinned_array.copy()

        # Sync before kernel launch (conservative: the launch below uses the
        # same stream, so it is already ordered after the work above)
        stream.sync()
        # Launch kernel
        block = 256
        grid = (size + block - 1) // block
        config = LaunchConfig(grid=grid, block=block)
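        # launch() forwards the Buffer objects as raw device pointers, and
        # cp.uint64(size) matches the kernel's 64-bit size_t parameter.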
        launch(stream, config, kernel, device_buffer, pinned_buffer, cp.uint64(size))
        stream.sync()
        # Verify kernel operations
        assert cp.allclose(device_array, device_original + 1.0), "Device memory operation failed"
        assert cp.allclose(pinned_array, pinned_original * 3.0), "Pinned memory operation failed"
        # Copy data between different memory types
        print("\nCopying data between memory types...", file=sys.stderr)

        # Copy from device to pinned memory
        device_buffer.copy_to(pinned_buffer, stream=stream)
        stream.sync()

        # Verify the copy operation
        assert cp.allclose(pinned_array, device_array), "Device to pinned copy failed"
        # Create a new device buffer and copy from pinned
        new_device_buffer = device_mr.allocate(total_size, stream=stream)
        new_device_array = cp.from_dlpack(new_device_buffer).view(dtype=dtype)
        pinned_buffer.copy_to(new_device_buffer, stream=stream)
        stream.sync()

        # Verify the copy operation
        assert cp.allclose(new_device_array, pinned_array), "Pinned to device copy failed"

        print("Memory management example completed!")
    finally:
        # Clean up resources even if verification fails. Each close(stream)
        # is ordered on the stream, so the frees cannot race work queued above.
        if new_device_buffer is not None:
            new_device_buffer.close(stream)
            assert new_device_buffer.handle == 0, "New device buffer should be closed"
        if pinned_buffer is not None:
            pinned_buffer.close(stream)
            assert pinned_buffer.handle == 0, "Pinned buffer should be closed"
        if device_buffer is not None:
            device_buffer.close(stream)
            assert device_buffer.handle == 0, "Device buffer should be closed"
        stream.close()
        cp.cuda.Stream.null.use()  # reset CuPy's current stream to the null stream
if __name__ == "__main__":
    main()