Commit 59d5dea

Merge branch 'main' into 324-linker-example
2 parents 68c9365 + fc8be1c commit 59d5dea

4 files changed: +275 -46 lines changed

cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py

Lines changed: 18 additions & 14 deletions
@@ -58,6 +58,10 @@
 NUM_THREADS = 256
 
 
+def elems_to_bytes(nelems, dt):
+    return nelems * np.dtype(dt).itemsize
+
+
 def main():
     print("CUDA Clock sample")
 
@@ -75,31 +79,31 @@ def main():
     kernelHelper = common.KernelHelper(clock_nvrtc, devID)
     kernel_addr = kernelHelper.getFunction(b"timedReduction")
 
-    dinput = checkCudaErrors(cuda.cuMemAlloc(np.dtype(np.float32).itemsize * NUM_THREADS * 2))
-    doutput = checkCudaErrors(cuda.cuMemAlloc(np.dtype(np.float32).itemsize * NUM_BLOCKS))
-    dtimer = checkCudaErrors(cuda.cuMemAlloc(np.dtype(np.int64).itemsize * NUM_BLOCKS * 2))
-    checkCudaErrors(cuda.cuMemcpyHtoD(dinput, hinput, np.dtype(np.float32).itemsize * NUM_THREADS * 2))
+    dinput = checkCudaErrors(cuda.cuMemAlloc(hinput.nbytes))
+    doutput = checkCudaErrors(cuda.cuMemAlloc(elems_to_bytes(NUM_BLOCKS, np.float32)))
+    dtimer = checkCudaErrors(cuda.cuMemAlloc(timer.nbytes))
+    checkCudaErrors(cuda.cuMemcpyHtoD(dinput, hinput, hinput.nbytes))
+
+    args = ((dinput, doutput, dtimer), (None, None, None))
+    shared_memory_nbytes = elems_to_bytes(2 * NUM_THREADS, np.float32)
 
-    arr = ((dinput, doutput, dtimer), (None, None, None))
+    grid_dims = (NUM_BLOCKS, 1, 1)
+    block_dims = (NUM_THREADS, 1, 1)
 
     checkCudaErrors(
         cuda.cuLaunchKernel(
             kernel_addr,
-            NUM_BLOCKS,
-            1,
-            1,  # grid dim
-            NUM_THREADS,
-            1,
-            1,  # block dim
-            np.dtype(np.float32).itemsize * 2 * NUM_THREADS,
+            *grid_dims,  # grid dim
+            *block_dims,  # block dim
+            shared_memory_nbytes,
             0,  # shared mem, stream
-            arr,
+            args,
             0,
         )
     )  # arguments
 
     checkCudaErrors(cuda.cuCtxSynchronize())
-    checkCudaErrors(cuda.cuMemcpyDtoH(timer, dtimer, np.dtype(np.int64).itemsize * NUM_BLOCKS * 2))
+    checkCudaErrors(cuda.cuMemcpyDtoH(timer, dtimer, timer.nbytes))
     checkCudaErrors(cuda.cuMemFree(dinput))
     checkCudaErrors(cuda.cuMemFree(doutput))
     checkCudaErrors(cuda.cuMemFree(dtimer))
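The hunks above swap hand-computed np.dtype(...).itemsize * count expressions for ndarray.nbytes and the new elems_to_bytes helper, and pack the launch dimensions into grid_dims/block_dims tuples that are star-unpacked into cuLaunchKernel. A minimal standalone sketch of the byte-size equivalence (NUM_BLOCKS and the array shapes are assumed for illustration; this is not part of the diff):

import numpy as np

NUM_THREADS = 256
NUM_BLOCKS = 64  # value assumed for this sketch

def elems_to_bytes(nelems, dt):
    # Same helper as in the diff: element count times the size of one element.
    return nelems * np.dtype(dt).itemsize

hinput = np.empty(NUM_THREADS * 2, dtype=np.float32)
timer = np.empty(NUM_BLOCKS * 2, dtype=np.int64)

# ndarray.nbytes already accounts for dtype and length, so it matches the
# old hand-computed expressions exactly:
assert hinput.nbytes == np.dtype(np.float32).itemsize * NUM_THREADS * 2
assert timer.nbytes == np.dtype(np.int64).itemsize * NUM_BLOCKS * 2
assert elems_to_bytes(2 * NUM_THREADS, np.float32) == np.dtype(np.float32).itemsize * 2 * NUM_THREADS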

cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py

Lines changed: 13 additions & 22 deletions
@@ -6,7 +6,6 @@
 # this software and related documentation outside the terms of the EULA
 # is strictly prohibited.
 import ctypes
-import math
 import sys
 import time
 
@@ -103,18 +102,11 @@ def main():
     num_faces = 6
     num_layers = 1
     cubemap_size = width * width * num_faces
-    size = cubemap_size * num_layers * np.dtype(np.float32).itemsize
-    h_data = np.zeros(cubemap_size * num_layers, dtype="float32")
-
-    for i in range(cubemap_size * num_layers):
-        h_data[i] = i
+    h_data = np.arange(cubemap_size * num_layers, dtype="float32")
+    size = h_data.nbytes
 
     # This is the expected transformation of the input data (the expected output)
-    h_data_ref = np.zeros(cubemap_size * num_layers, dtype="float32")
-
-    for layer in range(num_layers):
-        for i in range(cubemap_size):
-            h_data_ref[layer * cubemap_size + i] = -h_data[layer * cubemap_size + i] + layer
+    h_data_ref = np.repeat(np.arange(num_layers, dtype=h_data.dtype), cubemap_size) - h_data
 
     # Allocate device memory for result
     d_data = checkCudaErrors(cudart.cudaMalloc(size))
@@ -130,10 +122,11 @@ def main():
             cudart.cudaArrayCubemap,
         )
     )
+    width_nbytes = h_data[:width].nbytes
     myparms = cudart.cudaMemcpy3DParms()
     myparms.srcPos = cudart.make_cudaPos(0, 0, 0)
     myparms.dstPos = cudart.make_cudaPos(0, 0, 0)
-    myparms.srcPtr = cudart.make_cudaPitchedPtr(h_data, width * np.dtype(np.float32).itemsize, width, width)
+    myparms.srcPtr = cudart.make_cudaPitchedPtr(h_data, width_nbytes, width, width)
     myparms.dstArray = cu_3darray
     myparms.extent = cudart.make_cudaExtent(width, width, num_faces)
     myparms.kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
@@ -211,23 +204,21 @@ def main():
     print(f"{cubemap_size / ((stop - start + 1) / 1000.0) / 1e6:.2f} Mtexlookups/sec")
 
     # Allocate mem for the result on host side
-    h_odata = np.zeros(cubemap_size * num_layers, dtype="float32")
+    h_odata = np.empty_like(h_data)
     # Copy result from device to host
     checkCudaErrors(cudart.cudaMemcpy(h_odata, d_data, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost))
 
-    print("Comparing kernel output to expected data")
-    MIN_EPSILON_ERROR = 5.0e-3
-    for i in range(cubemap_size * num_layers):
-        d = h_odata[i] - h_data_ref[i]
-        if math.fabs(d) > MIN_EPSILON_ERROR:
-            print("Failed")
-            sys.exit(-1)
-    print("Passed")
-
     checkCudaErrors(cudart.cudaDestroyTextureObject(tex))
     checkCudaErrors(cudart.cudaFree(d_data))
     checkCudaErrors(cudart.cudaFreeArray(cu_3darray))
 
+    print("Comparing kernel output to expected data")
+    MIN_EPSILON_ERROR = 5.0e-3
+    if np.max(np.abs(h_odata - h_data_ref)) > MIN_EPSILON_ERROR:
+        print("Failed")
+        sys.exit(-1)
+    print("Passed")
+
 
 if __name__ == "__main__":
     main()
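This diff replaces the nested Python loops that built the input and reference data with vectorized NumPy expressions, and collapses the element-by-element tolerance loop into a single reduction. A quick standalone check of the equivalence, using small illustrative sizes rather than the sample's real cubemap dimensions:

import numpy as np

# Illustrative sizes only; the sample derives these from its texture width.
cubemap_size, num_layers = 8, 2
h_data = np.arange(cubemap_size * num_layers, dtype="float32")

# Old form: explicit nested loops.
ref_loop = np.zeros(cubemap_size * num_layers, dtype="float32")
for layer in range(num_layers):
    for i in range(cubemap_size):
        ref_loop[layer * cubemap_size + i] = -h_data[layer * cubemap_size + i] + layer

# New form: one vectorized expression, as in the diff.
ref_vec = np.repeat(np.arange(num_layers, dtype=h_data.dtype), cubemap_size) - h_data

assert np.array_equal(ref_loop, ref_vec)

# The per-element tolerance check likewise becomes a single reduction.
MIN_EPSILON_ERROR = 5.0e-3
h_odata = ref_vec.copy()  # stand-in for the kernel output
assert np.max(np.abs(h_odata - ref_vec)) <= MIN_EPSILON_ERROR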

cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py

Lines changed: 10 additions & 10 deletions
@@ -38,7 +38,7 @@
 def main():
     print("Vector Addition (Driver API)")
     N = 50000
-    size = N * np.dtype(np.float32).itemsize
+    nbytes = N * np.dtype(np.float32).itemsize
 
     # Initialize
     checkCudaErrors(cuda.cuInit(0))
@@ -57,18 +57,18 @@ def main():
     _VecAdd_kernel = kernelHelper.getFunction(b"VecAdd_kernel")
 
     # Allocate input vectors h_A and h_B in host memory
-    h_A = np.random.rand(size).astype(dtype=np.float32)
-    h_B = np.random.rand(size).astype(dtype=np.float32)
-    h_C = np.random.rand(size).astype(dtype=np.float32)
+    h_A = np.random.rand(N).astype(dtype=np.float32)
+    h_B = np.random.rand(N).astype(dtype=np.float32)
+    h_C = np.random.rand(N).astype(dtype=np.float32)
 
     # Allocate vectors in device memory
-    d_A = checkCudaErrors(cuda.cuMemAlloc(size))
-    d_B = checkCudaErrors(cuda.cuMemAlloc(size))
-    d_C = checkCudaErrors(cuda.cuMemAlloc(size))
+    d_A = checkCudaErrors(cuda.cuMemAlloc(nbytes))
+    d_B = checkCudaErrors(cuda.cuMemAlloc(nbytes))
+    d_C = checkCudaErrors(cuda.cuMemAlloc(nbytes))
 
     # Copy vectors from host memory to device memory
-    checkCudaErrors(cuda.cuMemcpyHtoD(d_A, h_A, size))
-    checkCudaErrors(cuda.cuMemcpyHtoD(d_B, h_B, size))
+    checkCudaErrors(cuda.cuMemcpyHtoD(d_A, h_A, nbytes))
+    checkCudaErrors(cuda.cuMemcpyHtoD(d_B, h_B, nbytes))
 
     if True:
         # Grid/Block configuration
@@ -98,7 +98,7 @@ def main():
 
     # Copy result from device memory to host memory
     # h_C contains the result in host memory
-    checkCudaErrors(cuda.cuMemcpyDtoH(h_C, d_C, size))
+    checkCudaErrors(cuda.cuMemcpyDtoH(h_C, d_C, nbytes))
 
     for i in range(N):
         sum_all = h_A[i] + h_B[i]
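The rename from size to nbytes also tightens the host-side allocations: the old np.random.rand(size) sized the host vectors by the byte count, so they held four times as many float32 elements as the N-element device buffers (only the first N were ever copied or verified). A small standalone sketch of the difference, with illustrative variable names:

import numpy as np

N = 50000
nbytes = N * np.dtype(np.float32).itemsize  # 200000 bytes

# Before: host vectors sized by the byte count (4x too many elements).
h_oversized = np.random.rand(nbytes).astype(dtype=np.float32)
# After: host vectors hold exactly N elements, matching the device buffers.
h_A = np.random.rand(N).astype(dtype=np.float32)

assert h_oversized.size == 4 * N
assert h_A.size == N and h_A.nbytes == nbytes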
