9
9
# 1. How to use the JIT LTO feature provided by the Linker class to link multiple objects together
10
10
# 2. That linking allows for libraries to modify workflows dynamically at runtime
11
11
#
12
- # This demo mimics a relationship between a library and a user. The user's sole responsability is to
13
- # provide device code that generates art. Where as the library is responsible for all steps involved in
14
- # setting up the device, launch configurations and arguments as well as linking the provided device code.
12
+ # This demo mimics a relationship between a library and a user. The user's sole responsibility is to
13
+ # provide device code that generates some art. Whereas the library is responsible for all steps involved in
14
+ # setting up the device, launch configurations and arguments, as well as linking the provided device code.
15
15
#
16
16
# Two algorithms are implemented:
17
17
# 1. A Mandelbrot set
37
37
# The provided device code must contain a function with the signature `void generate_art(float* Data)`
38
38
class MockLibrary :
39
39
def __init__ (self ):
40
- # For this mock library, the main workflow is intentially kept simple by limiting itself to only calling the
40
+ # For this mock library, the main workflow is intentionally kept simple by limiting itself to only calling the
41
41
# externally defined generate_art function. More involved libraries have the option of applying pre and post
42
- # processing steps before calling user-defined device code. Conversely, these responsabilities can be reversed
42
+ # processing steps before calling user-defined device code. Conversely, these responsibilities can be reversed
43
43
# such that the library owns the bulk of the workflow while allowing users to provide customized pre/post
44
44
# processing steps.
45
45
code_main = r"""
@@ -61,33 +61,32 @@ def __init__(self):
61
61
62
62
# Most of the launch configurations can be preemptively done before the user provides their device code
63
63
# Therefore, let's compile our main workflow device code now, and link the remaining pieces at a later time.
64
- self .arch = "" .join (f"{ i } " for i in Device ().compute_capability )
65
- self .program_options = ProgramOptions (std = "c++11" , arch = f"sm_{ self .arch } " , relocatable_device_code = True )
64
+ self .program_options = ProgramOptions (relocatable_device_code = True )
66
65
self .main_object_code = Program (code_main , "c++" , options = self .program_options ).compile ("ptx" )
67
66
68
67
# Set up device state
69
68
self .dev = Device ()
70
69
self .dev .set_current ()
71
70
self .stream = self .dev .create_stream ()
72
71
73
- # Setup buffer to store our results
72
+ # Setup a buffer to store the RGBA results for the width and height specified
74
73
self .width = 1024
75
74
self .height = 512
76
75
self .buffer = cp .empty (self .width * self .height * 4 , dtype = cp .float32 )
77
76
78
77
# Set up the launch configuration such that each thread generates one pixel, and subdivide
79
78
# the problem into 16x16 chunks.
80
- self .grid = (self .width / 16 , self .height / 16 , 1 )
79
+ self .grid = (self .width / 16 , self .height / 16 , 1.0 )
81
80
self .block = (16 , 16 , 1 )
82
81
self .config = LaunchConfig (grid = self .grid , block = self .block , stream = self .stream )
83
82
84
83
def link (self , user_code , target_type ):
85
84
if target_type == "ltoir" :
86
- program_options = ProgramOptions (std = "c++11" , arch = f"sm_ { self . arch } " , link_time_optimization = True )
87
- linker_options = LinkerOptions (arch = f"sm_ { self . arch } " , link_time_optimization = True )
85
+ program_options = ProgramOptions (link_time_optimization = True )
86
+ linker_options = LinkerOptions (link_time_optimization = True )
88
87
elif target_type == "ptx" :
89
88
program_options = self .program_options
90
- linker_options = LinkerOptions (arch = f"sm_ { self . arch } " )
89
+ linker_options = LinkerOptions ()
91
90
else :
92
91
raise AssertionError
93
92
@@ -119,7 +118,7 @@ def run(self, kernel):
119
118
# http://en.wikipedia.org/wiki/Mandelbrot_set
120
119
#
121
120
# Note that this kernel is meant to be a simple, straightforward
122
- # implementation, and so may not represent optimized GPU code.
121
+ # implementation. No attempt is made to optimize this GPU code.
123
122
code_mandelbrot = r"""
124
123
__device__
125
124
void generate_art(float* Data) {
@@ -168,18 +167,19 @@ def run(self, kernel):
168
167
B = (float)ColorB / 25.0f;
169
168
A = 1.0f;
170
169
171
- Data[DataY*Width*4+DataX*4+0] = R;
172
- Data[DataY*Width*4+DataX*4+1] = G;
173
- Data[DataY*Width*4+DataX*4+2] = B;
174
- Data[DataY*Width*4+DataX*4+3] = A;
170
+ unsigned i = DataY*Width*4+DataX*4;
171
+ Data[i+0] = R;
172
+ Data[i+1] = G;
173
+ Data[i+2] = B;
174
+ Data[i+3] = A;
175
175
}
176
176
"""
177
177
178
178
# Simple implementation of Julia set from Wikipedia
179
179
# http://en.wikipedia.org/wiki/Julia_set
180
180
#
181
181
# Note that this kernel is meant to be a simple, straightforward
182
- # implementation, and so may not represent optimized GPU code.
182
+ # implementation. No attempt is made to optimize this GPU code.
183
183
code_julia = r"""
184
184
__device__
185
185
void generate_art(float* Data) {
@@ -224,15 +224,16 @@ def run(self, kernel):
224
224
B = (float)ColorB / 25.0f;
225
225
A = 1.0f;
226
226
227
- Data[DataY*Width*4+DataX*4+0] = R;
228
- Data[DataY*Width*4+DataX*4+1] = G;
229
- Data[DataY*Width*4+DataX*4+2] = B;
230
- Data[DataY*Width*4+DataX*4+3] = A;
227
+ unsigned i = DataY*Width*4+DataX*4;
228
+ Data[i+0] = R;
229
+ Data[i+1] = G;
230
+ Data[i+2] = B;
231
+ Data[i+3] = A;
231
232
}
232
233
"""
233
234
234
235
235
- if __name__ == "__main__" :
236
+ def main () :
236
237
# Parse command line arguments
237
238
# Two different kernels are implemented with unique algorithms, and the user can choose which one should be used
238
239
# Both kernels fulfill the signature required by the MockLibrary: `void generate_art(float* Data)`
@@ -296,3 +297,7 @@ def run(self, kernel):
296
297
plt .show ()
297
298
298
299
print ("done!" )
300
+
301
+
302
+ if __name__ == "__main__" :
303
+ main ()
0 commit comments