The persistence of memory Parallel Programming 2012 Dan R. Ghica // Univ. of Birmingham
1
Structure of CUDA memory Reg
Reg
Reg
Reg
Reg
Reg
Thread Thread Thread
Thread Thread Thread
Shared memory
Shared memory
Block
Block Global Memory Constant Memory
Grid 2
CUDA var type qualifiers Declaration
Memory
Scope Lifetime
__local__ int x;
local
thread
thread
__shared__ int x;
shared
block
block
__device__ int x;
global
grid
app
__constant__ int x;
constant
grid
app
3
Where to put a var? Can host access it? global constant
yes
no
Outside of any Function
shared local register/automatic
In the kernel 4
Var restrictions • stored pointers can only be to global memory • but shared vars can be passed by reference (!) • pointers always allocated from host • pointers can only be passed as args to kernel • pointers can be taken from global vars • linked data structures in the device memory rarely make sense
• mostly, stick to arrays as the basic data struct 5
Tiling : processing huge datasets
kernel
6
Tiling : processing huge datasets
block
block
block
block
block
block
kernel block
block
7
Common kernel pattern • copy tile from global to shared memory • __syncthreads() • process data • __syncthreads() • copy tile from shared to global memory 8
Co-operative tile loading K threads load K locations in one step!
9
Matrix multiplication
10
But first, let's discuss the lab! student 1
student 2
throughput
952215
958518
3x
961192
954550
2x
1003998
1025266
4x
953461
937260
45x
11
hard for CPUs
Matrix multiplication
12
X
a
=
b
c
c[i][j] = a[i][0]*b[0][j] + ... + a[i][n-1]*b[n-1][j] 13
// No shared memory: naive matrix multiply c = a x b (all operands in global memory).
// Launch with a (SIZE/BLOCK_SIZE) x (SIZE/BLOCK_SIZE) grid of BLOCK_SIZE x BLOCK_SIZE
// thread blocks; each thread computes one element of c. Matrices are SIZE x SIZE,
// stored as flat row-contiguous float arrays indexed [col + SIZE * row].
__global__ void axb_in_c(float* a, float* b, float* c)
{
    int t_x = threadIdx.x;
    int t_y = threadIdx.y;
    int b_x = blockIdx.x * BLOCK_SIZE;  // column offset of this block in b and c
    int a_y = blockIdx.y * BLOCK_SIZE;  // row offset of this block in a and c

    // Accumulate the dot product in a register instead of read-modify-writing
    // global memory on every iteration (same final value, far fewer global accesses).
    float acc = 0.0f;
    for (int a_x = 0; a_x < SIZE; a_x += 1) {
        int b_y = a_x;
        // acc += a[a_x][a_y+t_y] * b[b_x+t_x][b_y]
        acc += a[a_x + SIZE * (a_y + t_y)] * b[b_x + t_x + SIZE * b_y];
        // NOTE(review): the original called __syncthreads() here; with no shared
        // memory and no inter-thread data flow the barrier served no purpose.
    }

    // c[b_x+t_x][a_y+t_y] = acc
    c[b_x + t_x + SIZE * (a_y + t_y)] = acc;
}
14
Performance: 27x improvement — not bad! Matrix size: 320, Grid size: 20, Block size: 16. Run 20 Kernels. Device Throughput = 13.1059 GFlop/s. Run host. Host Throughput = 0.4821 GFlop/s
15
X
a
=
b
c
C[i][j] = A[i][0] X B[0][j] + ... + A[i][n/p] X B[n/p][j]
(algebra...) 16
1 2
X 1
2
3
4
= 3
acc
4
Every block works on one tile in the result. Every thread works on one element in the result. Every tile in the result needs p tiles in each argument. 17
// Multiply one BLOCK_SIZE x BLOCK_SIZE tile of A by one tile of B and
// accumulate into the corresponding tile of C. Tiles are flat arrays
// indexed [x + BLOCK_SIZE * y]; (t_x, t_y) identifies the calling thread,
// so each thread updates exactly one element of the C tile.
__device__ void tile_mult(float* t_a,  // pointer to the A tile
                          float* t_b,  // pointer to the B tile
                          float* t_c,  // pointer to the C tile (accumulator)
                          int t_x,     // thread id on x
                          int t_y)     // thread id on y
{
    int row = BLOCK_SIZE * t_y;  // loop-invariant row offset for this thread
    for (int k = 0; k < BLOCK_SIZE; k++) {
        // t_c[t_x][t_y] += t_a[k][t_y] * t_b[t_x][k]
        t_c[t_x + row] += t_a[k + row] * t_b[t_x + BLOCK_SIZE * k];
    }
}
Understanding the coordinates bx (bx, ay) ay A
B
C
19
// Tiled matrix multiply kernel: c = a x b using shared-memory tiles.
// Each block computes one BLOCK_SIZE x BLOCK_SIZE tile of c; each thread
// computes one element. The loop walks p = SIZE/BLOCK_SIZE tile pairs,
// co-operatively loading one A tile and one B tile into shared memory per step.
// Shared memory required: 3 * BLOCK_SIZE^2 * sizeof(float) per block.
__global__ void axb_in_c(float* a, float* b, float* c)
{
    __shared__ float s_a[BLOCK_SIZE * BLOCK_SIZE],
                     s_b[BLOCK_SIZE * BLOCK_SIZE],
                     s_c[BLOCK_SIZE * BLOCK_SIZE];

    int t_x = threadIdx.x;
    int t_y = threadIdx.y;
    int b_x = blockIdx.x * BLOCK_SIZE;  // column offset of this block's tile
    int a_y = blockIdx.y * BLOCK_SIZE;  // row offset of this block's tile

    // Each thread only ever touches its own s_c element, so no barrier is
    // needed around this initialization.
    s_c[t_x + BLOCK_SIZE * t_y] = 0.0f;

    for (int a_x = 0; a_x < SIZE; a_x += BLOCK_SIZE) {
        int b_y = a_x;
        // Co-operative load: each thread fetches one element of each tile.
        // s_a[t_x][t_y] = a[a_x+t_x][a_y+t_y]
        s_a[t_x + BLOCK_SIZE * t_y] = a[a_x + t_x + SIZE * (a_y + t_y)];
        // s_b[t_x][t_y] = b[b_x+t_x][b_y+t_y]  (fixed: original comment said a[...])
        s_b[t_x + BLOCK_SIZE * t_y] = b[b_x + t_x + SIZE * (b_y + t_y)];

        __syncthreads();  // tiles fully loaded before any thread reads them
        tile_mult(s_a, s_b, s_c, t_x, t_y);
        __syncthreads();  // all reads done before the next iteration overwrites the tiles
    }

    // Store the finished C tile once, after the loop. Each thread writes only
    // its own s_c element, so no further barrier is needed. (The original wrote
    // c and re-synchronized on every loop iteration — both were redundant.)
    // c[b_x+t_x][a_y+t_y] = s_c[t_x][t_y]
    c[b_x + t_x + SIZE * (a_y + t_y)] = s_c[t_x + BLOCK_SIZE * t_y];
}
21
What (if any) barriers not needed? ... s_c[t_x + BLOCK_SIZE * t_y] = 0.0; for(int a_x = 0; a_x < SIZE; a_x += BLOCK_SIZE) { int b_y = a_x; s_a[t_x + BLOCK_SIZE * t_y] = a[a_x + t_x + SIZE * (a_y + t_y)]; s_b[t_x + BLOCK_SIZE * t_y] = b[b_x + t_x + SIZE * (b_y + t_y)]; __syncthreads(); tile_mult(s_a, s_b, s_c, t_x, t_y); __syncthreads(); c[b_x + t_x + SIZE *(a_y + t_y)] = s_c[t_x + BLOCK_SIZE * t_y]; __syncthreads(); }
22
Benchmark dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); dim3 dimGrid(GRID_SIZE, GRID_SIZE); axb_in_c<<<dimGrid, dimBlock>>>(...);  (80x speedup!)
Blocksize vs. throughput Block size
Throughput
2
1.45
4
8.08
8
37.85
16
36.00
32
33.70
64
(> 48 KB SM)
size = 256 x 256 24
Tesla C2070: Shared Memory vs. Threading • • • • • • • • •
Each SM has 48KB shared memory (implem. dep.) 14 x 32 = 448 cores, 448 x 4 = 1,792 threads For tiles of size 16, each thread has 3*256*4B = 3KB of shared memory and 256 threads we can have 1792/256td = 7 blocks (even) active at the same time This means up to 7b*2ld*256td = 3,584 pending loads. A block size of 32 means 3*32*32*4B= 12KB shared memory and 1024 tds 1792/1024 = 1 active block, 1*2*32*32=2,048 pending loads tiles of size 16 achieve better throughput to memory un-tiled (block of 16) has 769 loads per thread (!) 25
Loop unrolling #pragma unroll for(int i = 0; i < BLOCK_SIZE; i++) ... #pragma unroll for(int a_x = 0; a_x < SIZE; a_x += BLOCK_SIZE) ...
wow!
blocksize
no unroll
unroll
8
37
39
16
36
59
32
33
63
26
Global memory access • not cached : managing it is crucial • access to it is very costly • data must be aligned for efficient access
• memory access in half-warps can be coalesced into one transaction
Chapter 5. Performance Guidelines
• coalescing is device-dependent
"#$!%&&'())!*+!%!,%'-%./(!'()-&-#0!-#!0/*.%/!1(1*'$!*'!'(23'#(&!.$!*#(!*+!24(! 1(1*'$!%//*5%2-*#!'*32-#()!+'*1!24(!&'-,('!*'!'3#2-1(!"67!-)!%/8%$)!%/-0#(&!2*!%2! /(%)2!9:;!.$2()8%'?!@&3'-#0!24(!(A(532-*#!*+!%!)-#0/(!'(%&!*'! 8'-2(!-#)2'352-*#B!5%#!.(!!"#$%&!%'!-#2*!%!)-#0/(!1(1*'$!2'%#)%52-*#!*+!C9=!;D=!*'!E9F! .$2()
The rest of this section describes the various requirements for memory accesses to coalesce based on the compute capability of the device. If a half-warp fulfills these requirements, coalescing is achieved even if the warp is divergent and some threads of the half-warp do not actually access memory.
historical interest
and curiosity — For the purpose of the following discussion, global memory is considered to be partitioned into segments of size equal to 32, 64, or 128 bytes and aligned to this size.
Coalescing on Devices with Compute Capability 1.0 and 1.1: The global memory access by all threads of a half-warp is coalesced into one or two memory transactions if it satisfies the following three conditions. Threads must access either 32-bit words, resulting in one 64-byte memory transaction, or 64-bit words, resulting in one 128-byte memory transaction, or 128-bit words, resulting in two 128-byte memory transactions; all 16 words must lie in the same segment of size equal to the memory transaction size (or twice the memory transaction size when accessing 128-bit words); threads must access the words in sequence: the kth thread in the half-warp must access the kth word. If a half-warp does not fulfill all the requirements above, a separate memory transaction is issued for each thread and throughput is significantly reduced.
Figure 5-1 shows some examples of coalesced memory accesses, while Figure 5-2 and Figure 5-3 show some examples of memory accesses that are non-coalesced for devices of compute capability 1.0 or 1.1. Coalesced 64-bit accesses deliver a little lower bandwidth than coalesced 32-bit accesses, and coalesced 128-bit accesses deliver a noticeably lower bandwidth than coalesced 32-bit accesses. Bandwidth for non-coalesced accesses is around an order of magnitude lower than for coalesced accesses when these
Chapter 5. Performance Guidelines
Thread 0
Address 128
Thread 0
Address 128
Thread 1
Address 132
Thread 1
Address 132
Thread 2
Address 136
Thread 2
Address 136
Thread 3
Address 140
Thread 3
Address 140
Thread 4
Address 144
Thread 4
Address 144
Thread 5
Address 148
Thread 5
Address 148
Thread 6
Address 152
Thread 6
Address 152
Thread 7
Address 156
Thread 7
Address 156
Thread 8
Address 160
Thread 8
Address 160
Thread 9
Address 164
Thread 9
Address 164
Thread 10
Address 168
Thread 10
Address 168
Thread 11
Address 172
Thread 11
Address 172
Thread 12
Address 176
Thread 12
Address 176
Thread 13
Address 180
Thread 13
Address 180
Thread 14
Address 184
Thread 14
Address 184
Thread 15
Address 188
Thread 15
Address 188
! Left: coalesced float memory access, resulting in a single memory transaction.
Right: coalesced float memory access (divergent warp), resulting in a single memory transaction. (Chapter 5. Performance Guidelines) 29
Figure 5-1.Examples of Coalesced Global Memory Access Patterns
!
Thread 0
Address 128
Thread 1
Address 132
Thread 2
Address 136
Thread 2
Address 136
Thread 3
Address 140
Thread 3
Address 140
Thread 4
Address 144
Thread 4
Address 144
Thread 5
Address 148
Thread 5
Address 148
Thread 6
Address 152
Thread 6
Address 152
Thread 7
Address 156
Thread 7
Address 156
Thread 8
Address 160
Thread 8
Address 160
Thread 9
Address 164
Thread 9
Address 164
Thread 10
Address 168
Thread 10
Address 168
Thread 11
Address 172
Thread 11
Address 172
Thread 12
Address 176
Thread 12
Address 176
Thread 13
Address 180
Thread 13
Address 180
Thread 14
Address 184
Thread 14
Address 184
Thread 15
Address 188
Thread 15
Address 188
84
Thread 0
Address 128
Thread 1 Address 132 CUDA Programming Guide Version 2.2.1!
! Left: non-sequential float memory access, resulting in 16 memory transactions.
Right: access with a misaligned starting address, resulting in 16 memory transactions. 30
Figure 5-2.Examples of Global Memory Access Patterns That Are Non-Coalesced for Devices of Compute Capability 1.0 or 1.1
coalesce based on the compute capability of the device. If a half-warp fulfills these requirements, coalescing is achieved even if the warp is divergent and some threads of the half-warp do not actually access memory.
For the purpose of the following discussion, global memory is considered to be partitioned into segments of size equal to 32, 64, or 128 bytes and aligned to this size.
Address 128
Thread 1
Address 132
Thread 2
Address 136
Thread 0
Address 128
Coalescing on Devices with Compute Capability 1.0 and 1.1 — Thread 3
Thread 1
Address 140
Address 140
The global memory access by all threads of a half-warp is coalesced into one or two memory transactions if it satisfies the following three conditions. Thread 4
Address 144
Thread 5
Address 148
Threads must access either 32-bit words, resulting in one 64-byte memory transaction, or 64-bit words, resulting in one 128-byte memory transaction, or 128-bit words, resulting in two 128-byte memory transactions; all 16 words must lie in the same segment of size equal to the memory transaction size (or twice the memory transaction size when accessing 128-bit words); threads must access the words in sequence: the kth thread in the half-warp must access the kth word. If a half-warp does not fulfill all the requirements above, a separate memory transaction is issued for each thread and throughput is significantly reduced.
Thread 6
Address 152
Thread 7
Address 156
Thread 8
Address 160
Thread 9
Address 164
Thread 10
Address 168
Thread 11
Address 172
Thread 12
Address 176
Thread 13
Address 180
Thread 14
Address 184
Thread 15
Address 188
Thread 2
Address 152
Thread 3
Address 164
Thread 4
Address 176
Thread 5
Address 188
Left: non-contiguous float memory access, resulting in 16 memory transactions.
31
Figure 5-1 shows some examples of coalesced memory accesses, while Figure 5-2 and Figure 5-3 show some examples of memory accesses that are non-coalesced for devices of compute capability 1.0 or 1.1. Figure 5-3. Examples of Global Memory Access Patterns That Are Non-Coalesced for Devices of Compute Capability 1.0 or 1.1
!
Coalesced 64-bit accesses deliver a little lower bandwidth than coalesced 32-bit accesses, and coalesced 128-bit accesses deliver a noticeably lower bandwidth than coalesced 32-bit accesses. While bandwidth for non-coalesced accesses is around an order of magnitude lower than for coalesced accesses when these accesses are 32-bit, it is only around four times lower when they are 64-bit and around two times when they are 128-bit.
86
Coalescing on Devices with Compute Capability 1.2 and Higher: The global memory access by all threads of a half-warp is coalesced into a single memory transaction as soon as the words accessed by all threads lie in the same segment of size equal to: 32 bytes if all threads access 8-bit words, 64 bytes if all threads access 16-bit words, 128 bytes if all threads access 32-bit or 64-bit words.
CUDA Programming Guide Version 2.2.1!
2
32
Chapter 5. Performance Guidelines
! Address 120
Address 120
Address 96
Address 124
Address 124
Address 100
Thread 0
Address 128
Address 104
Thread 1
Address 132
Thread 1
Address 132
Address 108
Thread 2
Address 136
Thread 2
Address 136
Address 112
Thread 3
Address 140
Thread 3
Address 140
Address 116
Thread 4
Address 144
Thread 4
Address 144
Address 120
Thread 5
Address 148
Thread 5
Address 148
Thread 6
Address 152
Thread 6
Address 152
Thread 0
Address 128
Thread 7
Address 156
Thread 7
Address 156
Thread 1
Address 132
Thread 8
Address 160
Thread 8
Address 160
Thread 2
Address 136
Thread 9
Address 164
Thread 9
Address 164
Thread 3
Address 140
Thread 10
Address 168
Thread 10
Address 168
Thread 4
Address 144
Thread 11
Address 172
Thread 11
Address 172
Thread 5
Address 148
Thread 12
Address 176
Thread 12
Address 176
Thread 6
Address 152
Thread 13
Address 180
Thread 13
Address 180
Thread 7
Address 156
Thread 14
Address 184
Thread 14
Address 184
Thread 8
Address 160
Thread 15
Address 188
Thread 15
32B segment
Address 128
64B segment
Thread 0
Address 124
128B segment
Thread 9
Address 164
Address 192
Thread 10
Address 168
Address 196
Address 196
Thread 11
Address 172
Address 200
…
Thread 12
Address 176
Address 204
Address 204
Thread 13
Address 180
Address 208
Address 252
Thread 14
Address 184
Address
Address 256
Thread 15
Address
212
for same reason cache needs consecutive locations 64B segment
Address 188
Address 192
try to keep data at consecutive locations!
188
Left: random float memory access within a 64B segment, resulting in one memory transaction. Center: misaligned float memory access, resulting in one transaction. Right: misaligned float memory access, resulting in two transactions.
33
Figure 5-4.Examples of Global Memory Access by Devices with Compute Capability 1.2 and Higher ! 87!
CUDA Programming Guide Version 2.2.1
Shared memory access • low-latency • ... if no bank conflicts on write access
34
Memory banks core
core
core
core
mem
mem
mem
mem
35
No bank conflict core
core
core
core
mem
mem
mem
mem
36
Bank conflict x 2 core
core
core
core
mem
mem
mem
mem
37
Bank conflict x 4 core
core
core
core
mem
mem
mem
mem
38
No conflict on reads core
core
core
core
mem
mem
mem
mem
39
Bank conflict in CUDA • only threads in the same half-warp (16) can have bank conflicts • bank conflicts need to be serialized • n conflicts means n transactions • all threads are slowed down • successive 32-bit words assigned to successive banks • each bank can read one 32-bit word in 2 cycles
40
Striding the bank __shared__ float shared[32]; float data = shared[BaseIndex + s * tid];
• tid vs. tid+n : any bank conflicts? • what is a safe stride s? • any odd number 41
s=2
42
s=3
43
Banks and data size Chapter 5. Performance Guidelines
"#$%&!'()%)!*+$!,%-#.+-.-/!(&%!*$%-!%('$!#$&%(0!(''%))%)!(-!%1%,%-#!#$(#!.)! ),(11%&!+&!1(&/%&!#$(-!23!4.#)!.-!).5%6!7+&!%8(,91%:!#$%&%!(&%!4(-;!'+-<1.'#)!.! __shared__ char shared[32]; char data = shared[BaseIndex + tid];
4%'(?)%!shared[0]:!shared[1]:!shared[2]:!(-0!shared[3]:!<+&!%8(,91%:! 4%1+-/!#+!#$%!)(,%!4(-;6!@$%&%!(&%!-+!4(-;!'+-<1.'#)!$+*%A%&:!.! char data = shared[BaseIndex + 4 * tid];
@$%&%!(&%!(1)+!3B*(=!4(-;!'+-<1.'#)!<+&!(&&(=)!+! __shared__ double shared[32]; double data = shared[BaseIndex + tid];
).-'%!#$%!,%,+&=!&%C?%)#!.)!'+,9.1%0!.-#+!#*+!)%9(&(#%!23B4.#!&%C?%)#)6!"-%!*(=!#+! 44 (A+.0!4(-;!'+-<1.'#)!.-!#$.)!'()%!.)!#*+!)91.#!#$%!double!+9%&(-0)!1.;%!.-!#$%! <+11+*.-/!)(,91%!'+0%>! __shared__ int shared_lo[32]; __shared__ int shared_hi[32];
Chapter 5. Performance Guidelines
Some details are hairy...
"#$%&!&'%(!)&)*&+!#"!'%%&""&,!-#.(!'!".+#,&!/0!0#1&!*2.&"3!
4#$'5526!"('+&,!)&)/+2!'5"/!0&'.7+&"!'!*+/',%'".!)&%('$#")!-(&+&*2!'!89:*#.!-/+,! %'$!*&!+&',!'$,!*+/',%'".!./!"&1&+'5!.(+&',"!"#)75.'$&/7"52!-(&$!"&+1#%#$;!/$&! )&)/+2!+&',!+&<7&".3!=(#"!+&,7%&"!.(&!$7)*&+!/0!*'$>!%/$05#%."!-(&$!"&1&+'5! .(+&',"!/0!'!('50:-'+?!+&',!0+/)!'$!',,+&""!-#.(#$!.(&!"')&!89:*#.!-/+,3!@/+&! ?+&%#"&526!'!)&)/+2!+&',!+&<7&".!)',&!/0!"&1&+'5!',,+&""&"!#"!"&+1#%&,!#$!"&1&+'5! ".&?"!/1&+!.#)&!A!/$&!".&?!&1&+2!.-/!%5/%>!%2%5&"!A!*2!"&+1#%#$;!/$&!%/$05#%.:0+&&! "7*"&.!/0!.(&"&!',,+&""&"!?&+!".&?!7$.#5!'55!',,+&""&"!('1&!*&&$!"&+1#%&,B!'.!&'%(! ".&?6!.(&!"7*"&.!#"!*7#5.!0+/)!.(&!+&)'#$#$;!',,+&""&"!.('.!('1&!2&.!./!*&!"&+1#%&,! 7"#$;!.(&!0/55/-#$;!?+/%&,7+&C! D&5&%.!/$&!/0!.(&!-/+,"!?/#$.&,!./!*2!.(&!+&)'#$#$;!',,+&""&"!'"!.(&!*+/',%'".! -/+,6! ! E$%57,&!#$!.(&!"7*"&.C! ! F55!',,+&""&"!.('.!'+&!-#.(#$!.(&!*+/',%'".!-/+,6! ! G$&!',,+&""!0/+!&'%(!*'$>!?/#$.&,!./!*2!.(&!+&)'#$#$;!',,+&""&"3! H(#%(!-/+,!#"!"&5&%.&,!'"!.(&!*+/',%'".!-/+,!'$,!-(#%(!',,+&""!#"!?#%>&,!7?!0/+! “CUDA performance guidelines” 45 &'%(!*'$>!'.!&'%(!%2%5&!'+&!7$"?&%#0#&,3! !
F!%/))/$!%/$05#%.:0+&&!%'"&!#"!-(&$!'55!.(+&',"!/0!'!('50:-'+?!+&',!0+/)!'$!',,+&""! -#.(#$!.(&!"')&!89:*#.!-/+,3! 4#;7+&!I:J!"(/-"!"/)&!&K')?5&"!/0!)&)/+2!+&',!'%%&""&"!.('.!#$1/51&!.(&! *+/',%'".!)&%('$#")3!
How to optimise • start with straight forward solution • detect inefficiencies one at a time • high latencies, low utilisation,
divergences, bank conflicts etc.
• fix inefficiencies one at a time • combination of calculation and testing for tuning parameters
• repeat until out of ideas 46