Introduction To CUDA C
NVIDIA Corporation
© NVIDIA 2013
What is CUDA?
• CUDA Architecture
– Expose GPU parallelism for general-purpose computing
– Retain performance
• CUDA C/C++
– Based on industry-standard C/C++
– Small set of extensions to enable heterogeneous programming
– Straightforward APIs to manage devices, memory etc.
© NVIDIA 2013
Introduction to CUDA C/C++
• What will you learn in this session?
– Start from “Hello World!”
– Write and launch CUDA C/C++ kernels
– Manage GPU memory
– Manage communication and synchronization
© NVIDIA 2013
Prerequisites
• You (probably) need experience with C or C++
CONCEPTS: Heterogeneous Computing, Blocks, Threads, Indexing, Shared memory, __syncthreads(), Asynchronous operation, Handling errors, Managing devices
© NVIDIA 2013
HELLO WORLD!
CONCEPTS: Heterogeneous Computing, Blocks, Threads, Indexing, Shared memory, __syncthreads(), Asynchronous operation, Handling errors, Managing devices
Heterogeneous Computing
Terminology:
• Host: the CPU and its memory (host memory)
• Device: the GPU and its memory (device memory)
© NVIDIA 2013
Heterogeneous Computing
#include <iostream>
#include <algorithm>
using namespace std;

#define N          1024
#define RADIUS     3
#define BLOCK_SIZE 16

// parallel fn
__global__ void stencil_1d(int *in, int *out) {
    __shared__ int temp[BLOCK_SIZE + 2 * RADIUS];
    int gindex = threadIdx.x + blockIdx.x * blockDim.x;
    int lindex = threadIdx.x + RADIUS;

    // Read input elements into shared memory
    temp[lindex] = in[gindex];
    if (threadIdx.x < RADIUS) {
        temp[lindex - RADIUS] = in[gindex - RADIUS];
        temp[lindex + BLOCK_SIZE] = in[gindex + BLOCK_SIZE];
    }

    // Synchronize (ensure all the data is available)
    __syncthreads();

    // Apply the stencil
    int result = 0;
    for (int offset = -RADIUS ; offset <= RADIUS ; offset++)
        result += temp[lindex + offset];

    // Store the result
    out[gindex] = result;
}

void fill_ints(int *x, int n) {
    fill_n(x, n, 1);
}

// serial code
int main(void) {
    int *in, *out;          // host copies of a, b, c
    int *d_in, *d_out;      // device copies of a, b, c
    int size = (N + 2*RADIUS) * sizeof(int);

    // Alloc space for host copies and setup values
    in  = (int *)malloc(size); fill_ints(in,  N + 2*RADIUS);
    out = (int *)malloc(size); fill_ints(out, N + 2*RADIUS);

    // Alloc space for device copies
    cudaMalloc((void **)&d_in,  size);
    cudaMalloc((void **)&d_out, size);

    // Copy to device
    cudaMemcpy(d_in,  in,  size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_out, out, size, cudaMemcpyHostToDevice);

    // Launch stencil_1d() kernel on GPU (parallel code)
    stencil_1d<<<N/BLOCK_SIZE,BLOCK_SIZE>>>(d_in + RADIUS, d_out + RADIUS);

    // Copy result back to host (serial code)
    cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost);

    // Cleanup
    free(in); free(out);
    cudaFree(d_in); cudaFree(d_out);
    return 0;
}
© NVIDIA 2013
Simple Processing Flow
[Diagram: host and device connected by the PCI bus. Input data is copied from host memory to device memory, the GPU program is launched, and results are copied back to host memory.]
© NVIDIA 2013
Hello World!

#include <stdio.h>

int main(void) {
    printf("Hello World!\n");
    return 0;
}

• Standard C that runs on the host
• NVIDIA compiler (nvcc) can be used to compile programs with no device code

Output:
$ nvcc hello_world.cu
$ a.out
Hello World!
$
© NVIDIA 2013
Hello World! with Device Code
__global__ void mykernel(void) {
}
int main(void) {
mykernel<<<1,1>>>();
printf("Hello World!\n");
return 0;
}
© NVIDIA 2013
Hello World! with Device Code
__global__ void mykernel(void) {
}

• The CUDA C/C++ keyword __global__ indicates a function that runs on the device and is called from host code
• nvcc separates source code into host and device components; device functions are compiled by the NVIDIA compiler, host functions by the standard host compiler
© NVIDIA 2013
Hello World! with Device Code

mykernel<<<1,1>>>();

• Triple angle brackets mark a call from host code to device code, also called a "kernel launch"
• We will return to the parameters (1, 1) shortly
© NVIDIA 2013
Hello World! with Device Code

#include <stdio.h>

__global__ void mykernel(void) {
}

int main(void) {
    mykernel<<<1,1>>>();
    printf("Hello World!\n");
    return 0;
}

Output:
$ nvcc hello.cu
$ a.out
Hello World!
$

• mykernel() does nothing, somewhat anticlimactic!
© NVIDIA 2013
Parallel Programming in CUDA C/C++
• But wait… GPU computing is about massive parallelism!
[Diagram: element-wise addition of vectors a and b into vector c]
© NVIDIA 2013
Addition on the Device
• A simple kernel to add two integers
__global__ void add(int *a, int *b, int *c) {
*c = *a + *b;
}
© NVIDIA 2013
Addition on the Device
• Note that we use pointers for the variables
__global__ void add(int *a, int *b, int *c) {
*c = *a + *b;
}
© NVIDIA 2013
Memory Management
• Host and device memory are separate entities (a short sketch follows this list)
– Device pointers point to GPU memory
May be passed to/from host code
May not be dereferenced in host code
– Host pointers point to CPU memory
May be passed to/from device code
May not be dereferenced in device code
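The device-memory API mirrors the familiar C calls: cudaMalloc(), cudaMemcpy(), cudaFree() (they reappear in the Review and the examples below). A minimal sketch of allocating, copying to, and freeing device memory; the variable names and the value 42 are just illustrative:

int value = 42;                              // host data (example value)
int *d_value;                                // device pointer: points to GPU memory
cudaMalloc((void **)&d_value, sizeof(int));  // allocate space on the device
cudaMemcpy(d_value, &value, sizeof(int),
           cudaMemcpyHostToDevice);          // copy host -> device
// ... launch kernels that dereference d_value on the device ...
cudaFree(d_value);                           // release the device allocation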
© NVIDIA 2013
Addition on the Device: add()
• Returning to our add() kernel
__global__ void add(int *a, int *b, int *c) {
*c = *a + *b;
}
© NVIDIA 2013
Addition on the Device: main()
int main(void) {
int a, b, c; // host copies of a, b, c
int *d_a, *d_b, *d_c; // device copies of a, b, c
int size = sizeof(int);
© NVIDIA 2013
Addition on the Device: main()
// Copy inputs to device
cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);
// Cleanup
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
return 0;
}
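The two main() fragments above skip the steps that allocate device memory, launch the kernel, and copy the result back. A minimal end-to-end sketch, assuming the add() kernel shown earlier; the input values 2 and 7 are arbitrary:

#include <stdio.h>

__global__ void add(int *a, int *b, int *c) {
    *c = *a + *b;
}

int main(void) {
    int a = 2, b = 7, c;            // host copies of a, b, c (example values)
    int *d_a, *d_b, *d_c;           // device copies of a, b, c
    int size = sizeof(int);

    // Allocate space for device copies of a, b, c
    cudaMalloc((void **)&d_a, size);
    cudaMalloc((void **)&d_b, size);
    cudaMalloc((void **)&d_c, size);

    // Copy inputs to device
    cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);

    // Launch add() kernel on GPU
    add<<<1,1>>>(d_a, d_b, d_c);

    // Copy result back to host
    cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost);
    printf("%d + %d = %d\n", a, b, c);

    // Cleanup
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    return 0;
}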
© NVIDIA 2013
RUNNING IN PARALLEL
CONCEPTS: Heterogeneous Computing, Blocks, Threads, Indexing, Shared memory, __syncthreads(), Asynchronous operation, Handling errors, Managing devices
© NVIDIA 2013
Moving to Parallel
• GPU computing is about massive parallelism
– So how do we run code in parallel on the device?
add<<< 1, 1 >>>();
– Instead of executing add() once, execute it N times in parallel:
add<<< N, 1 >>>();
© NVIDIA 2013
Vector Addition on the Device
• With add() running in parallel we can do vector addition
– Terminology: each parallel invocation of add() is referred to as a block; each block can refer to its own index with blockIdx.x
© NVIDIA 2013
Vector Addition on the Device: add()
• Returning to our parallelized add() kernel
__global__ void add(int *a, int *b, int *c) {
c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x];
}
© NVIDIA 2013
Vector Addition on the Device: main()
#define N 512
int main(void) {
int *a, *b, *c; // host copies of a, b, c
int *d_a, *d_b, *d_c; // device copies of a, b, c
int size = N * sizeof(int);
© NVIDIA 2013
Vector Addition on the Device: main()
// Copy inputs to device
cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
// Cleanup
free(a); free(b); free(c);
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
return 0;
}
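As above, the fragments omit allocation, input setup, the launch, and the copy-back. A minimal end-to-end sketch; random_ints() is a hypothetical helper introduced here to fill the input arrays:

#include <stdio.h>
#include <stdlib.h>
#define N 512

__global__ void add(int *a, int *b, int *c) {
    c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x];
}

// Hypothetical helper: fill an array with arbitrary values
void random_ints(int *x, int n) {
    for (int i = 0; i < n; i++)
        x[i] = rand() % 100;
}

int main(void) {
    int *a, *b, *c;                 // host copies of a, b, c
    int *d_a, *d_b, *d_c;           // device copies of a, b, c
    int size = N * sizeof(int);

    // Alloc space for device copies of a, b, c
    cudaMalloc((void **)&d_a, size);
    cudaMalloc((void **)&d_b, size);
    cudaMalloc((void **)&d_c, size);

    // Alloc space for host copies of a, b, c and set up input values
    a = (int *)malloc(size); random_ints(a, N);
    b = (int *)malloc(size); random_ints(b, N);
    c = (int *)malloc(size);

    // Copy inputs to device
    cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);

    // Launch add() kernel on GPU with N blocks
    add<<<N,1>>>(d_a, d_b, d_c);

    // Copy result back to host
    cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);

    // Cleanup
    free(a); free(b); free(c);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    return 0;
}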
© NVIDIA 2013
Review (1 of 2)
• Difference between host and device
– Host: CPU
– Device: GPU
© NVIDIA 2013
Review (2 of 2)
• Basic device memory management
– cudaMalloc()
– cudaMemcpy()
– cudaFree()
© NVIDIA 2013
INTRODUCING THREADS
CONCEPTS: Heterogeneous Computing, Blocks, Threads, Indexing, Shared memory, __syncthreads(), Asynchronous operation, Handling errors, Managing devices
© NVIDIA 2013
CUDA Threads
• Terminology: a block can be split into parallel threads
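For example, the earlier add() kernel can use the thread index instead of the block index; launched as add<<<1,N>>>(...), all N additions run as threads of a single block. A minimal sketch:

__global__ void add(int *a, int *b, int *c) {
    // Each thread handles one element, selected by its thread index
    c[threadIdx.x] = a[threadIdx.x] + b[threadIdx.x];
}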
© NVIDIA 2013
Vector Addition Using Threads: main()
// Copy inputs to device
cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
// Cleanup
free(a); free(b); free(c);
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
return 0;
}
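The only change from the block-parallel main() (not visible in the fragment above) is the launch configuration; a sketch:

// Launch add() kernel on GPU with one block of N threads
add<<<1,N>>>(d_a, d_b, d_c);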
© NVIDIA 2013
COMBINING THREADS AND BLOCKS
CONCEPTS: Heterogeneous Computing, Blocks, Threads, Indexing, Shared memory, __syncthreads(), Asynchronous operation, Handling errors, Managing devices
© NVIDIA 2013
Combining Blocks and Threads
• We’ve seen parallel vector addition using:
– Many blocks with one thread each
– One block with many threads
© NVIDIA 2013
Indexing Arrays with Blocks and Threads
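The indexing scheme combines the block and thread indices into a unique global index, index = threadIdx.x + blockIdx.x * blockDim.x. A sketch of add() using this indexing:

__global__ void add(int *a, int *b, int *c) {
    // Combine block and thread indices into one global array index
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    c[index] = a[index] + b[index];
}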
© NVIDIA 2013
Indexing Arrays: Example
• Which thread will operate on the red element?
[Figure: an array of 32 elements laid out as four blocks of M = 8 threads each; global indices 0–31 on top, per-block thread indices 0–7 below]
• With M = 8, the red element is handled by the thread with threadIdx.x = 5 in the block with blockIdx.x = 2:
  int index = threadIdx.x + blockIdx.x * M
            = 5 + 2 * 8
            = 21
© NVIDIA 2013
Addition with Blocks and Threads: main()
// Copy inputs to device
cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
// Cleanup
free(a); free(b); free(c);
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
return 0;
}
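With blocks and threads combined, the launch (omitted in the fragment above) divides the N elements among blocks; a sketch, assuming a THREADS_PER_BLOCK constant that divides N evenly:

// Launch add() kernel on GPU
add<<<N/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a, d_b, d_c);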
© NVIDIA 2013
Handling Arbitrary Vector Sizes
• Typical problems are not friendly multiples of blockDim.x
• Avoid accessing beyond the end of the arrays (a sketch follows)
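A common pattern, sketched below (assuming M threads per block): pass the element count n into the kernel, guard the access with a bounds check, and round the block count up.

__global__ void add(int *a, int *b, int *c, int n) {
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    if (index < n)                    // avoid accessing beyond the end of the arrays
        c[index] = a[index] + b[index];
}

// Round the block count up so every element is covered
add<<<(N + M - 1) / M, M>>>(d_a, d_b, d_c, N);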
© NVIDIA 2013
Why Bother with Threads?
• Threads seem unnecessary
– They add a level of complexity
– What do we gain?
– Unlike parallel blocks, threads can communicate and synchronize, as the next section shows
© NVIDIA 2013
COOPERATING THREADS
CONCEPTS: Heterogeneous Computing, Blocks, Threads, Indexing, Shared memory, __syncthreads(), Asynchronous operation, Handling errors, Managing devices
© NVIDIA 2013
1D Stencil
• Consider applying a 1D stencil to a 1D array of elements
– Each output element is the sum of input elements within a radius
– For example, with a radius of 3, each output element is the sum of 7 input elements
© NVIDIA 2013
Implementing Within a Block
• Each thread processes one output element
– blockDim.x elements per block
© NVIDIA 2013
Sharing Data Between Threads
• Terminology: within a block, threads share data via
shared memory
© NVIDIA 2013
Implementing With Shared Memory
• Cache data in shared memory
– Read (blockDim.x + 2 * radius) input elements from global
memory to shared memory
– Compute blockDim.x output elements
– Write blockDim.x output elements to global memory
© NVIDIA 2013
Stencil Kernel
// Apply the stencil
int result = 0;
for (int offset = -RADIUS ; offset <= RADIUS ; offset++)
result += temp[lindex + offset];
© NVIDIA 2013
Data Race!
• The stencil example will not work…
• Suppose thread 15 reads the halo before thread 0 has fetched it…
int result = 0;
result += temp[lindex + 1];   // thread 15: load from temp[19]
© NVIDIA 2013
__syncthreads()
• void __syncthreads();
– Synchronizes all threads within a block
– All threads must reach the barrier before any thread can proceed
© NVIDIA 2013
Stencil Kernel
__global__ void stencil_1d(int *in, int *out) {
__shared__ int temp[BLOCK_SIZE + 2 * RADIUS];
int gindex = threadIdx.x + blockIdx.x * blockDim.x;
int lindex = threadIdx.x + RADIUS;
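    // (continuation of the kernel, matching the full listing earlier)
    // Read input elements into shared memory, including the halo of
    // RADIUS elements on each side of the block
    temp[lindex] = in[gindex];
    if (threadIdx.x < RADIUS) {
        temp[lindex - RADIUS] = in[gindex - RADIUS];
        temp[lindex + BLOCK_SIZE] = in[gindex + BLOCK_SIZE];
    }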
© NVIDIA 2013
Stencil Kernel
// Synchronize (ensure all the data is available)
__syncthreads();

// Apply the stencil
int result = 0;
for (int offset = -RADIUS ; offset <= RADIUS ; offset++)
    result += temp[lindex + offset];
© NVIDIA 2013
Review (1 of 2)
• Launching parallel threads
– Launch N blocks with M threads per block with
kernel<<<N,M>>>(…);
– Use blockIdx.x to access block index within grid
– Use threadIdx.x to access thread index within block
© NVIDIA 2013
Review (2 of 2)
• Use __shared__ to declare a variable/array in
shared memory
– Data is shared between threads in a block
– Not visible to threads in other blocks
© NVIDIA 2013
MANAGING THE DEVICE
CONCEPTS: Heterogeneous Computing, Blocks, Threads, Indexing, Shared memory, __syncthreads(), Asynchronous operation, Handling errors, Managing devices
© NVIDIA 2013
Coordinating Host & Device
• Kernel launches are asynchronous
– Control returns to the CPU immediately
– The CPU must synchronize before consuming results, e.g. via a blocking cudaMemcpy() or cudaDeviceSynchronize() (a short sketch follows)
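A minimal sketch of host-side synchronization after an asynchronous launch, reusing the add() kernel from earlier:

add<<<N,1>>>(d_a, d_b, d_c);   // launch returns to the CPU immediately
cudaDeviceSynchronize();       // block the CPU until all preceding CUDA calls have completed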
© NVIDIA 2013
Reporting Errors
• All CUDA API calls return an error code (cudaError_t)
– Error in the API call itself
OR
– Error in an earlier asynchronous operation (e.g. kernel)
printf("%s\n", cudaGetErrorString(cudaGetLastError()));
© NVIDIA 2013
Device Management
• An application can query and select GPUs with the following calls (a short sketch follows the list)
cudaGetDeviceCount(int *count)
cudaSetDevice(int device)
cudaGetDevice(int *device)
cudaGetDeviceProperties(cudaDeviceProp *prop, int device)
• cudaMemcpy() vs cudaMemcpyAsync(),
cudaDeviceSynchronize()
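A minimal sketch using the calls listed above to enumerate the GPUs in a system (assumes <stdio.h>):

int count = 0;
cudaGetDeviceCount(&count);
for (int i = 0; i < count; i++) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, i);
    printf("Device %d: %s (compute capability %d.%d)\n",
           i, prop.name, prop.major, prop.minor);
}
cudaSetDevice(0);   // select the first device for subsequent CUDA calls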
© NVIDIA 2013
Compute Capability
• The compute capability of a device describes its architecture, e.g.
– Number of registers
– Sizes of memories
– Features & capabilities
Compute Capability | Selected Features (see CUDA C Programming Guide for complete list) | Tesla models
1.0 | Fundamental CUDA support | 870
1.3 | Double precision, improved memory accesses, atomics | 10-series
2.0 | Caches, fused multiply-add, 3D grids, surfaces, ECC, P2P, concurrent kernels/copies, function pointers, recursion | 20-series
[Figure: a 3D grid of thread blocks, e.g. Block (1,1,0)]
• Addressable as 1D, 2D or 3D
© NVIDIA 2013
Topics we skipped
• We skipped some details; you can learn more from:
– CUDA Programming Guide
– CUDA Zone – tools, training, webinars and more
developer.nvidia.com/cuda
© NVIDIA 2013