Opencl 1pp PDF
Opencl 1pp PDF
Mike Bailey
[email protected]
OpenCL
or
NVIDIA code
AMD Compiler
and Linker
OpenCL for
AMD/ATI GPU
Systems
y
NVIDIA Compiler
and Linker
OpenCL for
NVIDIA GPU
Systems
y
Intel code
Intel Compiler and
Linker
OpenCL for
Intel
Systems
y
The OpenCL
Programming Environment
OpenCL code
Compiler
and Linker
Compiler
and Linker
CPU binary
on the host
OpenCL binary
on the GPU
If you were writing in C/C++, you would say:
void
ArrayMult( int n, float *a, float *b, float *c)
{
for ( int i = 0; i < n; i++ )
c[i] = a[i] * b[i];
}
kernel
void
ArrayMult( global float *dA, global float *dB, global float *dC)
{
int gid = get_global_id ( 0 );
dC[gid] = dA[gid] * dB[gid];
}
OpenCL code can be vector-oriented, meaning that it can perform a single instruction on
multiple data values at the same time (SIMD).
Vector data types are: charn, intn, floatn, where n = 2, 4, 8, or 16.
float4 f, g;
f = (float4)( 1.f, 2.f, 3.f, 4.f );
float16 a16, x16, y16, z16;
f.x = 0.;
f.xy = g.zw;
x16.s89ab = f;
float16 a16 = x16 * y16 + z16;
(Note: just because the language supports it doesn't mean the hardware does.)
Oregon State University
Computer Graphics
mjb April 24, 2014
Device
CU
CU
CU
CU
CU
CU
PE
PE
PE
PE
PE
PE
PE
PE
PE
PE
PE
PE
PE
PE
PE
Grid
Work-Group Work-Group Work-Group
0
1
2
Work-Group 4
Local
Memory
Work-Item Work-Item Work-Item
Work-Item Work-Item Work-Item
Work-Item Work-Item Work-Item
Work-Item Work-Item Work-Item
Private
Memory
Private
Memory
Private
Memory
Oregon State University
Computer Graphics
Rules
Threads can share memory with the other Threads in the same Work-Group
Threads can synchronize with other Threads in the same Work-Group
Global and Constant memory is accessible by all Threads in all Work-Groups
Global and Constant memory is often cached inside a Work-Group
Each Thread has registers and private memory
Each Work-Group has a maximum number of registers it can use. These are
divided equally among all its Threads
status = clGetPlatformIDs( 0, NULL, &numPlatforms );
(first argument: how many to get; second argument: where to put them)
NULL
);
CL_SUCCESS
CL_DEVICE_NOT_FOUND
CL_DEVICE_NOT_AVAILABLE
CL_COMPILER_NOT_AVAILABLE
CL_MEM_OBJECT_ALLOCATION_FAILURE
CL_OUT_OF_RESOURCES
CL_OUT_OF_HOST_MEMORY
CL_PROFILING_INFO_NOT_AVAILABLE
CL_MEM_COPY_OVERLAP
CL_IMAGE_FORMAT_MISMATCH
CL_IMAGE_FORMAT_NOT_SUPPORTED
CL_BUILD_PROGRAM_FAILURE
CL_MAP_FAILURE
CL_INVALID_VALUE
CL_INVALID_DEVICE_TYPE
CL_INVALID_PLATFORM
CL_INVALID_DEVICE
CL_INVALID_CONTEXT
CL_INVALID_QUEUE_PROPERTIES
CL_INVALID_COMMAND_QUEUE
CL_INVALID_HOST_PTR
CL_INVALID_MEM_OBJECT
CL_INVALID_IMAGE_FORMAT_DESCRIPTOR
CL_INVALID_IMAGE_SIZE
CL_INVALID_SAMPLER
CL_INVALID_BINARY
CL_INVALID_BUILD_OPTIONS
CL_INVALID_PROGRAM
CL_INVALID_PROGRAM_EXECUTABLE
CL_INVALID_KERNEL_NAME
CL_INVALID_KERNEL_DEFINITION
CL_INVALID_KERNEL
CL_INVALID_ARG_INDEX
CL_INVALID_ARG_VALUE
CL_INVALID_ARG_SIZE
CL_INVALID_KERNEL_ARGS
CL_INVALID_WORK_DIMENSION
A Way to Print OpenCL Error Codes (get this from the Class Web Site)
struct errorcode
{
cl_int statusCode;
char * meaning;
}
ErrorCodes[ ] =
{
{ CL_SUCCESS, "" },
{ CL_DEVICE_NOT_FOUND, "Device Not Found" },
{ CL_DEVICE_NOT_AVAILABLE, "Device Not Available" },
...
{ CL_INVALID_MIP_LEVEL, "Invalid MIP Level" },
{ CL_INVALID_GLOBAL_WORK_SIZE, "Invalid Global Work Size" },
};
void
PrintCLError( cl_int errorCode, char * prefix, FILE *fp )
{
if( errorCode == CL_SUCCESS )
return;
const int numErrorCodes = sizeof( ErrorCodes ) / sizeof( struct errorcode );
char * meaning = "";
for( int i = 0; i < numErrorCodes; i++ )
{
if( errorCode == ErrorCodes[i].statusCode )
{
meaning = ErrorCodes[i].meaning;
break;
}
}
fprintf( fp, "%s %s\n", prefix, meaning );
}
Oregon State University
Computer Graphics
// find out how many devices are attached to each platform and get their ids:
status = clGetDeviceIDs( platform, CL_DEVICE_TYPE_ALL, 0,
NULL, &numDevices );
NULL
);
cl_device_id device;
status = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device,
NULL
);
switch( type )
{
case CL_DEVICE_TYPE_CPU:
fprintf( OUTPUT, "CL_DEVICE_TYPE_CPU\n" );
break;
case CL_DEVICE_TYPE_GPU:
fprintf( OUTPUT, "CL_DEVICE_TYPE_GPU\n" );
break;
case CL_DEVICE_TYPE_ACCELERATOR:
fprintf( OUTPUT, "CL_DEVICE_TYPE_ACCELERATOR\n" );
break;
default:
fprintf( OUTPUT, "Other...\n" );
break;
}
clGetDeviceInfo( devices[i], CL_DEVICE_VENDOR_ID, sizeof(ui), &ui, NULL );
fprintf( OUTPUT, "\t\tDevice Vendor ID = 0x%04x\n", ui );
clGetDeviceInfo( devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(ui), &ui, NULL );
fprintf( OUTPUT, "\t\tDevice Maximum Compute Units = %d\n", ui );
clGetDeviceInfo( devices[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(ui), &ui, NULL );
fprintf( OUTPUT, "\t\tDevice Maximum Work Item Dimensions = %d\n", ui );
clGetDeviceInfo( devices[i], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(sizes), sizes, NULL );
fprintf( OUTPUT, "\t\tDevice Maximum Work Item Sizes = %d x %d x %d\n", sizes[0], sizes[1], sizes[2] );
clGetDeviceInfo( devices[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size), &size, NULL );
fprintf( OUTPUT, "\t\tDevice Maximum Work Group Size = %d\n", size );
clGetDeviceInfo( devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(ui), &ui, NULL );
fprintf( OUTPUT, "\t\tDevice Maximum Clock Frequency = %d MHz\n", ui );
}
}
Oregon State University
Computer Graphics
Number of Platforms = 1
Platform #0:
Name = 'NVIDIA CUDA'
Vendor = 'NVIDIA Corporation'
Version = 'OpenCL 1.1 CUDA 4.1.1'
Profile = 'FULL_PROFILE'
Device #0:
Type = 0x0004 = CL_DEVICE_TYPE_GPU
Device Vendor ID = 0x10de
Device Maximum Compute Units = 15
Device Maximum Work Item Dimensions = 3
Device Maximum Work Item Sizes = 1024 x 1024 x 64
Device Maximum Work Group Size = 1024
Device Maximum Clock Frequency = 1401 MHz
Kernel Maximum Work Group Size = 1024
Kernel Compile Work Group Size = 0 x 0 x 0
Kernel Local Memory Size = 0
size_t extensionSize;
clGetDeviceInfo( device, CL_DEVICE_EXTENSIONS,
0,
NULL,
&extensionSize );
char *extensions = new char [extensionSize];
clGetDeviceInfo( devices, CL_DEVICE_EXTENSIONS, extensionSize, extensions,
NULL );
fprintf( stderr, "\nDevice Extensions:\n" );
for( int i = 0; i < (int)strlen(extensions); i++ )
{
if( extensions[ i ] == ' ' )
extensions[ i ] = '\n';
}
fprintf( stderr, "%s\n", extensions );
delete [ ] extensions;
Device Extensions:
cl_khr_byte_addressable_store
cl_khr_icd
cl_khr_gl_sharing
cl_nv_d3d9_sharing
cl_nv_d3d10_sharing
cl_khr_d3d10_sharing
cl_nv_d3d11_sharing
cl_nv_compiler_options
cl_nv_device_attribute_query
cl_nv_pragma_unroll
cl_khr_global_int32_base_atomics
cl_khr_global_int32_extended_atomics
cl_khr_local_int32_base_atomics
cl_khr_local_int32_extended_atomics
cl_khr_fp64
1. Program header
2. Allocate the host memory buffers
3. Create an OpenCL context
4. Create an OpenCL command queue
5. Allocate the device memory buffers
6. Write the data from the host buffers to the device buffers
7. Read the kernel code from a file
8. Compile and link the kernel code
9. Create the kernel object
10. Setup the arguments to the kernel object
11. Enqueue the kernel object for execution
12. Read the results buffer back from the device to the host
13. Clean everything up
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <stdlib.h>
#include <omp.h>   // for timing
#include "cl.h"
// fill the host memory buffers:
for( int i = 0; i < NUM_ELEMENTS; i++ )
{
hA[ i ] = hB[ i ] = sqrt( (float) i );
}
// array size in bytes (will need this later):
size_t dataSize = NUM_ELEMENTS * sizeof( float );
// opencl function return status:
cl_int status;
properties
// create a context:
the
device
Pass in
user data
Callback
returned
status
the
context
properties
returned
status
returned
status
6. Write the Data from the Host Buffers to the Device Buffers
command
queue
want to block
until done?
# bytes
offset
host
buffer
event wait
list
Read
Buffer dC
Execute
Kernel
Write
Buffer dB
Write
Buffer dA
kernel
void
ArrayMult( global const float *dA, global const float *dB, global float *dC )
{
int gid = get_global_id( 0 );
dC[gid] = dA[gid] * dB[gid];
}
GPU
Application
pp
Program
OpenCL Driver
does the
Compile and Link
OpenCL code in
a separate file
kernel void
ArrayMult( global float *A, global float *B, global float *C )
{
int gid = get_global_id( 0 );
C[gid] = A[gid] * B[gid];
}
GPU
Application
pp
Program
GLSL Driver
does the
Compile and Link
void main( )
{
vec3 newcolor = texture2D( uTexUnit, vST ).rgb;
newcolor = mix( newcolor, vColor.rgb, uBlend );
gl_FragColor = vec4( uLightIntensity*newcolor, 1. );
}
Oregon State University
Computer Graphics
mjb April 24, 2014
# events
event object
event wait
list
Lx = 4
GlobalIndexSpaceSize
# WorkGroups
#WorkGroups
WorkGroupSize
5x4
20
4
Gy = 12
Wy = 4
The NDRange Index Space can be 1D, 2D, or 3D. This one is 2D.
Gx = 20
Ly = 3
Wx = 5
$$\#\text{WorkGroups} = \frac{\text{GlobalIndexSpaceSize}}{\text{WorkGroupSize}}$$
$$5 \times 4 = \frac{20 \times 12}{4 \times 3}$$
Lx = 4
uint
get_work_dim( ) ;
size_t
size_t
size_t
size_t
size_t
size_t
size_t
12. Read the Results Buffer Back from the Device to the Host
command
queue
want to block
until done?
# bytes
offset
host
buffer
event wait
list
kernel );
program );
cmdQueue );
);
);
);
delete [ ] hA;
delete [ ] hB;
delete [ ] hC;
GigaMultiplications/Second
GigaMultiplications/Second
Work-Group Size
Array Size (K)
Oregon State University
Computer Graphics
mjb April 24, 2014
size_t binary_sizes;
status = clGetProgramInfo( Program, CL_PROGRAM_BINARY_SIZES, 0, NULL, &binary_sizes );
size_t size;
status = clGetProgramInfo( Program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL );
unsigned char *binary = new unsigned char [ size ];
status = clGetProgramInfo( Program, CL_PROGRAM_BINARIES, size, &binary, NULL );
FILE *fpbin = fopen( "particles.nv", "wb" );
if( fpbin == NULL )
{
fprintf( stderr, "Cannot create 'particles.bin'\n" );
}
else
{
fwrite( binary, 1, size, fpbin );
fclose( fpbin );
}
delete [ ] binary;
char * strings [ 1 ];
strings[0] = clProgramText;
cl_program program = clCreateProgramWithSource( context, 1, (const char **)strings, NULL, &status );
delete [ ] clProgramText;
delete [ ] byteArray;