CUDA:工作负载遵循“CPU预处理 -> GPU处理 -> CPU后处理”形式的示例
multithreading.h
#ifndef MULTITHREADING_H
#define MULTITHREADING_H
// Simple portable thread library.
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
// Windows threads.
#include <windows.h>
// Handle type for a thread created by cutStartThread (a Win32 HANDLE).
typedef HANDLE CUTThread;
// Signature a thread entry point must have on Windows.
typedef unsigned(WINAPI *CUT_THREADROUTINE)(void *);
// Counting barrier: a counter protected by a critical section plus an event
// that waiters block on.
struct CUTBarrier {
// Guards increments of `count`.
CRITICAL_SECTION criticalSection;
// Event that cutWaitForBarrier waits on; cutIncrementBarrier sets it once
// `count` reaches `releaseCount`.
HANDLE barrierEvent;
// Number of increments required before the barrier is released.
int releaseCount;
// Number of increments recorded so far.
int count;
};
// Return type and calling convention to write in front of a thread entry
// point definition.
#define CUT_THREADPROC unsigned WINAPI
// Statement that ends a thread entry point.
#define CUT_THREADEND return 0
#else
// POSIX threads.
#include <pthread.h>
// Handle type for a thread created by cutStartThread (a pthread_t).
typedef pthread_t CUTThread;
// Signature a thread entry point must have with POSIX threads.
typedef void *(*CUT_THREADROUTINE)(void *);
// Return type to write in front of a thread entry point definition.
#define CUT_THREADPROC void *
// Statement that ends a thread entry point.
#define CUT_THREADEND return 0
// Counting barrier: a counter protected by a mutex plus a condition variable
// that waiters block on.
struct CUTBarrier {
// Guards `count` and is the mutex paired with `conditionVariable`.
pthread_mutex_t mutex;
// Condition variable that cutWaitForBarrier waits on.
pthread_cond_t conditionVariable;
// Number of increments required before the barrier is released.
int releaseCount;
// Number of increments recorded so far.
int count;
};
#endif
#ifdef __cplusplus
extern "C" {
#endif
// Create a thread that runs the given routine with `data` as its only
// argument, and return the handle of the new thread.
CUTThread cutStartThread(CUT_THREADROUTINE, void *data);
// Block until `thread` has finished.
void cutEndThread(CUTThread thread);
// Block until all `num` threads in the array `threads` have finished.
void cutWaitForThreads(const CUTThread *threads, int num);
// Create a barrier that is released once it has been incremented
// `releaseCount` times. The barrier is returned by value.
CUTBarrier cutCreateBarrier(int releaseCount);
// Record one increment on the barrier. The caller does not block and
// continues executing.
void cutIncrementBarrier(CUTBarrier *barrier);
// Block until the barrier has been released.
void cutWaitForBarrier(CUTBarrier *barrier);
// Destroy a barrier created by cutCreateBarrier.
void cutDestroyBarrier(CUTBarrier *barrier);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // MULTITHREADING_H
multithreading.cpp
#include "multithreading.h"
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
// Start a new Win32 thread that executes `func` with `data` as its argument.
// Returns whatever CreateThread returns; the result is not checked here.
CUTThread cutStartThread(CUT_THREADROUTINE func, void *data) {
  LPTHREAD_START_ROUTINE entryPoint = (LPTHREAD_START_ROUTINE)func;
  HANDLE threadHandle = CreateThread(NULL,        // default security attributes
                                     0,           // default stack size
                                     entryPoint,  // routine to run
                                     data,        // argument for the routine
                                     0,           // no creation flags
                                     NULL);       // thread id is not requested
  return threadHandle;
}
// Block (without timeout) until `thread` has terminated, then close its
// handle.
void cutEndThread(CUTThread thread) {
  HANDLE finishedThread = thread;
  WaitForSingleObject(finishedThread, INFINITE);
  CloseHandle(finishedThread);
}
// Wait for multiple threads
void cutWaitForThreads(const CUTThread *threads, int num) {
WaitForMultipleObjects(num, threads, true, INFINITE);
for (int i = 0; i < num; i++) {
CloseHandle(threads[i]);
}
}
// Create a barrier that is released after `releaseCount` increments.
//
// releaseCount: number of cutIncrementBarrier calls needed to release
//               threads blocked in cutWaitForBarrier.
// Returns the initialized barrier by value, with its counter at zero.
CUTBarrier cutCreateBarrier(int releaseCount) {
  CUTBarrier barrier;
  InitializeCriticalSection(&barrier.criticalSection);
  // Manual-reset event, initially non-signaled. The event is deliberately
  // unnamed: a named event is shared by everything that opens the same name,
  // so two barriers (or two processes running this code) would otherwise
  // release each other's waiters.
  barrier.barrierEvent = CreateEvent(NULL, TRUE, FALSE, NULL);
  barrier.count = 0;
  barrier.releaseCount = releaseCount;
  return barrier;
}
// Record one arrival at the barrier; the caller keeps running.
// The counter is bumped inside the critical section. Once the number of
// arrivals is no longer below `releaseCount`, the barrier event is set.
void cutIncrementBarrier(CUTBarrier *barrier) {
  EnterCriticalSection(&barrier->criticalSection);
  barrier->count += 1;
  const int arrivals = barrier->count;
  LeaveCriticalSection(&barrier->criticalSection);

  const bool stillWaiting = arrivals < barrier->releaseCount;
  if (!stillWaiting) {
    SetEvent(barrier->barrierEvent);
  }
}
// Block (without timeout) until the barrier's event has been set.
void cutWaitForBarrier(CUTBarrier *barrier) {
  HANDLE releaseEvent = barrier->barrierEvent;
  WaitForSingleObject(releaseEvent, INFINITE);
}
// Release the resources owned by a barrier created by cutCreateBarrier:
// closes the event handle and deletes the critical section.
//
// barrier: barrier to destroy; NULL is ignored. Call this at most once per
//          barrier, and only when no thread is still using it.
void cutDestroyBarrier(CUTBarrier *barrier) {
  if (barrier == NULL) {
    return;
  }
  if (barrier->barrierEvent != NULL) {
    CloseHandle(barrier->barrierEvent);
    // Clear the handle so a stale value is not reused by mistake.
    barrier->barrierEvent = NULL;
  }
  DeleteCriticalSection(&barrier->criticalSection);
}
#else
// Start a new POSIX thread that executes `func` with `data` as its argument
// and return its handle. The thread is created with default attributes; the
// result of pthread_create is not checked.
CUTThread cutStartThread(CUT_THREADROUTINE func, void *data) {
  CUTThread handle;
  pthread_create(&handle, NULL, func, data);
  return handle;
}
// Block until `thread` has terminated.
void cutEndThread(CUTThread thread) {
  // The thread's return value is discarded, hence the NULL.
  pthread_join(thread, NULL);
}
// Join the `num` threads in `threads`, one after another in array order.
void cutWaitForThreads(const CUTThread *threads, int num) {
  int joined = 0;
  while (joined < num) {
    cutEndThread(threads[joined]);
    ++joined;
  }
}
// Build a barrier that is released after `releaseCount` increments and
// return it by value. The counter starts at zero; the mutex and condition
// variable are initialized with default attributes.
CUTBarrier cutCreateBarrier(int releaseCount) {
  CUTBarrier result;

  result.count = 0;
  result.releaseCount = releaseCount;

  pthread_mutex_init(&result.mutex, NULL);
  pthread_cond_init(&result.conditionVariable, NULL);

  return result;
}
// Record one arrival at the barrier; the caller keeps running.
//
// barrier: barrier created by cutCreateBarrier.
//
// Once the number of arrivals reaches `releaseCount`, every thread blocked in
// cutWaitForBarrier is woken. pthread_cond_broadcast is used rather than
// pthread_cond_signal, which is only guaranteed to wake one waiter; this
// matches the Windows implementation, whose manual-reset event releases all
// waiters. The broadcast is issued while the mutex is still held, so this
// function does not touch the condition variable after unlocking.
void cutIncrementBarrier(CUTBarrier *barrier) {
  pthread_mutex_lock(&barrier->mutex);
  ++barrier->count;
  if (barrier->count >= barrier->releaseCount) {
    pthread_cond_broadcast(&barrier->conditionVariable);
  }
  pthread_mutex_unlock(&barrier->mutex);
}
// Block until the barrier's counter has reached `releaseCount`.
// The counter is re-checked under the mutex after every wake-up, so a wake-up
// that arrives before the count is reached simply goes back to waiting.
void cutWaitForBarrier(CUTBarrier *barrier) {
  pthread_mutex_t *lock = &barrier->mutex;

  pthread_mutex_lock(lock);
  for (;;) {
    if (barrier->count >= barrier->releaseCount) {
      break;
    }
    pthread_cond_wait(&barrier->conditionVariable, lock);
  }
  pthread_mutex_unlock(lock);
}
// Tear down a barrier: destroys its mutex first, then its condition variable.
void cutDestroyBarrier(CUTBarrier *barrier) {
  pthread_mutex_t *lock = &barrier->mutex;
  pthread_cond_t *releaseCondition = &barrier->conditionVariable;

  pthread_mutex_destroy(lock);
  pthread_cond_destroy(releaseCondition);
}
#endif
Callback.cu
// System includes
#include <stdio.h>
// helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h>
#include "multithreading.h"
// Number of independent workloads; main starts one launch thread per
// workload.
const int N_workloads = 8;
// Number of int elements in each workload's host and device buffers.
const int N_elements_per_workload = 100000;
// Barrier the main thread waits on; each workload increments it once when its
// post-processing is done.
CUTBarrier thread_barrier;
// Forward declaration of the stream callback, which is defined later in this
// file. NOTE(review): the first parameter is named `event` here but `stream`
// in the definition; it is a cudaStream_t in both places.
void CUDART_CB myStreamCallback(cudaStream_t event, cudaError_t status,
void *data);
// State of one workload as it moves through the launch thread, the GPU stream
// and the post-processing thread.
struct heterogeneous_workload {
// Workload index; also used as the offset of the generated input values.
int id;
// CUDA device this workload runs on (passed to cudaSetDevice).
int cudaDeviceID;
// Host buffer of N_elements_per_workload ints, allocated with cudaHostAlloc.
int *h_data;
// Device buffer of N_elements_per_workload ints, allocated with cudaMalloc.
int *d_data;
// Stream in which the copies, the kernel and the callback are enqueued.
cudaStream_t stream;
// Result of the CPU-side verification, written by postprocess.
bool success;
};
// Adds one to each of the first N ints of `data`, one element per thread.
// The global index is built from the x dimension only; threads whose index is
// not below N do nothing.
__global__ void incKernel(int *data, int N) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= N) {
    return;
  }
  data[idx] += 1;
}
// Thread entry point for the first CPU phase of one workload.
//
// void_arg: pointer to the heterogeneous_workload to run; its `id` and
//           `cudaDeviceID` must already be set by the caller.
//
// Selects the workload's GPU, allocates the stream and the host/device
// buffers, fills the host buffer with `id + i`, then enqueues in the
// workload's stream: host-to-device copy, incKernel, device-to-host copy and
// finally a callback (myStreamCallback). The thread returns without waiting
// for the GPU. The resources allocated here are released by postprocess.
CUT_THREADPROC launch(void *void_arg) {
  heterogeneous_workload *workload = (heterogeneous_workload *)void_arg;
  const size_t bytes = N_elements_per_workload * sizeof(int);

  // Select GPU for this CPU thread
  checkCudaErrors(cudaSetDevice(workload->cudaDeviceID));

  // Allocate Resources
  checkCudaErrors(cudaStreamCreate(&workload->stream));
  checkCudaErrors(cudaMalloc(&workload->d_data, bytes));
  checkCudaErrors(
      cudaHostAlloc(&workload->h_data, bytes, cudaHostAllocPortable));

  // CPU thread generates data
  for (int i = 0; i < N_elements_per_workload; ++i) {
    workload->h_data[i] = workload->id + i;
  }

  // Schedule work for GPU in CUDA stream without blocking the CPU thread
  // Note: Dedicated streams enable concurrent execution of workloads on the GPU
  dim3 block(512);
  dim3 grid((N_elements_per_workload + block.x - 1) / block.x);  // ceil-div

  checkCudaErrors(cudaMemcpyAsync(workload->d_data, workload->h_data, bytes,
                                  cudaMemcpyHostToDevice, workload->stream));
  incKernel<<<grid, block, 0, workload->stream>>>(workload->d_data,
                                                  N_elements_per_workload);
  // A kernel launch returns no status itself; a bad launch configuration only
  // shows up through cudaGetLastError().
  checkCudaErrors(cudaGetLastError());
  checkCudaErrors(cudaMemcpyAsync(workload->h_data, workload->d_data, bytes,
                                  cudaMemcpyDeviceToHost, workload->stream));

  // New in CUDA 5.0: Add a CPU callback which is called once all currently
  // pending operations in the CUDA stream have finished
  checkCudaErrors(
      cudaStreamAddCallback(workload->stream, myStreamCallback, workload, 0));

  // CPU thread end of life, GPU continues to process data...
  CUT_THREADEND;
}
// Thread entry point for the final CPU phase of one workload; started by
// myStreamCallback once the workload's stream operations have completed.
//
// void_arg: pointer to the heterogeneous_workload whose results are ready in
//           `h_data`.
//
// Verifies the results, stores the outcome in `workload->success`, frees the
// buffers and the stream allocated by launch, and increments thread_barrier
// to tell the main thread that this workload is finished.
CUT_THREADPROC postprocess(void *void_arg) {
  heterogeneous_workload *workload = (heterogeneous_workload *)void_arg;
  // ... GPU is done with processing, continue on new CPU thread...

  // Select GPU for this CPU thread
  checkCudaErrors(cudaSetDevice(workload->cudaDeviceID));

  // CPU thread consumes results from GPU.
  // launch wrote `id + i` into element i and incKernel added one, so every
  // element must equal `i + id + 1`. All N_elements_per_workload elements
  // are checked, not just the first N_workloads of them.
  bool allCorrect = true;
  for (int i = 0; i < N_elements_per_workload; ++i) {
    if (workload->h_data[i] != i + workload->id + 1) {
      allCorrect = false;
      break;
    }
  }
  workload->success = allCorrect;

  // Free Resources
  checkCudaErrors(cudaFree(workload->d_data));
  checkCudaErrors(cudaFreeHost(workload->h_data));
  checkCudaErrors(cudaStreamDestroy(workload->stream));

  // Signal the end of the heterogeneous workload to main thread
  cutIncrementBarrier(&thread_barrier);
  CUT_THREADEND;
}
// Stream callback registered by launch via cudaStreamAddCallback.
//
// stream: stream the callback was added to (not used here).
// status: status handed to the callback; passed through checkCudaErrors.
// data:   the user pointer given at registration, i.e. the workload.
//
// After the status check, the CPU-side continuation is handed to a freshly
// started thread running postprocess. The thread handle is not kept.
void CUDART_CB myStreamCallback(cudaStream_t stream, cudaError_t status,
                                void *data) {
  // Verify the GPU status reported for the finished stream operations.
  checkCudaErrors(status);

  // Continue processing on a new CPU worker thread.
  void *workloadArg = data;
  cutStartThread(postprocess, workloadArg);
}
// Program entry point.
//
// Enumerates the CUDA devices, keeps those whose compute capability is at
// least 1.1 (required for stream callbacks), distributes N_workloads
// workloads over them round-robin with one launch thread per workload, waits
// on thread_barrier until every workload has been post-processed, and exits
// with EXIT_SUCCESS only if all workloads verified correctly.
//
// argc, argv: unused.
int main(int argc, char **argv) {
  // assume a maximum of 32 GPUs in a system configuration
  const int kMaxGpus = 32;
  int N_gpus = 0;
  int max_gpus = 0;
  int gpuInfo[kMaxGpus];

  printf("Starting simpleCallback\n");

  checkCudaErrors(cudaGetDeviceCount(&N_gpus));
  printf("Found %d CUDA capable GPUs\n", N_gpus);

  if (N_gpus > kMaxGpus) {
    printf("simpleCallback only supports 32 GPU(s)\n");
    // Only the first kMaxGpus devices are considered, so gpuInfo cannot
    // overflow.
    N_gpus = kMaxGpus;
  }

  for (int devid = 0; devid < N_gpus; devid++) {
    cudaDeviceProp deviceProp;
    checkCudaErrors(cudaSetDevice(devid));
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devid));

    // Pack major.minor as 0xMm. The parentheses matter: `+` binds tighter
    // than `<<`, so `major << 4 + minor` would shift by (4 + minor).
    const int SMversion = (deviceProp.major << 4) + deviceProp.minor;
    const bool capable = SMversion >= 0x11;

    printf("GPU[%d] %s supports SM %d.%d", devid, deviceProp.name,
           deviceProp.major, deviceProp.minor);
    printf(", %s GPU Callback Functions\n", capable ? "capable" : "NOT capable");

    if (capable) {
      gpuInfo[max_gpus++] = devid;
    }
  }

  printf("%d GPUs available to run Callback Functions\n", max_gpus);

  if (max_gpus == 0) {
    // Without this check `i % max_gpus` below would divide by zero.
    fprintf(stderr, "No GPU capable of running callback functions found\n");
    exit(EXIT_FAILURE);
  }

  heterogeneous_workload *workloads = (heterogeneous_workload *)malloc(
      N_workloads * sizeof(heterogeneous_workload));
  if (workloads == NULL) {
    fprintf(stderr, "Failed to allocate memory for %d workloads\n",
            N_workloads);
    exit(EXIT_FAILURE);
  }

  thread_barrier = cutCreateBarrier(N_workloads);

  // Main thread spawns a CPU worker thread for each heterogeneous workload
  printf("Starting %d heterogeneous computing workloads\n", N_workloads);

  CUTThread launchers[N_workloads];
  for (int i = 0; i < N_workloads; ++i) {
    workloads[i].id = i;
    workloads[i].cudaDeviceID = gpuInfo[i % max_gpus];  // round-robin
    launchers[i] = cutStartThread(launch, &workloads[i]);
  }

  // Sleep until all workloads have finished
  cutWaitForBarrier(&thread_barrier);

  // The launch threads return right after enqueuing their work; joining them
  // here releases their thread handles.
  cutWaitForThreads(launchers, N_workloads);

  printf("Total of %d workloads finished:\n", N_workloads);

  bool success = true;
  for (int i = 0; i < N_workloads; ++i) {
    success &= workloads[i].success;
  }

  printf("%s\n", success ? "Success" : "Failure");

  free(workloads);

  exit(success ? EXIT_SUCCESS : EXIT_FAILURE);
}
该博文为原创文章,未经博主同意不得转载。
本文章博客地址:https://cplusplus.blog.csdn.net/article/details/128395568