Data Parallelism, Task Parallelism, CPU, GPU (1)
Analysis, Terminology
Group Members
Maasma Zari (2022-CS-504)
Zameer ul Hassan (2022-CS-540)
Haris Khan (2022-CS-556)
What is a GPU?
• Originally built for rendering graphics.
• Now used in AI, scientific simulations, video editing, etc.
• Can handle thousands of simple parallel tasks at the same time.
Architecture of GPU
// Element-wise vector addition: c[i] = a[i] + b[i].
// One thread per element; launched as <<<ceil(n/256), 256>>> below, so the
// bounds check guards the tail threads when n is not a multiple of blockDim.x.
__global__ void addKernelGPU(int *c, const int *a, const int *b, int n) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}

// Sequential CPU reference implementation, used both for timing comparison
// and to verify the GPU result.
void addCPU(int *c, const int *a, const int *b, int n) {
    for (int i = 0; i < n; ++i) {
        c[i] = a[i] + b[i];
    }
}

int main() {
    const int N = 1 << 20;            // ~1 million elements
    size_t size = N * sizeof(int);    // bytes per array

    int *a = new int[N];
    int *b = new int[N];
    int *c_cpu = new int[N];
    int *c_gpu = new int[N];

    for (int i = 0; i < N; ++i) {
        a[i] = i;
        b[i] = i * 2;
    }

    int *dev_a = nullptr;
    int *dev_b = nullptr;
    int *dev_c = nullptr;
    // Device allocations were missing from the original listing; without them
    // the cudaMemcpy calls and the kernel launch below operate on null
    // pointers and every CUDA call fails.
    cudaMalloc(&dev_a, size);
    cudaMalloc(&dev_b, size);
    cudaMalloc(&dev_c, size);

    cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice);

    // CPU execution
    auto start_cpu = std::chrono::high_resolution_clock::now();
    addCPU(c_cpu, a, b, N);
    auto end_cpu = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> cpu_duration = end_cpu - start_cpu;
    std::cout << "CPU Time: " << cpu_duration.count() << " seconds"
              << std::endl;

    // GPU execution. (N + 255) / 256 is ceil-division so every element gets
    // a thread even though N need not divide evenly by the block size.
    auto start_gpu = std::chrono::high_resolution_clock::now();
    addKernelGPU<<<(N + 255) / 256, 256>>>(dev_c, dev_a, dev_b, N);
    // Kernel launches are asynchronous and report config errors separately:
    // fetch the launch status, then block until the kernel has finished so
    // the timer measures execution rather than just the launch call.
    cudaError_t launch_status = cudaGetLastError();
    cudaDeviceSynchronize();
    cudaMemcpy(c_gpu, dev_c, size, cudaMemcpyDeviceToHost);
    auto end_gpu = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> gpu_duration = end_gpu - start_gpu;
    std::cout << "GPU Time: " << gpu_duration.count() << " seconds"
              << std::endl;
    if (launch_status != cudaSuccess) {
        std::cout << "Kernel launch failed: "
                  << cudaGetErrorString(launch_status) << std::endl;
    }

    // Verify the GPU produced the same result as the CPU reference.
    bool match = true;
    for (int i = 0; i < N; ++i) {
        if (c_cpu[i] != c_gpu[i]) {
            match = false;
            break;
        }
    }
    std::cout << (match ? "Results match" : "Results DO NOT match")
              << std::endl;

    // Cleanup
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    delete[] a;
    delete[] b;
    delete[] c_cpu;
    delete[] c_gpu;

    return 0;
}
Output: