Google Colab Solution Activity
Activity Solution
DR. RACHAD ATAT
In-Class Activity
Modify the vecAdd program so that each thread adds two adjacent elements from the input
vectors A and B and stores the result in the corresponding two adjacent elements of the output
vector C.
What you need to do: Update the Kernel: Modify the vecAddKernel so that each thread
processes two adjacent elements instead of just one.
2
#include <stdio.h>
#include <cuda.h>

// Kernel function to perform vector addition.
// ACTIVITY TEMPLATE: modify this kernel so that each thread processes
// TWO adjacent elements of A and B instead of one.
__global__
void vecAddKernel(float* A, float* B, float* C, int n) {
    int i = threadIdx.x + blockDim.x * blockIdx.x;

    // Compute the index for the first element
    // Compute the index for the second element
    // Ensure both indices are within bounds before performing the operation
}

// Host function to set up and call vecAddKernel.
// A, B: host input vectors; C: host output vector; n: element count.
void vecAdd(float* A, float* B, float* C, int n) {
    float *A_d, *B_d, *C_d;
    int size = n * sizeof(float); // bytes per vector

    // Allocate memory on the device (GPU)
    cudaMalloc((void **) &A_d, size);
    cudaMalloc((void **) &B_d, size);
    cudaMalloc((void **) &C_d, size);

    // Copy vectors A and B from host (CPU) to device (GPU)
    cudaMemcpy(A_d, A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(B_d, B, size, cudaMemcpyHostToDevice);

    // NOTE(review): the slide's kernel launch goes here; since each thread
    // handles two elements, ceil(n/2) threads are needed -- with 256 threads
    // per block that is (n + 511)/512 blocks.
    vecAddKernel<<<(n + 511) / 512, 256>>>(A_d, B_d, C_d, n);

    // Copy the result vector C from device (GPU) to host (CPU)
    cudaMemcpy(C, C_d, size, cudaMemcpyDeviceToHost);

    // Free device memory (not shown on the slide -- without this the
    // allocations leak on every call)
    cudaFree(A_d);
    cudaFree(B_d);
    cudaFree(C_d);
}
// Driver: builds two input vectors, runs the GPU addition, spot-checks output.
int main() {
    int n = 1000; // Size of the vectors
    float A[n], B[n], C[n]; // Host vectors (VLAs, ~12 KB of stack total)

    // Initialize the inputs -- the original slide passed uninitialized
    // arrays to vecAdd, so the result was garbage.
    for (int i = 0; i < n; i++) {
        A[i] = (float)i;
        B[i] = 2.0f * (float)i;
    }

    // Call the vecAdd function to perform the vector addition on the GPU
    vecAdd(A, B, C, n);

    // Spot-check: C[i] should equal 3*i
    printf("C[0] = %f, C[%d] = %f\n", C[0], n - 1, C[n - 1]);
    return 0;
}
4
#include <stdio.h>
#include <cuda.h>

// SOLUTION kernel: each thread processes TWO adjacent elements of the
// input vectors A and B and writes the two corresponding elements of C.
// Expects a 1D grid of 1D blocks with at least ceil(n/2) total threads.
__global__
void vecAddKernel(float* A, float* B, float* C, int n) {
    int i = threadIdx.x + blockDim.x * blockIdx.x;

    // Each thread now processes two adjacent elements
    int idx1 = 2 * i;     // index of the first element
    int idx2 = 2 * i + 1; // index of the second (adjacent) element

    // Ensure both indices are within bounds before performing the operation
    // (when n is odd, the last thread's idx2 falls outside the vector)
    if (idx1 < n) {
        C[idx1] = A[idx1] + B[idx1];
    }
    if (idx2 < n) {
        C[idx2] = A[idx2] + B[idx2];
    }
}

// Host function to set up and call vecAddKernel.
// A, B: host input vectors; C: host output vector; n: element count.
void vecAdd(float* A, float* B, float* C, int n) {
    float *A_d, *B_d, *C_d;
    int size = n * sizeof(float); // bytes per vector

    // Allocate memory on the device (GPU)
    cudaMalloc((void **) &A_d, size);
    cudaMalloc((void **) &B_d, size);
    cudaMalloc((void **) &C_d, size);

    // Copy vectors A and B from host (CPU) to device (GPU)
    cudaMemcpy(A_d, A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(B_d, B, size, cudaMemcpyHostToDevice);

    // Launch the kernel.
    // Since each thread handles two elements, we need ceil(n/2) threads;
    // with 256 threads per block that is (n + 511)/512 blocks.
    vecAddKernel<<<(n + 511) / 512, 256>>>(A_d, B_d, C_d, n);

    // Copy the result vector C from device (GPU) to host (CPU)
    // (cudaMemcpy synchronizes, so the kernel has finished before we read C)
    cudaMemcpy(C, C_d, size, cudaMemcpyDeviceToHost);

    // Free device memory (missing from the slide -- without this the
    // allocations leak on every call)
    cudaFree(A_d);
    cudaFree(B_d);
    cudaFree(C_d);
}
SOLUTION
5