
Google Colab

Activity Solution
DR. RACHAD ATAT
In-Class Activity
Modify the vecAdd program so that each thread adds two adjacent elements from the input
vectors A and B and stores the result in the corresponding two adjacent elements of the output
vector C.
What you need to do: update the kernel. Modify vecAddKernel so that each thread
processes two adjacent elements instead of just one.
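
To make the required mapping concrete before reading the code: if i is a thread's
global index, the pair of elements that thread owns is 2*i and 2*i + 1, so thread 0
covers elements 0 and 1, thread 3 covers elements 6 and 7, and so on. A minimal
host-only sketch of that arithmetic (plain C, for illustration only; not part of
the activity code):

#include <stdio.h>

int main(void) {
    // Thread i owns the adjacent pair (2*i, 2*i + 1).
    for (int i = 0; i < 4; i++)
        printf("thread %d -> elements %d and %d\n", i, 2 * i, 2 * i + 1);
    return 0;
}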

#include <stdio.h>
#include <cuda.h>

// Kernel function to perform vector addition
__global__
void vecAddKernel(float* A, float* B, float* C, int n) {
    int i = threadIdx.x + blockDim.x * blockIdx.x;

    // Compute the index for the first element
    // Compute the index for the second element
    // Ensure both indices are within bounds before performing the operation
}

// Host function to set up and call the vecAddKernel
void vecAdd(float* A, float* B, float* C, int n) {
    float *A_d, *B_d, *C_d;
    int size = n * sizeof(float);

    // Allocate memory on the device (GPU)
    cudaMalloc((void **) &A_d, size);
    cudaMalloc((void **) &B_d, size);
    cudaMalloc((void **) &C_d, size);

    // Copy vectors A and B from host (CPU) to device (GPU)
    cudaMemcpy(A_d, A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(B_d, B, size, cudaMemcpyHostToDevice);

    // Launch the kernel.
    // Since each thread handles two elements, a 256-thread block covers 512
    // elements, so we launch (n + 511)/512 blocks of 256 threads each.
    vecAddKernel<<<(n + 511)/512, 256>>>(A_d, B_d, C_d, n);

    // Copy the result vector C from device (GPU) to host (CPU)
    cudaMemcpy(C, C_d, size, cudaMemcpyDeviceToHost);

    // Free the device memory
    cudaFree(A_d);
    cudaFree(B_d);
    cudaFree(C_d);
}

int main() {
    int n = 1000; // Size of the vectors
    float A[n], B[n], C[n]; // Declare host vectors

    // Initialize vectors A and B with some values
    for (int i = 0; i < n; i++) {
        A[i] = i * 1.0f;
        B[i] = i * 2.0f;
    }

    // Call the vecAdd function to perform the vector addition on the GPU
    vecAdd(A, B, C, n);

    // Display a few results from the output
    for (int i = 0; i < 10; i++) {
        printf("C[%d] = %f\n", i, C[i]);
    }

    return 0;
}
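
The skeleton above does not check the return codes of the CUDA runtime calls.
This was not part of the activity, but a common defensive pattern is a small
checking macro; a minimal sketch (the name CUDA_CHECK is ours, not from the slides):

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

// Wrap a CUDA runtime call and abort with a readable message on failure.
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error: %s (%s:%d)\n",               \
                    cudaGetErrorString(err_), __FILE__, __LINE__);    \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

Usage would look like CUDA_CHECK(cudaMalloc((void **) &A_d, size)); for each
cudaMalloc and cudaMemcpy call in vecAdd.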
SOLUTION

__global__
void vecAddKernel(float* A, float* B, float* C, int n) {
    int i = threadIdx.x + blockDim.x * blockIdx.x;

    // Each thread now processes two adjacent elements
    int idx1 = 2 * i;     // index of the first element
    int idx2 = 2 * i + 1; // index of the second element

    // Ensure both indices are within bounds before performing the operation
    if (idx1 < n) {
        C[idx1] = A[idx1] + B[idx1];
    }
    if (idx2 < n) {
        C[idx2] = A[idx2] + B[idx2];
    }
}

// The host function vecAdd, main, and the launch configuration
// <<<(n + 511)/512, 256>>> are unchanged from the skeleton above.
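
A side note on the access pattern, which the slides do not raise: with
idx1 = 2 * i, consecutive threads in a warp read addresses two floats apart, so
the loads and stores are not fully coalesced. An equivalent kernel in which each
thread still handles two elements but consecutive threads touch consecutive
addresses might look like the sketch below (vecAddKernel2 is a hypothetical name;
it assumes the same <<<(n + 511)/512, 256>>> launch configuration as above):

__global__
void vecAddKernel2(float* A, float* B, float* C, int n) {
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    int stride = gridDim.x * blockDim.x; // total number of launched threads

    // First pass: threads 0..stride-1 cover elements 0..stride-1 contiguously.
    if (i < n)
        C[i] = A[i] + B[i];

    // Second pass: the same threads cover elements stride..2*stride-1.
    if (i + stride < n)
        C[i + stride] = A[i + stride] + B[i + stride];
}

Both versions compute the same result; they differ only in how the two elements
per thread are chosen.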
