
Matrix Multiplication on GPU: Optimizations and Insights

In this notebook, we explore GPU implementations of matrix multiplication. We analyze various optimization techniques including:

- Naive implementation
- Shared memory tiling
- Register tiling (+ shared memory)

Each method aims to reduce global memory access and increase computation
throughput.
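
As a rough back-of-envelope estimate (ignoring caches): the naive kernel reads one full row of A and one full column of B from global memory for every output element, whereas a T x T tile staged in shared memory is reused by T threads along each dimension, so

$$ \text{global loads}_{\text{naive}} \approx 2N^3 \qquad\text{vs.}\qquad \text{global loads}_{\text{tiled}} \approx \frac{2N^3}{T}. $$

With T = 16, the tiled kernels should issue roughly 16x fewer global loads.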

Preparing the required packages and initialisations

!pip install nvcc4jupyter

Collecting nvcc4jupyter
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1

%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...


Source files will be saved in "/tmp/tmprk8r5l8j".

from nvcc4jupyter import set_defaults

set_defaults(compiler_args="-arch=sm_75")
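
The -arch=sm_75 flag assumes a Turing-class GPU (compute capability 7.5, e.g. the T4 that Colab commonly assigns). If you are unsure which GPU the runtime provides, a minimal device query such as the sketch below (not part of the original lab) prints the compute capability so the flag can be adjusted:

%%cuda
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    // Query device 0 and report its compute capability (major.minor),
    // which should match the -arch=sm_XY flag passed to nvcc.
    cudaDeviceProp prop;
    cudaError_t err = cudaGetDeviceProperties(&prop, 0);
    if (err != cudaSuccess) {
        std::printf("CUDA Error: %s\n", cudaGetErrorString(err));
        return 1;
    }
    std::printf("%s: compute capability %d.%d\n", prop.name, prop.major, prop.minor);
    return 0;
}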

Setting the matrix size

MATRIX_SIZE = 8192
with open("matrix.size", "w") as size_file:
    size_file.write(str(MATRIX_SIZE))

Naive Implementation

Each thread computes a single element of C, reading one full row of A and one full column of B directly from global memory.

%%cuda
#include <iostream>
#include <cstdlib>
#include <cuda_runtime.h>
#include <fstream>

#define CUDA_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }

inline void gpuAssert(cudaError_t code, const char *file, int line) {
    if (code != cudaSuccess) {
        std::cerr << "CUDA Error: " << cudaGetErrorString(code)
                  << " " << file << ":" << line << std::endl;
        exit(code);
    }
}

__global__ void matMulKernel(float *A, float *B, float *C, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < N && col < N) {
        float sum = 0.0f;
        for (int k = 0; k < N; ++k) {
            sum += A[row * N + k] * B[k * N + col];
        }
        C[row * N + col] = sum;
    }
}

void matMul(float *A, float *B, float *C, int N, float &ms) {
float *d_A, *d_B, *d_C;
size_t size = N * N * sizeof(float);

CUDA_CHECK(cudaMalloc(&d_A, size));
CUDA_CHECK(cudaMalloc(&d_B, size));
CUDA_CHECK(cudaMalloc(&d_C, size));

    CUDA_CHECK(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice));

    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks((N + 15) / 16, (N + 15) / 16);

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaEventRecord(start));

matMulKernel<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N);

CUDA_CHECK(cudaEventRecord(stop));
CUDA_CHECK(cudaEventSynchronize(stop));
CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));

CUDA_CHECK(cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost));

CUDA_CHECK(cudaFree(d_A));
CUDA_CHECK(cudaFree(d_B));
CUDA_CHECK(cudaFree(d_C));
CUDA_CHECK(cudaEventDestroy(start));
CUDA_CHECK(cudaEventDestroy(stop));
}

int main(int argc, char **argv) {

std::ifstream inSize("matrix.size");
if (inSize.is_open() == false) {
std::cerr << "Unable to open input file!" << std::endl;
return 1;
}

int N;
inSize >> N;

    const int size = N * N;

    float *A = new float[size];
    float *B = new float[size];
    float *C = new float[size];

    for (int i = 0; i < size; ++i) {
        A[i] = static_cast<float>(rand()) / RAND_MAX;
        B[i] = static_cast<float>(rand()) / RAND_MAX;
    }

float ms = 0;
matMul(A, B, C, N, ms);

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cerr << "CUDA Error: " << cudaGetErrorString(err) << std::endl;
    }

    std::cout << "Naive: N = " << N << ", kernel time = " << ms << " ms" << std::endl;

    std::ofstream out("naive_time.time"); // No append flag = overwrite
    if (out.is_open()) {
        out << "Naive: N = " << N << ", kernel time = " << ms << " ms" << std::endl;
        out.close();
    } else {
        std::cerr << "Unable to open output file!" << std::endl;
    }

    std::cout << "Sample outputs: C[0][0] = " << C[0] << ", C[N/2][N/2] = " << C[(N/2) * N + (N/2)]
              << ", C[N-1][N-1] = " << C[(N-1) * N + (N-1)] << std::endl;

delete[] A;
delete[] B;
delete[] C;
return 0;
}

Naive: N = 8192, kernel time = 2590.08 ms


Sample outputs: C[0][0] = 2052.07, C[N/2][N/2] = 2059.19, C[N-1][N-1] = 2070.76
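
The sample outputs only spot-check three elements. For a fuller sanity check of all three kernels, a CPU reference comparison can be run at a reduced size (e.g. N = 512, by changing matrix.size), since an O(N^3) CPU loop at N = 8192 would be impractically slow. The helper below is a sketch of such a check (hypothetical, not part of the original lab) and would be called on the host after matMul():

#include <cmath>
#include <iostream>

// Compare the GPU result C against a CPU reference, element by element,
// using a relative tolerance to allow for float accumulation error.
bool verify(const float *A, const float *B, const float *C, int N, float tol = 1e-2f) {
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            float ref = 0.0f;
            for (int k = 0; k < N; ++k)
                ref += A[i * N + k] * B[k * N + j];
            if (std::fabs(ref - C[i * N + j]) > tol * std::fabs(ref)) {
                std::cerr << "Mismatch at (" << i << "," << j << "): got "
                          << C[i * N + j] << ", expected " << ref << std::endl;
                return false;
            }
        }
    }
    return true;
}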

Shared Memory Tiling

This kernel uses shared memory to reduce redundant loads from global
memory. Each block loads a tile of A and a tile of B into shared memory,
accumulates the partial products for that tile, and synchronizes before
moving on to the next tile.
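
A quick check on the footprint (assuming the usual 48 KB per-block shared-memory limit): the two tiles occupy

$$ 2 \times 16 \times 16 \times 4\,\text{B} = 2\,\text{KiB per block}, $$

so shared-memory capacity does not constrain occupancy for this kernel.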

%%cuda
#include <iostream>
#include <cstdlib>
#include <cuda_runtime.h>
#include <fstream>

#define TILE_SIZE 16

#define CUDA_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }

inline void gpuAssert(cudaError_t code, const char *file, int line) {
    if (code != cudaSuccess) {
        std::cerr << "CUDA Error: " << cudaGetErrorString(code)
                  << " " << file << ":" << line << std::endl;
        exit(code);
    }
}

__global__ void matMulKernel(float *A, float *B, float *C, int N) {
    __shared__ float tileA[TILE_SIZE][TILE_SIZE];
    __shared__ float tileB[TILE_SIZE][TILE_SIZE];

    int row = blockIdx.y * TILE_SIZE + threadIdx.y;
    int col = blockIdx.x * TILE_SIZE + threadIdx.x;

    float val = 0.0;

    for (int t = 0; t < (N + TILE_SIZE - 1) / TILE_SIZE; ++t) {
        if (row < N && t * TILE_SIZE + threadIdx.x < N)
            tileA[threadIdx.y][threadIdx.x] = A[row * N + t * TILE_SIZE + threadIdx.x];
        else
            tileA[threadIdx.y][threadIdx.x] = 0;

        if (t * TILE_SIZE + threadIdx.y < N && col < N)
            tileB[threadIdx.y][threadIdx.x] = B[(t * TILE_SIZE + threadIdx.y) * N + col];
        else
            tileB[threadIdx.y][threadIdx.x] = 0;

        __syncthreads();

        for (int i = 0; i < TILE_SIZE; ++i)
            val += tileA[threadIdx.y][i] * tileB[i][threadIdx.x];

        __syncthreads();
    }

    if (row < N && col < N)
        C[row * N + col] = val;
}

void matMul(float *A, float *B, float *C, int N, float &ms) {
float *d_A, *d_B, *d_C;
size_t size = N * N * sizeof(float);

CUDA_CHECK(cudaMalloc(&d_A, size));
CUDA_CHECK(cudaMalloc(&d_B, size));
CUDA_CHECK(cudaMalloc(&d_C, size));

    CUDA_CHECK(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice));

    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks((N + 15) / 16, (N + 15) / 16);

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaEventRecord(start));
matMulKernel<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N);

CUDA_CHECK(cudaEventRecord(stop));
CUDA_CHECK(cudaEventSynchronize(stop));
CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));

CUDA_CHECK(cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost));

CUDA_CHECK(cudaFree(d_A));
CUDA_CHECK(cudaFree(d_B));
CUDA_CHECK(cudaFree(d_C));
CUDA_CHECK(cudaEventDestroy(start));
CUDA_CHECK(cudaEventDestroy(stop));
}

int main(int argc, char **argv) {

std::ifstream inSize("matrix.size");
if (inSize.is_open() == false) {
std::cerr << "Unable to open input file!" << std::endl;
return 1;
}

int N;
inSize >> N;

    const int size = N * N;

    float *A = new float[size];
    float *B = new float[size];
    float *C = new float[size];

    for (int i = 0; i < size; ++i) {
        A[i] = static_cast<float>(rand()) / RAND_MAX;
        B[i] = static_cast<float>(rand()) / RAND_MAX;
    }

float ms = 0;
matMul(A, B, C, N, ms);

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cerr << "CUDA Error: " << cudaGetErrorString(err) << std::endl;
    }

    std::cout << "Shared: N = " << N << ", kernel time = " << ms << " ms" << std::endl;

    std::ofstream out("shared_time.time"); // No append flag = overwrite
    if (out.is_open()) {
        out << "Shared: N = " << N << ", kernel time = " << ms << " ms" << std::endl;
        out.close();
    } else {
        std::cerr << "Unable to open output file!" << std::endl;
    }

    std::cout << "Sample outputs: C[0][0] = " << C[0] << ", C[N/2][N/2] = " << C[(N/2) * N + (N/2)]
              << ", C[N-1][N-1] = " << C[(N-1) * N + (N-1)] << std::endl;

delete[] A;
delete[] B;
delete[] C;
return 0;
}

Shared: N = 8192, kernel time = 1690.28 ms


Sample outputs: C[0][0] = 2052.07, C[N/2][N/2] = 2059.19, C[N-1][N-1] = 2070.76

Register Tiling

Instead of computing one output element per thread, each thread computes a
small 2x2 tile of outputs that it accumulates in registers. Register accesses
are the fastest in the memory hierarchy, and each value fetched from shared
memory is reused for two results.
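
Concretely, in the inner product loop below each thread reads four values from shared memory (a0, a1, b0, b1) and performs four multiply-accumulates, i.e.

$$ \frac{8\ \text{FLOPs}}{4\ \text{shared loads}} = 2 \qquad\text{vs.}\qquad \frac{2\ \text{FLOPs}}{2\ \text{shared loads}} = 1 $$

for the plain tiled kernel, roughly doubling the work done per shared-memory access.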

%%cuda
#include <iostream>
#include <cstdlib>
#include <cuda_runtime.h>
#include <fstream>

#define TILE_SIZE 16

#define CUDA_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }

inline void gpuAssert(cudaError_t code, const char *file, int line) {
    if (code != cudaSuccess) {
        std::cerr << "CUDA Error: " << cudaGetErrorString(code)
                  << " " << file << ":" << line << std::endl;
        exit(code);
    }
}

__global__ void matMulKernel(const float* A, const float* B, float* C, int N) {
    __shared__ float tileA[TILE_SIZE][TILE_SIZE];
    __shared__ float tileB[TILE_SIZE][TILE_SIZE];

    // Compute global row/col for 2x2 tile
    int row = blockIdx.y * TILE_SIZE + threadIdx.y * 2;
    int col = blockIdx.x * TILE_SIZE + threadIdx.x * 2;

    float c00 = 0, c01 = 0, c10 = 0, c11 = 0;

    for (int t = 0; t < (N + TILE_SIZE - 1) / TILE_SIZE; ++t) {
        // Load 2 rows of A into shared memory
        for (int i = 0; i < 2; ++i) {
            int r = row + i;
            int c = t * TILE_SIZE + threadIdx.x * 2;
            if (r < N && c < N)     tileA[threadIdx.y * 2 + i][threadIdx.x * 2]     = A[r * N + c];
            else                    tileA[threadIdx.y * 2 + i][threadIdx.x * 2]     = 0.0f;
            if (r < N && c + 1 < N) tileA[threadIdx.y * 2 + i][threadIdx.x * 2 + 1] = A[r * N + c + 1];
            else                    tileA[threadIdx.y * 2 + i][threadIdx.x * 2 + 1] = 0.0f;
        }
        // Load 2 columns of B into shared memory
        for (int i = 0; i < 2; ++i) {
            int r = t * TILE_SIZE + threadIdx.y * 2;
            int c = col + i;
            if (r < N && c < N)     tileB[threadIdx.y * 2][threadIdx.x * 2 + i]     = B[r * N + c];
            else                    tileB[threadIdx.y * 2][threadIdx.x * 2 + i]     = 0.0f;
            if (r + 1 < N && c < N) tileB[threadIdx.y * 2 + 1][threadIdx.x * 2 + i] = B[(r + 1) * N + c];
            else                    tileB[threadIdx.y * 2 + 1][threadIdx.x * 2 + i] = 0.0f;
        }

        __syncthreads();

        // Multiply shared memory tiles
        for (int k = 0; k < TILE_SIZE; ++k) {
            float a0 = tileA[threadIdx.y * 2][k];
            float a1 = tileA[threadIdx.y * 2 + 1][k];
            float b0 = tileB[k][threadIdx.x * 2];
            float b1 = tileB[k][threadIdx.x * 2 + 1];

            c00 += a0 * b0;
            c01 += a0 * b1;
            c10 += a1 * b0;
            c11 += a1 * b1;
        }

        __syncthreads();
    }

    // Write back to global memory
    if (row < N && col < N) {
        C[row * N + col] = c00;
        if (col + 1 < N) C[row * N + col + 1] = c01;
        if (row + 1 < N) C[(row + 1) * N + col] = c10;
        if (row + 1 < N && col + 1 < N) C[(row + 1) * N + col + 1] = c11;
    }
}

void matMul(float *A, float *B, float *C, int N, float &ms) {
float *d_A, *d_B, *d_C;
size_t size = N * N * sizeof(float);

CUDA_CHECK(cudaMalloc(&d_A, size));
CUDA_CHECK(cudaMalloc(&d_B, size));
CUDA_CHECK(cudaMalloc(&d_C, size));

    CUDA_CHECK(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice));

    // Each thread produces a 2x2 output tile, so a (TILE_SIZE/2) x (TILE_SIZE/2)
    // block of threads still covers a TILE_SIZE x TILE_SIZE tile of C.
    dim3 threadsPerBlock(TILE_SIZE / 2, TILE_SIZE / 2);
    dim3 numBlocks((N + TILE_SIZE - 1) / TILE_SIZE, (N + TILE_SIZE - 1) / TILE_SIZE);

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaEventRecord(start));

matMulKernel<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N);

CUDA_CHECK(cudaEventRecord(stop));
CUDA_CHECK(cudaEventSynchronize(stop));
CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));

CUDA_CHECK(cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost));

CUDA_CHECK(cudaFree(d_A));
CUDA_CHECK(cudaFree(d_B));
CUDA_CHECK(cudaFree(d_C));
CUDA_CHECK(cudaEventDestroy(start));
CUDA_CHECK(cudaEventDestroy(stop));
}

int main(int argc, char **argv) {

std::ifstream inSize("matrix.size");
if (inSize.is_open() == false) {
std::cerr << "Unable to open input file!" << std::endl;
return 1;
}

int N;
inSize >> N;

    const int size = N * N;

    float *A = new float[size];
    float *B = new float[size];
    float *C = new float[size];

    for (int i = 0; i < size; ++i) {
        A[i] = static_cast<float>(rand()) / RAND_MAX;
        B[i] = static_cast<float>(rand()) / RAND_MAX;
    }

float ms = 0;
matMul(A, B, C, N, ms);

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cerr << "CUDA Error: " << cudaGetErrorString(err) << std::endl;
    }

    std::cout << "Register: N = " << N << ", kernel time = " << ms << " ms" << std::endl;

    std::ofstream out("register_time.time"); // No append flag = overwrite
    if (out.is_open()) {
        out << "Register: N = " << N << ", kernel time = " << ms << " ms" << std::endl;
        out.close();
    } else {
        std::cerr << "Unable to open output file!" << std::endl;
    }

    std::cout << "Sample outputs: C[0][0] = " << C[0] << ", C[N/2][N/2] = " << C[(N/2) * N + (N/2)]
              << ", C[N-1][N-1] = " << C[(N-1) * N + (N-1)] << std::endl;

delete[] A;
delete[] B;
delete[] C;
return 0;
}

Register: N = 8192, kernel time = 1625.12 ms


Sample outputs: C[0][0] = 2052.07, C[N/2][N/2] = 2059.19, C[N-1][N-1] = 2070.76

Benchmarking

import re
import matplotlib.pyplot as plt

# Files you want to parse
time_files = [
"naive_time.time",
"shared_time.time",
"register_time.time",
]

timings = []

# Extract label and time from each file
for file in time_files:
    with open(file, 'r') as f:
        content = f.read()
    match = re.search(r"^\s*([A-Za-z0-9+ ]+):.*?kernel time\s*=\s*([0-9.]+)", content)
    if match:
        label = match.group(1)
        time = float(match.group(2))
        timings.append((label, time))
    else:
        print(f"Warning: Couldn't parse {file}")

# Unzip
labels, times = zip(*timings)

# Plotting
plt.figure(figsize=(8, 5))
bars = plt.barh(labels, times)
plt.xlabel("Execution Time (ms)")
plt.title("GPU Matrix Multiplication Time Comparison")
plt.gca().invert_yaxis()

# Annotate bars
for bar, t in zip(bars, times):
    plt.text(bar.get_width() + 0.3, bar.get_y() + bar.get_height() / 2,
             f"{t:.2f} ms", va='center')

plt.grid(axis='x', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()

[Figure: horizontal bar chart comparing kernel execution times for the Naive, Shared, and Register implementations]
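
For context, the measured times can be converted to achieved throughput: a full N x N multiplication performs 2N^3 floating-point operations, so

$$ \text{GFLOP/s} = \frac{2N^3}{t \cdot 10^9} \quad\Rightarrow\quad \text{Naive} \approx 425, \quad \text{Shared} \approx 650, \quad \text{Register} \approx 677 $$

for N = 8192 and the kernel times reported above (values are approximate).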
