Lab 7: GPU Matrix Multiplication

This lab compares three CUDA implementations of dense matrix multiplication:
- Naive implementation
- Shared memory tiling
- Register tiling (+ shared memory)
Each successive method reduces global memory traffic and increases arithmetic
throughput; all three kernels are timed with CUDA events at N = 8192.
!pip install nvcc4jupyter
Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
%load_ext nvcc4jupyter
MATRIX_SIZE = 8192
with open("matrix.size", "w") as size_file:
    size_file.write(str(MATRIX_SIZE))
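For scale: each N × N float32 matrix at this size is 256 MiB, so the three device buffers each benchmark allocates total 768 MiB. A quick back-of-the-envelope check:

```python
# Device-memory footprint of the buffers allocated by the benchmarks below.
N = 8192
bytes_per_matrix = N * N * 4  # float32 is 4 bytes
print(f"one matrix: {bytes_per_matrix / 2**20:.0f} MiB")      # 256 MiB
print(f"A + B + C:  {3 * bytes_per_matrix / 2**20:.0f} MiB")  # 768 MiB
```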
Naive implementation

Each thread computes one element of C, streaming a full row of A and a full column of B from global memory for every dot product.
%%cuda
#include <iostream>
#include <cstdlib>
#include <cuda_runtime.h>
#include <fstream>

// Abort with a message on any CUDA API error.
#define CUDA_CHECK(call) do { cudaError_t err_ = (call); if (err_ != cudaSuccess) { \
    std::cerr << "CUDA error: " << cudaGetErrorString(err_) << std::endl; std::exit(1); } } while (0)

// Naive kernel: one thread per output element; every dot product streams a
// full row of A and a full column of B from global memory.
__global__ void matMulKernel(const float *A, const float *B, float *C, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < N && col < N) {
        float sum = 0.0f;
        for (int k = 0; k < N; ++k)
            sum += A[row * N + k] * B[k * N + col];
        C[row * N + col] = sum;
    }
}

void matMul(float *A, float *B, float *C, int N, float &ms) {
    float *d_A, *d_B, *d_C;
    size_t size = N * N * sizeof(float);
    CUDA_CHECK(cudaMalloc(&d_A, size));
    CUDA_CHECK(cudaMalloc(&d_B, size));
    CUDA_CHECK(cudaMalloc(&d_C, size));
    CUDA_CHECK(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice));
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    dim3 block(16, 16);
    dim3 grid((N + block.x - 1) / block.x, (N + block.y - 1) / block.y);
    CUDA_CHECK(cudaEventRecord(start));
    matMulKernel<<<grid, block>>>(d_A, d_B, d_C, N);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));
    CUDA_CHECK(cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
}

int main() {
    std::ifstream inSize("matrix.size");
    if (!inSize.is_open()) {
        std::cerr << "Unable to open input file!" << std::endl;
        return 1;
    }
    int N;
    inSize >> N;
    size_t n = (size_t)N * N;
    float *A = new float[n];
    float *B = new float[n];
    float *C = new float[n];
    // Constant fill is an assumed initialization; the timing does not depend on it.
    for (size_t i = 0; i < n; ++i) { A[i] = 1.0f; B[i] = 1.0f; }
    float ms = 0;
    matMul(A, B, C, N, ms);
    std::cout << "Naive: N = " << N << ", kernel time = " << ms << " ms" << std::endl;
    std::cout << "Sample outputs: C[0][0] = " << C[0] << ", C[N/2][N/2] = "
              << C[(N/2) * N + (N/2)] << ", C[N-1][N-1] = " << C[(N-1)*N + (N-1)] << std::endl;
    delete[] A;
    delete[] B;
    delete[] C;
    return 0;
}
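The printed time can be read as arithmetic throughput. A minimal sketch, assuming the measured value is pasted into the placeholder `ms_naive` (not a variable produced by the cell above):

```python
# Dense N x N matmul performs 2*N^3 floating-point operations
# (one multiply and one add per inner-loop step).
N = 8192
ms_naive = 0.0  # placeholder: paste the kernel time printed above
if ms_naive > 0:
    tflops = 2 * N**3 / (ms_naive / 1e3) / 1e12
    print(f"naive kernel: {tflops:.2f} TFLOP/s")
```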
Shared memory tiling

This kernel uses shared memory to cut redundant loads from global memory: each block stages a TILE_SIZE × TILE_SIZE tile of A and of B in shared memory, accumulates the partial products for that tile, then synchronizes before moving to the next tile.
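A rough load count shows the payoff (T = TILE_SIZE, all matrices N × N): the naive kernel issues 2N global loads per output element, while the tiled kernel loads each tile element once per pass:

$$L_{\text{naive}} = 2N \cdot N^2 = 2N^3, \qquad L_{\text{shared}} = 2T^2 \cdot \frac{N}{T} \cdot \left(\frac{N}{T}\right)^2 = \frac{2N^3}{T}$$

With T = 16 this cuts global memory traffic by a factor of 16.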
%%cuda
#include <iostream>
#include <cstdlib>
#include <cuda_runtime.h>
#include <fstream>
#define TILE_SIZE 16

// Abort with a message on any CUDA API error.
#define CUDA_CHECK(call) do { cudaError_t err_ = (call); if (err_ != cudaSuccess) { \
    std::cerr << "CUDA error: " << cudaGetErrorString(err_) << std::endl; std::exit(1); } } while (0)

// Tiled kernel: each block stages one TILE_SIZE x TILE_SIZE tile of A and of B
// in shared memory, so each global element is loaded once per tile pass.
__global__ void matMulKernel(const float *A, const float *B, float *C, int N) {
    __shared__ float As[TILE_SIZE][TILE_SIZE];
    __shared__ float Bs[TILE_SIZE][TILE_SIZE];
    int row = blockIdx.y * TILE_SIZE + threadIdx.y;
    int col = blockIdx.x * TILE_SIZE + threadIdx.x;
    float sum = 0.0f;
    for (int t = 0; t < (N + TILE_SIZE - 1) / TILE_SIZE; ++t) {
        int aCol = t * TILE_SIZE + threadIdx.x;
        int bRow = t * TILE_SIZE + threadIdx.y;
        As[threadIdx.y][threadIdx.x] = (row < N && aCol < N) ? A[row * N + aCol] : 0.0f;
        Bs[threadIdx.y][threadIdx.x] = (bRow < N && col < N) ? B[bRow * N + col] : 0.0f;
        __syncthreads();  // wait until the tile is fully loaded
        for (int k = 0; k < TILE_SIZE; ++k)
            sum += As[threadIdx.y][k] * Bs[k][threadIdx.x];
        __syncthreads();  // wait before the tile is overwritten
    }
    if (row < N && col < N) C[row * N + col] = sum;
}

void matMul(float *A, float *B, float *C, int N, float &ms) {
    float *d_A, *d_B, *d_C;
    size_t size = N * N * sizeof(float);
    CUDA_CHECK(cudaMalloc(&d_A, size));
    CUDA_CHECK(cudaMalloc(&d_B, size));
    CUDA_CHECK(cudaMalloc(&d_C, size));
    CUDA_CHECK(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice));
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    dim3 block(TILE_SIZE, TILE_SIZE);
    dim3 grid((N + TILE_SIZE - 1) / TILE_SIZE, (N + TILE_SIZE - 1) / TILE_SIZE);
    CUDA_CHECK(cudaEventRecord(start));
    matMulKernel<<<grid, block>>>(d_A, d_B, d_C, N);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));
    CUDA_CHECK(cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
}

int main() {
    std::ifstream inSize("matrix.size");
    if (!inSize.is_open()) {
        std::cerr << "Unable to open input file!" << std::endl;
        return 1;
    }
    int N;
    inSize >> N;
    size_t n = (size_t)N * N;
    float *A = new float[n];
    float *B = new float[n];
    float *C = new float[n];
    for (size_t i = 0; i < n; ++i) { A[i] = 1.0f; B[i] = 1.0f; }  // assumed constant init
    float ms = 0;
    matMul(A, B, C, N, ms);
    std::cout << "Shared: N = " << N << ", kernel time = " << ms << " ms" << std::endl;
    std::cout << "Sample outputs: C[0][0] = " << C[0] << ", C[N/2][N/2] = "
              << C[(N/2) * N + (N/2)] << ", C[N-1][N-1] = " << C[(N-1)*N + (N-1)] << std::endl;
    delete[] A;
    delete[] B;
    delete[] C;
    return 0;
}
Register tiling
Instead of computing one output element per thread, each thread computes a small tile of outputs (2 × 2 here) held in registers, on top of the shared memory tiling. Registers are the fastest storage on the GPU, so each shared memory load is reused across several multiply-adds.
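Per inner-loop step each thread performs a rank-1 update of its 2 × 2 register tile, four multiply-adds from only four shared memory reads (the c00 … c11 accumulators in the kernel below):

$$\begin{pmatrix} c_{00} & c_{01} \\ c_{10} & c_{11} \end{pmatrix} \mathrel{{+}{=}} \begin{pmatrix} a_0 \\ a_1 \end{pmatrix} \begin{pmatrix} b_0 & b_1 \end{pmatrix} = \begin{pmatrix} a_0 b_0 & a_0 b_1 \\ a_1 b_0 & a_1 b_1 \end{pmatrix}$$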
%%cuda
#include <iostream>
#include <cstdlib>
#include <cuda_runtime.h>
#include <fstream>
#define TILE_SIZE 16

// Abort with a message on any CUDA API error.
#define CUDA_CHECK(call) do { cudaError_t err_ = (call); if (err_ != cudaSuccess) { \
    std::cerr << "CUDA error: " << cudaGetErrorString(err_) << std::endl; std::exit(1); } } while (0)

// Register tiling: each thread accumulates a 2x2 block of C in registers
// (c00..c11), so every shared-memory load feeds two multiply-adds.
__global__ void matMulKernel(const float *A, const float *B, float *C, int N) {
    __shared__ float As[2 * TILE_SIZE][TILE_SIZE];
    __shared__ float Bs[TILE_SIZE][2 * TILE_SIZE];
    int row0 = blockIdx.y * 2 * TILE_SIZE + threadIdx.y, row1 = row0 + TILE_SIZE;
    int col0 = blockIdx.x * 2 * TILE_SIZE + threadIdx.x, col1 = col0 + TILE_SIZE;
    float c00 = 0.0f, c01 = 0.0f, c10 = 0.0f, c11 = 0.0f;
    for (int t = 0; t < (N + TILE_SIZE - 1) / TILE_SIZE; ++t) {
        int aCol = t * TILE_SIZE + threadIdx.x;
        int bRow = t * TILE_SIZE + threadIdx.y;
        As[threadIdx.y][threadIdx.x] = (row0 < N && aCol < N) ? A[row0 * N + aCol] : 0.0f;
        As[threadIdx.y + TILE_SIZE][threadIdx.x] = (row1 < N && aCol < N) ? A[row1 * N + aCol] : 0.0f;
        Bs[threadIdx.y][threadIdx.x] = (bRow < N && col0 < N) ? B[bRow * N + col0] : 0.0f;
        Bs[threadIdx.y][threadIdx.x + TILE_SIZE] = (bRow < N && col1 < N) ? B[bRow * N + col1] : 0.0f;
        __syncthreads();
        for (int k = 0; k < TILE_SIZE; ++k) {
            float a0 = As[threadIdx.y][k], a1 = As[threadIdx.y + TILE_SIZE][k];
            float b0 = Bs[k][threadIdx.x], b1 = Bs[k][threadIdx.x + TILE_SIZE];
            c00 += a0 * b0;
            c01 += a0 * b1;
            c10 += a1 * b0;
            c11 += a1 * b1;
        }
        __syncthreads();
    }
    if (row0 < N && col0 < N) C[row0 * N + col0] = c00;
    if (row0 < N && col1 < N) C[row0 * N + col1] = c01;
    if (row1 < N && col0 < N) C[row1 * N + col0] = c10;
    if (row1 < N && col1 < N) C[row1 * N + col1] = c11;
}

void matMul(float *A, float *B, float *C, int N, float &ms) {
    float *d_A, *d_B, *d_C;
    size_t size = N * N * sizeof(float);
    CUDA_CHECK(cudaMalloc(&d_A, size));
    CUDA_CHECK(cudaMalloc(&d_B, size));
    CUDA_CHECK(cudaMalloc(&d_C, size));
    CUDA_CHECK(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice));
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    dim3 block(TILE_SIZE, TILE_SIZE);  // each block covers a 2*TILE_SIZE square of C
    dim3 grid((N + 2 * TILE_SIZE - 1) / (2 * TILE_SIZE), (N + 2 * TILE_SIZE - 1) / (2 * TILE_SIZE));
    CUDA_CHECK(cudaEventRecord(start));
    matMulKernel<<<grid, block>>>(d_A, d_B, d_C, N);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));
    CUDA_CHECK(cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
}

int main() {
    std::ifstream inSize("matrix.size");
    if (!inSize.is_open()) {
        std::cerr << "Unable to open input file!" << std::endl;
        return 1;
    }
    int N;
    inSize >> N;
    size_t n = (size_t)N * N;
    float *A = new float[n];
    float *B = new float[n];
    float *C = new float[n];
    for (size_t i = 0; i < n; ++i) { A[i] = 1.0f; B[i] = 1.0f; }  // assumed constant init
    float ms = 0;
    matMul(A, B, C, N, ms);
    std::cout << "Register: N = " << N << ", kernel time = " << ms << " ms" << std::endl;
    std::cout << "Sample outputs: C[0][0] = " << C[0] << ", C[N/2][N/2] = "
              << C[(N/2) * N + (N/2)] << ", C[N-1][N-1] = " << C[(N-1)*N + (N-1)] << std::endl;
    delete[] A;
    delete[] B;
    delete[] C;
    return 0;
}
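As a sanity check: under the constant 1.0 initialization assumed in the cells above, every entry of C equals N, so all three sample prints should read 8192. The invariant is easy to confirm with NumPy at a small size:

```python
import numpy as np

# With all-ones inputs, every entry of C = A @ B equals N,
# regardless of which kernel produced it.
n = 64
A = np.ones((n, n), dtype=np.float32)
assert np.allclose(A @ A, n)
print("all entries of C equal", n)
```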
Benchmarking
import re
import matplotlib.pyplot as plt
# nvcc4jupyter prints each kernel's stdout into its own cell, so the timing
# lines are collected by hand: paste the three lines of the form
# "Naive: N = 8192, kernel time = 12.34 ms" into the placeholder below.
raw_output = """
"""

timings = [(label, float(t)) for label, t in
           re.findall(r"(\w+): N = \d+, kernel time = ([\d.]+) ms", raw_output)]

# Unzip into parallel label/time sequences for plotting
labels, times = zip(*timings)
# Plotting
plt.figure(figsize=(8, 5))
bars = plt.barh(labels, times)
plt.xlabel("Execution Time (ms)")
plt.title("GPU Matrix Multiplication Time Comparison")
plt.gca().invert_yaxis()
# Annotate bars
for bar, t in zip(bars, times):
    plt.text(bar.get_width() + 0.3, bar.get_y() + bar.get_height() / 2,
             f"{t:.2f} ms", va='center')
plt.show()