
Matrix Multiplication on GPU: Optimizations and Insights

In this notebook, we explore GPU implementations of matrix multiplication. We analyze various optimization techniques including:

- Naive implementation
- Shared memory tiling
- Register tiling (+ shared memory)

Each method aims to reduce global memory access and increase computation
throughput.
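
As a rough back-of-envelope estimate (ignoring caches): the naive kernel reads one full row of A and one full column of B from global memory for every output element, whereas a T x T tile staged in shared memory is reused by T threads along each dimension, so

$$ \text{global loads}_{\text{naive}} \approx 2N^3 \qquad\text{vs.}\qquad \text{global loads}_{\text{tiled}} \approx \frac{2N^3}{T}. $$

With T = 16, the tiled kernels should issue roughly 16x fewer global loads.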

Preparing the required packages and initialisations

!pip install nvcc4jupyter

Collecting nvcc4jupyter
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1

%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...


Source files will be saved in "/tmp/tmprk8r5l8j".

from nvcc4jupyter import set_defaults

set_defaults(compiler_args="-arch=sm_75")
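
The -arch=sm_75 flag assumes a Turing-class GPU (compute capability 7.5, e.g. the T4 that Colab commonly assigns). If you are unsure which GPU the runtime provides, a minimal device query such as the sketch below (not part of the original lab) prints the compute capability so the flag can be adjusted:

%%cuda
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    // Query device 0 and report its compute capability (major.minor),
    // which should match the -arch=sm_XY flag passed to nvcc.
    cudaDeviceProp prop;
    cudaError_t err = cudaGetDeviceProperties(&prop, 0);
    if (err != cudaSuccess) {
        std::printf("CUDA Error: %s\n", cudaGetErrorString(err));
        return 1;
    }
    std::printf("%s: compute capability %d.%d\n", prop.name, prop.major, prop.minor);
    return 0;
}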

Setting the matrix size

MATRIX_SIZE = 8192
with open("matrix.size", "w") as size_file:
    size_file.write(str(MATRIX_SIZE))

Naive Implementation

Each thread computes a single element of C, reading one full row of A and one full column of B directly from global memory.

%%cuda
#include <iostream>
#include <cstdlib>
#include <cuda_runtime.h>
#include <fstream>

#define CUDA_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }

inline void gpuAssert(cudaError_t code, const char *file, int line) {
    if (code != cudaSuccess) {
        std::cerr << "CUDA Error: " << cudaGetErrorString(code)
                  << " " << file << ":" << line << std::endl;
        exit(code);
    }
}

__global__ void matMulKernel(float *A, float *B, float *C, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < N && col < N) {
        float sum = 0.0f;
        for (int k = 0; k < N; ++k) {
            sum += A[row * N + k] * B[k * N + col];
        }
        C[row * N + col] = sum;
    }
}

void matMul(float *A, float *B, float *C, int N, float &ms) {
float *d_A, *d_B, *d_C;
size_t size = N * N * sizeof(float);

CUDA_CHECK(cudaMalloc(&d_A, size));
CUDA_CHECK(cudaMalloc(&d_B, size));
CUDA_CHECK(cudaMalloc(&d_C, size));

    CUDA_CHECK(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice));

    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks((N + 15) / 16, (N + 15) / 16);

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaEventRecord(start));

matMulKernel<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N);

CUDA_CHECK(cudaEventRecord(stop));
CUDA_CHECK(cudaEventSynchronize(stop));
CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));

CUDA_CHECK(cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost));

CUDA_CHECK(cudaFree(d_A));
CUDA_CHECK(cudaFree(d_B));
CUDA_CHECK(cudaFree(d_C));
CUDA_CHECK(cudaEventDestroy(start));
CUDA_CHECK(cudaEventDestroy(stop));
}

int main(int argc, char **argv) {

std::ifstream inSize("matrix.size");
if (inSize.is_open() == false) {
std::cerr << "Unable to open input file!" << std::endl;
return 1;
}

int N;
inSize >> N;

    const int size = N * N;

    float *A = new float[size];
    float *B = new float[size];
    float *C = new float[size];

    for (int i = 0; i < size; ++i) {
        A[i] = static_cast<float>(rand()) / RAND_MAX;
        B[i] = static_cast<float>(rand()) / RAND_MAX;
    }

float ms = 0;
matMul(A, B, C, N, ms);

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cerr << "CUDA Error: " << cudaGetErrorString(err) << std::endl;
    }

    std::cout << "Naive: N = " << N << ", kernel time = " << ms << " ms" << std::endl;

    std::ofstream out("naive_time.time"); // No append flag = overwrite
    if (out.is_open()) {
        out << "Naive: N = " << N << ", kernel time = " << ms << " ms" << std::endl;
        out.close();
    } else {
        std::cerr << "Unable to open output file!" << std::endl;
    }

    std::cout << "Sample outputs: C[0][0] = " << C[0] << ", C[N/2][N/2] = " << C[(N/2) * N + (N/2)]
              << ", C[N-1][N-1] = " << C[(N-1) * N + (N-1)] << std::endl;

delete[] A;
delete[] B;
delete[] C;
return 0;
}

Naive: N = 8192, kernel time = 2590.08 ms


Sample outputs: C[0][0] = 2052.07, C[N/2][N/2] = 2059.19, C[N-1][N-1] = 2070.76
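
The sample outputs only spot-check three elements. For a fuller sanity check of all three kernels, a CPU reference comparison can be run at a reduced size (e.g. N = 512, by changing matrix.size), since an O(N^3) CPU loop at N = 8192 would be impractically slow. The helper below is a sketch of such a check (hypothetical, not part of the original lab) and would be called on the host after matMul():

#include <cmath>
#include <iostream>

// Compare the GPU result C against a CPU reference, element by element,
// using a relative tolerance to allow for float accumulation error.
bool verify(const float *A, const float *B, const float *C, int N, float tol = 1e-2f) {
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            float ref = 0.0f;
            for (int k = 0; k < N; ++k)
                ref += A[i * N + k] * B[k * N + j];
            if (std::fabs(ref - C[i * N + j]) > tol * std::fabs(ref)) {
                std::cerr << "Mismatch at (" << i << "," << j << "): got "
                          << C[i * N + j] << ", expected " << ref << std::endl;
                return false;
            }
        }
    }
    return true;
}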

Shared Memory Tiling

This kernel uses shared memory to reduce redundant loads from global
memory. Each block loads a tile of A and a tile of B into shared memory,
accumulates the partial products for that tile, and synchronizes before
moving on to the next tile.
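
A quick check on the footprint (assuming the usual 48 KB per-block shared-memory limit): the two tiles occupy

$$ 2 \times 16 \times 16 \times 4\,\text{B} = 2\,\text{KiB per block}, $$

so shared-memory capacity does not constrain occupancy for this kernel.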

%%cuda
#include <iostream>
#include <cstdlib>
#include <cuda_runtime.h>
#include <fstream>

#define TILE_SIZE 16

#define CUDA_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }

inline void gpuAssert(cudaError_t code, const char *file, int line) {
    if (code != cudaSuccess) {
        std::cerr << "CUDA Error: " << cudaGetErrorString(code)
                  << " " << file << ":" << line << std::endl;
        exit(code);
    }
}

__global__ void matMulKernel(float *A, float *B, float *C, int N) {
    __shared__ float tileA[TILE_SIZE][TILE_SIZE];
    __shared__ float tileB[TILE_SIZE][TILE_SIZE];

    int row = blockIdx.y * TILE_SIZE + threadIdx.y;
    int col = blockIdx.x * TILE_SIZE + threadIdx.x;

    float val = 0.0;

    for (int t = 0; t < (N + TILE_SIZE - 1) / TILE_SIZE; ++t) {
        if (row < N && t * TILE_SIZE + threadIdx.x < N)
            tileA[threadIdx.y][threadIdx.x] = A[row * N + t * TILE_SIZE + threadIdx.x];
        else
            tileA[threadIdx.y][threadIdx.x] = 0;

        if (t * TILE_SIZE + threadIdx.y < N && col < N)
            tileB[threadIdx.y][threadIdx.x] = B[(t * TILE_SIZE + threadIdx.y) * N + col];
        else
            tileB[threadIdx.y][threadIdx.x] = 0;

        __syncthreads();

        for (int i = 0; i < TILE_SIZE; ++i)
            val += tileA[threadIdx.y][i] * tileB[i][threadIdx.x];

        __syncthreads();
    }

    if (row < N && col < N)
        C[row * N + col] = val;
}

void matMul(float *A, float *B, float *C, int N, float &ms) {
float *d_A, *d_B, *d_C;
size_t size = N * N * sizeof(float);

CUDA_CHECK(cudaMalloc(&d_A, size));
CUDA_CHECK(cudaMalloc(&d_B, size));
CUDA_CHECK(cudaMalloc(&d_C, size));

    CUDA_CHECK(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice));

    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks((N + 15) / 16, (N + 15) / 16);

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaEventRecord(start));
matMulKernel<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N);

CUDA_CHECK(cudaEventRecord(stop));
CUDA_CHECK(cudaEventSynchronize(stop));
CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));

CUDA_CHECK(cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost));

CUDA_CHECK(cudaFree(d_A));
CUDA_CHECK(cudaFree(d_B));
CUDA_CHECK(cudaFree(d_C));
CUDA_CHECK(cudaEventDestroy(start));
CUDA_CHECK(cudaEventDestroy(stop));
}

int main(int argc, char **argv) {

std::ifstream inSize("matrix.size");
if (inSize.is_open() == false) {
std::cerr << "Unable to open input file!" << std::endl;
return 1;
}

int N;
inSize >> N;

    const int size = N * N;

    float *A = new float[size];
    float *B = new float[size];
    float *C = new float[size];

    for (int i = 0; i < size; ++i) {
        A[i] = static_cast<float>(rand()) / RAND_MAX;
        B[i] = static_cast<float>(rand()) / RAND_MAX;
    }

float ms = 0;
matMul(A, B, C, N, ms);

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cerr << "CUDA Error: " << cudaGetErrorString(err) << std::endl;
    }

    std::cout << "Shared: N = " << N << ", kernel time = " << ms << " ms" << std::endl;

    std::ofstream out("shared_time.time"); // No append flag = overwrite
    if (out.is_open()) {
        out << "Shared: N = " << N << ", kernel time = " << ms << " ms" << std::endl;
        out.close();
    } else {
        std::cerr << "Unable to open output file!" << std::endl;
    }

    std::cout << "Sample outputs: C[0][0] = " << C[0] << ", C[N/2][N/2] = " << C[(N/2) * N + (N/2)]
              << ", C[N-1][N-1] = " << C[(N-1) * N + (N-1)] << std::endl;

delete[] A;
delete[] B;
delete[] C;
return 0;
}

Shared: N = 8192, kernel time = 1690.28 ms


Sample outputs: C[0][0] = 2052.07, C[N/2][N/2] = 2059.19, C[N-1][N-1] = 2070.76

Register Tiling

Instead of computing one output element per thread, each thread computes a
small 2x2 tile of outputs that it accumulates in registers. Register accesses
are the fastest in the memory hierarchy, and each value fetched from shared
memory is reused for two results.
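
Concretely, in the inner product loop below each thread reads four values from shared memory (a0, a1, b0, b1) and performs four multiply-accumulates, i.e.

$$ \frac{8\ \text{FLOPs}}{4\ \text{shared loads}} = 2 \qquad\text{vs.}\qquad \frac{2\ \text{FLOPs}}{2\ \text{shared loads}} = 1 $$

for the plain tiled kernel, roughly doubling the work done per shared-memory access.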

%%cuda
#include <iostream>
#include <cstdlib>
#include <cuda_runtime.h>
#include <fstream>

#define TILE_SIZE 16

#define CUDA_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }

inline void gpuAssert(cudaError_t code, const char *file, int line) {
    if (code != cudaSuccess) {
        std::cerr << "CUDA Error: " << cudaGetErrorString(code)
                  << " " << file << ":" << line << std::endl;
        exit(code);
    }
}

__global__ void matMulKernel(const float* A, const float* B, float* C, int N) {
    __shared__ float tileA[TILE_SIZE][TILE_SIZE];
    __shared__ float tileB[TILE_SIZE][TILE_SIZE];

    // Compute global row/col for 2x2 tile
    int row = blockIdx.y * TILE_SIZE + threadIdx.y * 2;
    int col = blockIdx.x * TILE_SIZE + threadIdx.x * 2;

    float c00 = 0, c01 = 0, c10 = 0, c11 = 0;

    for (int t = 0; t < (N + TILE_SIZE - 1) / TILE_SIZE; ++t) {
        // Load 2 rows of A into shared memory
        for (int i = 0; i < 2; ++i) {
            int r = row + i;
            int c = t * TILE_SIZE + threadIdx.x * 2;
            if (r < N && c < N)     tileA[threadIdx.y * 2 + i][threadIdx.x * 2]     = A[r * N + c];
            else                    tileA[threadIdx.y * 2 + i][threadIdx.x * 2]     = 0.0f;
            if (r < N && c + 1 < N) tileA[threadIdx.y * 2 + i][threadIdx.x * 2 + 1] = A[r * N + c + 1];
            else                    tileA[threadIdx.y * 2 + i][threadIdx.x * 2 + 1] = 0.0f;
        }
        // Load 2 columns of B into shared memory
        for (int i = 0; i < 2; ++i) {
            int r = t * TILE_SIZE + threadIdx.y * 2;
            int c = col + i;
            if (r < N && c < N)     tileB[threadIdx.y * 2][threadIdx.x * 2 + i]     = B[r * N + c];
            else                    tileB[threadIdx.y * 2][threadIdx.x * 2 + i]     = 0.0f;
            if (r + 1 < N && c < N) tileB[threadIdx.y * 2 + 1][threadIdx.x * 2 + i] = B[(r + 1) * N + c];
            else                    tileB[threadIdx.y * 2 + 1][threadIdx.x * 2 + i] = 0.0f;
        }

        __syncthreads();

        // Multiply shared memory tiles
        for (int k = 0; k < TILE_SIZE; ++k) {
            float a0 = tileA[threadIdx.y * 2][k];
            float a1 = tileA[threadIdx.y * 2 + 1][k];
            float b0 = tileB[k][threadIdx.x * 2];
            float b1 = tileB[k][threadIdx.x * 2 + 1];

            c00 += a0 * b0;
            c01 += a0 * b1;
            c10 += a1 * b0;
            c11 += a1 * b1;
        }

        __syncthreads();
    }

    // Write back to global memory
    if (row < N && col < N) {
        C[row * N + col] = c00;
        if (col + 1 < N) C[row * N + col + 1] = c01;
        if (row + 1 < N) C[(row + 1) * N + col] = c10;
        if (row + 1 < N && col + 1 < N) C[(row + 1) * N + col + 1] = c11;
    }
}

void matMul(float *A, float *B, float *C, int N, float &ms) {
float *d_A, *d_B, *d_C;
size_t size = N * N * sizeof(float);

CUDA_CHECK(cudaMalloc(&d_A, size));
CUDA_CHECK(cudaMalloc(&d_B, size));
CUDA_CHECK(cudaMalloc(&d_C, size));

    CUDA_CHECK(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice));

    // Each thread produces a 2x2 output tile, so a (TILE_SIZE/2) x (TILE_SIZE/2)
    // block of threads still covers a TILE_SIZE x TILE_SIZE tile of C.
    dim3 threadsPerBlock(TILE_SIZE / 2, TILE_SIZE / 2);
    dim3 numBlocks((N + TILE_SIZE - 1) / TILE_SIZE, (N + TILE_SIZE - 1) / TILE_SIZE);

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaEventRecord(start));

matMulKernel<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N);

CUDA_CHECK(cudaEventRecord(stop));
CUDA_CHECK(cudaEventSynchronize(stop));
CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));

CUDA_CHECK(cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost));

CUDA_CHECK(cudaFree(d_A));
CUDA_CHECK(cudaFree(d_B));
CUDA_CHECK(cudaFree(d_C));
CUDA_CHECK(cudaEventDestroy(start));
CUDA_CHECK(cudaEventDestroy(stop));
}

int main(int argc, char **argv) {

std::ifstream inSize("matrix.size");
if (inSize.is_open() == false) {
std::cerr << "Unable to open input file!" << std::endl;
return 1;
}

int N;
inSize >> N;

    const int size = N * N;

    float *A = new float[size];
    float *B = new float[size];
    float *C = new float[size];

    for (int i = 0; i < size; ++i) {
        A[i] = static_cast<float>(rand()) / RAND_MAX;
        B[i] = static_cast<float>(rand()) / RAND_MAX;
    }

float ms = 0;
matMul(A, B, C, N, ms);

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cerr << "CUDA Error: " << cudaGetErrorString(err) << std::endl;
    }

    std::cout << "Register: N = " << N << ", kernel time = " << ms << " ms" << std::endl;

    std::ofstream out("register_time.time"); // No append flag = overwrite
    if (out.is_open()) {
        out << "Register: N = " << N << ", kernel time = " << ms << " ms" << std::endl;
        out.close();
    } else {
        std::cerr << "Unable to open output file!" << std::endl;
    }

    std::cout << "Sample outputs: C[0][0] = " << C[0] << ", C[N/2][N/2] = " << C[(N/2) * N + (N/2)]
              << ", C[N-1][N-1] = " << C[(N-1) * N + (N-1)] << std::endl;

delete[] A;
delete[] B;
delete[] C;
return 0;
}

Register: N = 8192, kernel time = 1625.12 ms


Sample outputs: C[0][0] = 2052.07, C[N/2][N/2] = 2059.19, C[N-1][N-1] = 2070.76

Benchmarking

import re
import matplotlib.pyplot as plt

# Files you want to parse
time_files = [
"naive_time.time",
"shared_time.time",
"register_time.time",
]

timings = []

# Extract label and time from each file
for file in time_files:
    with open(file, 'r') as f:
        content = f.read()
    match = re.search(r"^\s*([A-Za-z0-9+ ]+):.*?kernel time\s*=\s*([0-9.]+)", content)
    if match:
        label = match.group(1)
        time = float(match.group(2))
        timings.append((label, time))
    else:
        print(f"Warning: Couldn't parse {file}")

# Unzip
labels, times = zip(*timings)

# Plotting
plt.figure(figsize=(8, 5))
bars = plt.barh(labels, times)
plt.xlabel("Execution Time (ms)")
plt.title("GPU Matrix Multiplication Time Comparison")
plt.gca().invert_yaxis()

# Annotate bars
for bar, t in zip(bars, times):
    plt.text(bar.get_width() + 0.3, bar.get_y() + bar.get_height() / 2,
             f"{t:.2f} ms", va='center')

plt.grid(axis='x', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()

[Figure: horizontal bar chart comparing kernel execution times for the Naive, Shared, and Register implementations]
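
For context, the measured times can be converted to achieved throughput: a full N x N multiplication performs 2N^3 floating-point operations, so

$$ \text{GFLOP/s} = \frac{2N^3}{t \cdot 10^9} \quad\Rightarrow\quad \text{Naive} \approx 425, \quad \text{Shared} \approx 650, \quad \text{Register} \approx 677 $$

for N = 8192 and the kernel times reported above (values are approximate).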
