
//==---------- joint_matrix_bfloat16.cpp - DPC++ joint_matrix -----------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: matrix

// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX=1
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out

#include <chrono>
#include <cmath>
#include <cstdint>
#include <iostream>

#include <CL/sycl.hpp>
#include <ext/oneapi/experimental/bfloat16.hpp>
#include <ext/oneapi/matrix/matrix.hpp>

using namespace sycl;
using namespace sycl::ext::oneapi::experimental::matrix;
using bfloat16 = sycl::ext::oneapi::experimental::bfloat16;

#define SG_SZ 8

#define TM 8
#define TN 8
#define TK 16
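
// Tile shapes used by the kernel below: the accumulator tile of C is
// TM x TN floats, the A tile is TM x TK bfloat16 values, and the B tile is
// TK x TN bfloat16 values stored in the packed (VNNI) layout described below.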

template <typename T, size_t NUM_ROWS, size_t NUM_COLS>
struct big_matrix
{
private:
  T *mat;

public:
  T *get_data() { return mat; }
  void set_data(T *data) { mat = data; }
  big_matrix(T *data) : mat(data) {}
};

template <typename T1, typename T2, size_t M, size_t N, size_t K>
void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
                     big_matrix<T2, K / 2, N * 2> &B)
{
  size_t NDRangeM = M / TM;
  size_t NDRangeN = N / TN;
  buffer<bfloat16, 2> bufA(A.get_data(), range<2>(M, K));
  // B is stored as (K / 2) x (N * 2) in VNNI layout; the buffer is declared
  // as K x N, which covers the same number of elements.
  buffer<bfloat16, 2> bufB(B.get_data(), range<2>(K, N));
  buffer<float, 2> bufC((float *)C.get_data(), range<2>(M, N));

  sycl::default_selector d_selector;
  sycl::queue q(d_selector);
  // Print out the device information used for the kernel code.
  std::cout << "Running on device: "
            << q.get_device().get_info<sycl::info::device::name>() << "\n";
  try
  {
    q.submit([&](handler &cgh) {
      auto accC = bufC.get_access<access::mode::read_write>(cgh);
      auto accA = bufA.get_access<access::mode::read_write>(cgh);
      auto accB = bufB.get_access<access::mode::read_write>(cgh);

      cgh.parallel_for<class imatrix>(
          nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
          [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] {
            // The submatrix API has to be accessed by all the work-items in a
            // sub-group; these functions are called once per sub-group, with
            // no code divergence between the work-items.
            const auto global_idx = spmd_item.get_global_id(0);
            const auto global_idy = spmd_item.get_global_id(1);
            const auto sg_startx = global_idx - spmd_item.get_local_id(0);
            const auto sg_starty = global_idy - spmd_item.get_local_id(1);

            ext::oneapi::sub_group sg = spmd_item.get_sub_group();
            joint_matrix<bfloat16, TM, TK> sub_a(sg);
            // For B, since the current implementation does not support a
            // non-packed layout, users need to specify the updated VNNI sizes
            // along with the packed_b layout. By default, the layout is
            // row_major and the size is (TK, TN).
            joint_matrix<bfloat16, TK, TN, matrix_layout::packed_b> sub_b(sg);
            joint_matrix<float, TM, TN> sub_c(sg);

            joint_matrix_fill(sg, sub_c, 0);

            for (int k = 0; k < K / TK; k += 1)
            {
              joint_matrix_load(
                  sg, sub_a, accA.get_pointer() + (sg_startx * TM) * K + k * TK,
                  K, matrix_layout::row_major);
              // Assuming B data is already in VNNI format.
              joint_matrix_load(sg, sub_b,
                                accB.get_pointer() + (k * TK / 2) * (N * 2) +
                                    sg_starty / SG_SZ * TN * 2,
                                N * 2, matrix_layout::packed_b);
              sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
            }
            joint_matrix_store(sg, sub_c,
                               accC.get_pointer() + (sg_startx * TM) * N +
                                   sg_starty / SG_SZ * TN,
                               N, matrix_layout::row_major);
          }); // parallel_for
    }).wait();
  }
  catch (std::exception const &e)
  {
    std::cout << "An exception was caught when performing AMX/XMX matrix "
                 "multiply.\n";
    std::terminate();
  }
}
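
// Worked example of the load addressing above, assuming the sizes used in
// this test (M = N = 1024, K = 2048, SG_SZ = 8): the sub-group starting at
// (sg_startx, sg_starty) = (0, 8) owns the C tile at block row 0, block
// column 1. At k = 1 it loads
//   A at row-major offset (0 * TM) * K + 1 * TK                 = 16
//   B at packed offset    (1 * TK / 2) * (N * 2) + 1 * (TN * 2) = 16400
// i.e. rows 0..7 of A starting at column 16, and VNNI rows 8..15 of B,
// which hold original rows 16..31 at original columns 8..15.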

static constexpr size_t MATRIX_M = TM * 128;
static constexpr size_t MATRIX_N = TN * 128;
static constexpr size_t MATRIX_K = TK * 128;
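
// With TM = TN = 8 and TK = 16, this gives M = N = 1024 and K = 2048, so
// matrix_multiply dispatches NDRangeM * NDRangeN = 128 * 128 sub-groups, one
// per 8x8 tile of C, each performing K / TK = 128 joint_matrix_mad steps.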
bfloat16 A[MATRIX_M][MATRIX_K];
bfloat16 B[MATRIX_K / 2][MATRIX_N * 2];
unsigned short Aref[MATRIX_M][MATRIX_K];
unsigned short Bref[MATRIX_K / 2][MATRIX_N * 2];
float C[MATRIX_M][MATRIX_N];
float D[MATRIX_M][MATRIX_N];
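
// A minimal sketch of how a row-major K x N bf16 matrix would be re-packed
// into the (K / 2) x (N * 2) VNNI layout the kernel assumes for B: pairs of
// consecutive rows are interleaved along the columns, so
// packed[k / 2][n * 2 + k % 2] == src[k][n]. The helper name vnni_pack_ref
// is hypothetical and illustrative only; this test initializes B directly in
// packed coordinates instead.
static void vnni_pack_ref(const unsigned short *src, unsigned short *dst,
                          size_t K, size_t N)
{
  for (size_t k = 0; k < K; k++)
    for (size_t n = 0; n < N; n++)
      dst[(k / 2) * (N * 2) + n * 2 + (k % 2)] = src[k * N + n];
}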

float make_fp32(short x)
{
  // Place the bf16 bit pattern in the upper 16 bits of a 32-bit word; the
  // lower 16 mantissa bits are zero.
  unsigned int y = x;
  y = y << 16;
  float *res = reinterpret_cast<float *>(&y);
  return *res;
}

unsigned short make_bf16(float x)
{
  // Truncate a float to bfloat16 by keeping only its upper 16 bits.
  int *res = reinterpret_cast<int *>(&x);
  *res = *res >> 16;
  return (unsigned short)*res;
}
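
// Worked example: 1.0f has IEEE-754 bit pattern 0x3F800000, so
// make_bf16(1.0f) keeps the top 16 bits, 0x3F80, and make_fp32(0x3F80)
// restores 0x3F800000, i.e. exactly 1.0f. Truncation performs no rounding,
// so the low 16 mantissa bits of an arbitrary float are simply dropped.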

void matrix_multiply_ref(int *A_mem, int *B_mem, int *C_mem, int M, int N,
                         int K)
{
  // Each int element holds a VNNI pair of two bf16 values.
  for (int m = 0; m < M; m++)
    for (int n = 0; n < N; n++)
    {
      for (int k = 0; k < K; k++)
      {
        short *va = (short *)(A_mem + m * K + k);
        short *vb = (short *)(B_mem + k * N + n);
        float acc = *((float *)(C_mem + m * N + n));
        // FIXME: Should we do reduce-add in another version?
        for (int i = 0; i < 2; i++)
        {
          acc += (make_fp32(va[i]) * make_fp32(vb[i]));
        }
        *((float *)(C_mem + m * N + n)) = acc;
      }
    }
}
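
// Note: main() invokes this with K = MATRIX_K / 2 = 1024, since each int
// step of the k loop consumes one VNNI pair, i.e. two bf16 values, so 1024
// iterations cover all 2048 bf16 columns of A.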

void initialize_matrices()
{
  for (int i = 0; i < MATRIX_M; i++)
  {
    for (int j = 0; j < MATRIX_K; j++)
    {
      // bfloat16 is created using unsigned short since conversion from float
      // to bfloat16 is not supported on the host side yet.
      A[i][j] = bfloat16::from_bits(make_bf16(1.0f * (i + j)));
      Aref[i][j] = make_bf16(1.0f * (i + j));
    }
  }
  for (int i = 0; i < MATRIX_K / 2; i++)
  {
    for (int j = 0; j < MATRIX_N * 2; j++)
    {
      B[i][j] = bfloat16::from_bits(make_bf16(2.0f * i + 3.0f * j));
      Bref[i][j] = make_bf16(2.0f * i + 3.0f * j);
    }
  }
  for (int i = 0; i < MATRIX_M; i++)
  {
    for (int j = 0; j < MATRIX_N; j++)
    {
      C[i][j] = 1.0;
      D[i][j] = 1.0;
    }
  }
}

int main()
{
  initialize_matrices();

  big_matrix<float, MATRIX_M, MATRIX_N> MC((float *)&C);
  big_matrix<float, MATRIX_M, MATRIX_N> MD((float *)&D);
  big_matrix<bfloat16, MATRIX_M, MATRIX_K> MA((bfloat16 *)&A);
  big_matrix<bfloat16, MATRIX_K / 2, MATRIX_N * 2> MB((bfloat16 *)&B);

  auto start = std::chrono::steady_clock::now();
  matrix_multiply(MC, MA, MB);
  auto end = std::chrono::steady_clock::now();
  std::cout << "Elapsed time in milliseconds (accelerated): "
            << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
            << " ms" << std::endl;

  start = std::chrono::steady_clock::now();
  matrix_multiply_ref((int32_t *)Aref, (int32_t *)Bref, (int32_t *)D, MATRIX_M,
                      MATRIX_N, MATRIX_K / 2);
  end = std::chrono::steady_clock::now();
  std::cout << "Elapsed time in milliseconds (reference): "
            << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
            << " ms" << std::endl;

  bool res = true;
  for (int i = 0; i < MATRIX_M; i++)
  {
    for (int j = 0; j < MATRIX_N; j++)
    {
      // Compare with a relative tolerance of 1e-5; std::fabs avoids the
      // integer abs() overload truncating the float difference.
      if (std::fabs(C[i][j] - D[i][j]) > C[i][j] / 1e5)
      {
        res = false;
      }
    }
  }
  if (res)
    std::cout << "passed\n";
  else
    std::cout << "failed\n";
}
