SlideShare a Scribd company logo
Basic Example: Matrix
                  Multiplication using CUDA

                   General-purpose Programming of Massively Parallel
                                 Graphics Processors
                                               Shiraz University, Spring 2010
                                                  Instructor: Reza Azimi


                                Some materials/slides are adapted from:
                          Andreas Moshovos’ Course at the University of Toronto
                              UIUC course by Wen-Mei Hwu and David Kirk
                                                                                                                     




     (      6 07 4    7 6 5 4 32 1 0)                                    0
           A    @  9 8                                                       B A



 // Reference CPU implementation: computes P = M * N for square matrices.
 // All three matrices are Width x Width and stored as flat row-major arrays,
 // i.e. element (r, c) lives at index r * Width + c.
 //   M, N  - input matrices (Width * Width floats each), read only
 //   P     - output matrix (Width * Width floats), every element is overwritten
 //   Width - number of rows (and columns) of each matrix
 void MatrixMulOnHost(float* M, float* N, float* P, int Width) {
   for (int i = 0; i < Width; ++i) {
     for (int j = 0; j < Width; ++j) {
       // Dot product of row i of M with column j of N.
       float sum = 0.0f;
       for (int k = 0; k < Width; ++k) {
         float a = M[i * Width + k];
         float b = N[k * Width + j];
         sum += a * b;
       }
       P[i * Width + j] = sum;
     }
   }
 }
                                                          M                                         P

                                                                     i
                                                                                                                        WIDTH




                                                              k

Adapted From:
             © ¤ © § ¤©  ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡
                         £                                           WIDTH
                                                          ©¨©  §  % $ ! ¦ ©¤ # ! ! ¤ ©       WIDTH       '      2
David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC




                                                                                                                                1
60     IH 34   4 G F ED
     A  8P A       A@




 // Single-block kernel: each thread computes one element of d_P = d_M * d_N.
 // The matrices are Width x Width, stored flat in row-major order
 // (element (row, col) is at index row * Width + col).
 // Only threadIdx is used to pick the element, so block indices are ignored:
 // the kernel is meant to be launched with a single Width x Width thread block.
 //   d_M, d_N - input matrices
 //   d_P      - output matrix
 //   Width    - number of rows (and columns) of each matrix
 __global__
 void MatrixMulKernel(float* d_M,
                      float* d_N,
                      float* d_P,
                      int Width) {
   int row = threadIdx.y;  // row of d_M / d_P handled by this thread
   int col = threadIdx.x;  // column of d_N / d_P handled by this thread

   // Threads outside the matrix (block larger than Width x Width) do nothing.
   if (row >= Width || col >= Width) return;

   // Dot product of row `row` of d_M with column `col` of d_N.
   float P_val = 0.0f;
   for (int k = 0; k < Width; ++k) {
     float M_elem = d_M[row * Width + k];
     float N_elem = d_N[k * Width + col];
     P_val += M_elem * N_elem;
   }
   d_P[row * Width + col] = P_val;
 }




                                                                                                                          WIDTH
                                                            k

             © ¤ © § ¤©  ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡
                         £                                           WIDTH
                                                          ©¨©  §  % $ ! ¦ ©¤ # ! ! ¤ ©         WIDTH       C      3
Adapted From:
David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC




     4W 7              60    6 33 (         I2  R
    @     AT @ V U AT A  8TP        T8T S 8   A




   void MatrixMulOnDevice(float* M,
                          float* N,
                          float* P,
                          int Width)
   {
      int matrix_size = Width * Width * sizeof(float);
      float *d_M, *d_N, *d_P;

        // Allocate and Load M and N to device memory
        cudaMalloc(&d_M, matrix_size);
        cudaMemcpy(d_M, M, matrix_size, cudaMemcpyHostToDevice);

        cudaMalloc(&d_N, matrix_size);
        cudaMemcpy(d_N, N, matrix_size, cudaMemcpyHostToDevice);

        // Allocate P on the device
        cudaMalloc(&d_P, matrix_size);

             © ¤ © § ¤©  ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡
Adapted From:
                         £                                ©¨©  §  % $ ! ¦ ©¤ # ! ! ¤ ©                     Q      4
David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC




                                                                                                                                  2
26         60    6    R 34   4G                                                             3I7 4        7
     a `   B U AT A  8TP   YA      A@                                                                        8




         // Setup the execution configuration
         dim3 dimGrid(1, 1);
         dim3 dimBlock(Width, Width);


      // Launch the device computation threads!
      MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P,
   Width);

         // Copy back the results from device to host
         cudaMemcpy(P, d_P, matrix_size, cudaMemcpyDeviceToHost);

         // Free up the device memory matrices
         cudaFree(d_P);
         cudaFree(d_M);
         cudaFree(d_N);
     © David Kirk/NVIDIA and Wen-mei W. Hwu, 2007-2009
              © ¤ © § ¤©  ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡
                          £                                ©¨©  §  % $ ! ¦ ©¤ # ! ! ¤ ©                          X   5
     ECE 498AL Spring 2010, University of Illinois, Urbana-Champaign




    Only One Thread Block Used
     c
         One Block of threads compute                                                      Grid 1                    d_N
         matrix d_P                                                                     Block 1
                                                                                                                       2

                                                                                                                       4
     d
         Each thread
           e
               Loads a row of matrix d_M                                                       Thread
                                                                                                (2, 2)
                                                                                                                       2
           e
               Loads a column of matrix d_N                                                                            6
           e
               Perform one multiply and
               addition for each pair of d_M
               and d_N elements
           e
               Computes one element of d_P


                                                                                 3     2       5         4             48
   Size of matrix limited by
   the number of threads
   allowed in a thread block
                                                                                     WIDTH                           d_P
                                                                                     d_M
Adapted From:
              © ¤ © § ¤©  ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡
                          £
David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC
                                                           ©¨©  §  % $ ! ¦ ©¤ # ! ! ¤ ©                          b   6




                                                                                                                                    3
6 0 I 36)                  6r 4 6     4 p   p   i 4  0D
  A   8                     s@    @  q UT @  V  PT     Y   hg




                                                             threadIdx.x                           TILE_WIDTH


                                                         d_P
                                                                                                                    TILE_


                               threadIdx.y




                                                                                         7

Each thread is assigned to
a Tile of
TILE_WIDTHxTILE_WIDTH
entries


            © ¤ © § ¤©  ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡
                        £                                ©¨©  §  % $ ! ¦ ©¤ # ! ! ¤ ©                f




  Solution 1: Give Each Thread More
  Work
   // Single-block tiled kernel: each thread computes a TILE_WIDTH x TILE_WIDTH
   // tile of d_P = d_M * d_N instead of a single element.
   // The matrices are Width x Width, stored flat in row-major order.
   // Only threadIdx is used, so all work is done by one thread block.
   // Slide note: with one block we utilize only one multiprocessor!
   // TILE_WIDTH is not defined in this snippet; it must be supplied elsewhere
   // as a compile-time constant or macro.
   //   d_M, d_N - input matrices
   //   d_P      - output matrix
   //   Width    - number of rows (and columns) of each matrix
   __global__ void MatrixMulKernel(float* d_M,
                         float* d_N,
                         float* d_P,
                         int Width) {
     // Half-open tile range [start, end) owned by this thread, clamped to the
     // matrix so a partial tile at the edge never indexes out of bounds.
     int start_row = threadIdx.y * TILE_WIDTH;
     int end_row = start_row + TILE_WIDTH;
     if (end_row > Width) end_row = Width;
     int start_col = threadIdx.x * TILE_WIDTH;
     int end_col = start_col + TILE_WIDTH;
     if (end_col > Width) end_col = Width;

     for (int row = start_row; row < end_row; row++) {
       for (int col = start_col; col < end_col; col++) {
         // Dot product of row `row` of d_M with column `col` of d_N.
         float P_val = 0.0f;
         for (int k = 0; k < Width; ++k) {
           float M_elem = d_M[row * Width + k];
           float N_elem = d_N[k * Width + col];
           P_val += M_elem * N_elem;
         }
         d_P[row * Width + col] = P_val;
       }
     }
   }




                                                                                                                            4
63    4 p   4 32 0 3I   47 F     6 0 I 36)                                                           7
  w UT @  V       8   q      hv A   8                                                           sP


                                                             threadIdx.x

                                    blockIdx.x                                     blockDim.x

       d_P
                                                                                                blockDim.y



      blockIdx.y




                                                              9                                 assigned to a
     threadIdx.y                                                                                thread

                                                                                                assigned to a
                                                                                                thread block



       © ¤ © § ¤©  ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡
                   £                                ©¨©  §  % $ ! ¦ ©¤ # ! ! ¤ ©                  u




Solution 2: Use Multiple Thread
Blocks
// Multi-block kernel: each thread computes one element of d_P = d_M * d_N.
// The matrices are Width x Width, stored flat in row-major order.
// The element handled by a thread is derived from its global 2D index
// (blockIdx * blockDim + threadIdx), so the grid as a whole must cover at
// least Width x Width threads.
//   d_M, d_N - input matrices
//   d_P      - output matrix
//   Width    - number of rows (and columns) of each matrix
__global__
void MatrixMulKernel(float* d_M,
                   float* d_N,
                   float* d_P,
                   int Width) {
  int row = blockIdx.y * blockDim.y + threadIdx.y;
  int col = blockIdx.x * blockDim.x + threadIdx.x;

  // Threads past the matrix edge (grid larger than the matrix) do nothing.
  if (row >= Width || col >= Width) return;

  // Dot product of row `row` of d_M with column `col` of d_N.
  float P_val = 0.0f;
  for (int k = 0; k < Width; ++k) {
    float M_elem = d_M[row * Width + k];
    float N_elem = d_N[k * Width + col];
    P_val += M_elem * N_elem;
  }
  d_P[row * Width + col] = P_val;
}


       © ¤ © § ¤©  ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡
                   £                                ©¨©  §  % $ ! ¦ ©¤ # ! ! ¤ ©               x 




                                                                                                                 5
26         60    6    R 34   4G                                                      3I7 4       7
 a `   B U AT A  8TP   YA      A@                                                                8




     // Edge length of a (square) thread block. block_size * block_size must not
     // exceed the per-block thread limit (512 on the device described in this
     // deck): 64 x 64 = 4096 would make the launch fail, 16 x 16 = 256 fits.
     int block_size = 16;

     // Setup the execution configuration
     // (integer division: assumes Width is a multiple of block_size)
     dim3 dimGrid(Width/block_size, Width/block_size);
     dim3 dimBlock(block_size, block_size);


   // Launch the device computation threads!
   MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P,
                                          Width);

     …
                                                                            Size of matrix limited by the
                                                                            number of threads allowed
                                                                            on a device



        © ¤ © § ¤©  ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡
                    £                                ©¨©  §  % $ ! ¦ ©¤ # ! ! ¤ ©                     11




  60    01 0    4 p         D                                                    7
 A  8T8     ‚ UT @  V  € v yV




 ƒ
     Max Number of Threads per Block: 512
 ƒ
     Max Number of Blocks per Streaming
     Multiprocessor: 8
 ƒ
     Number of Streaming Multiprocessors: 30
 ƒ
     Total Number of Threads Available =
                30 x 8 x 512 = 122880

                          Let me double-check this!

        © ¤ © § ¤©  ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡
                    £                                ©¨©  §  % $ ! ¦ ©¤ # ! ! ¤ ©                ' 




                                                                                                                      6
6 0 I 36) 6    4p     0  0„ 16                                                   7
    A   8        †V    8 …A A       B



                                                       threadIdx.x
                                blockIdx.x                                    blockDim.x

     d_P
                                                                                                 blockDim.y



     blockIdx.y


                                                                                                 TILE_WIDTH


                                                        13
    threadIdx.y
                                                                                                              TILE_WIDT




          © ¤ © § ¤©  ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡
                      £                                ©¨©  §  % $ ! ¦ ©¤ # ! ! ¤ ©                  C 




     6 0 I 36) 6    4p     0  0„ 16                                                   7
    A   8        †V    8 …A A       B




// Multi-block tiled kernel: each thread computes a TILE_WIDTH x TILE_WIDTH
// tile of d_P = d_M * d_N.
// The matrices are Width x Width, stored flat in row-major order.
// A thread's tile is selected by its global 2D thread index, so one block
// covers (blockDim.y * TILE_WIDTH) rows by (blockDim.x * TILE_WIDTH) columns.
// TILE_WIDTH is not defined in this snippet; it must be supplied elsewhere
// as a compile-time constant or macro.
//   d_M, d_N - input matrices
//   d_P      - output matrix
//   Width    - number of rows (and columns) of each matrix
__global__ void MatrixMulKernel(float* d_M,
                     float* d_N,
                     float* d_P,
                     int Width) {
  // Global tile coordinates of this thread. The whole global thread index is
  // scaled by TILE_WIDTH; scaling only threadIdx would make the tiles of
  // neighbouring blocks overlap and leave the end of the matrix uncomputed.
  int tile_row = blockIdx.y * blockDim.y + threadIdx.y;
  int tile_col = blockIdx.x * blockDim.x + threadIdx.x;

  // Half-open element range [start, end) of the tile, clamped to the matrix.
  int start_row = tile_row * TILE_WIDTH;
  int end_row = start_row + TILE_WIDTH;
  if (end_row > Width) end_row = Width;
  int start_col = tile_col * TILE_WIDTH;
  int end_col = start_col + TILE_WIDTH;
  if (end_col > Width) end_col = Width;

  for (int row = start_row; row < end_row; row++) {
    for (int col = start_col; col < end_col; col++) {
      // Dot product of row `row` of d_M with column `col` of d_N.
      float P_val = 0.0f;
      for (int k = 0; k < Width; ++k) {
        float M_elem = d_M[row * Width + k];
        float N_elem = d_N[k * Width + col];
        P_val += M_elem * N_elem;
      }
      d_P[row * Width + col] = P_val;
    }
  }
}
                      £                                ©¨©  §  % $ ! ¦ ©¤ # ! ! ¤ ©                  Q 




                                                                                                                          7
Ad

More Related Content

Similar to Matrix multiplication using CUDA (17)

Automatically Describing Program Structure and Behavior (PhD Defense)
Automatically Describing Program Structure and Behavior (PhD Defense)Automatically Describing Program Structure and Behavior (PhD Defense)
Automatically Describing Program Structure and Behavior (PhD Defense)
Ray Buse
 
Mtech Communication Networks lab Manual.pdf
Mtech Communication Networks  lab Manual.pdfMtech Communication Networks  lab Manual.pdf
Mtech Communication Networks lab Manual.pdf
T.D. Shashikala
 
Implementation of graphs, adjaceny matrix
Implementation of graphs, adjaceny matrixImplementation of graphs, adjaceny matrix
Implementation of graphs, adjaceny matrix
VatsalSharma64
 
openFrameworks 007 - 3D
openFrameworks 007 - 3DopenFrameworks 007 - 3D
openFrameworks 007 - 3D
roxlu
 
Beginning direct3d gameprogramming06_firststepstoanimation_20161115_jintaeks
Beginning direct3d gameprogramming06_firststepstoanimation_20161115_jintaeksBeginning direct3d gameprogramming06_firststepstoanimation_20161115_jintaeks
Beginning direct3d gameprogramming06_firststepstoanimation_20161115_jintaeks
JinTaek Seo
 
Sachpazis_Consolidation Settlement Calculation Program-The Python Code and th...
Sachpazis_Consolidation Settlement Calculation Program-The Python Code and th...Sachpazis_Consolidation Settlement Calculation Program-The Python Code and th...
Sachpazis_Consolidation Settlement Calculation Program-The Python Code and th...
Dr.Costas Sachpazis
 
Triggering patterns of topology changes in dynamic attributed graphs
Triggering patterns of topology changes in dynamic attributed graphsTriggering patterns of topology changes in dynamic attributed graphs
Triggering patterns of topology changes in dynamic attributed graphs
INSA Lyon - L'Institut National des Sciences Appliquées de Lyon
 
Lab Practices and Works Documentation / Report on Computer Graphics
Lab Practices and Works Documentation / Report on Computer GraphicsLab Practices and Works Documentation / Report on Computer Graphics
Lab Practices and Works Documentation / Report on Computer Graphics
Rup Chowdhury
 
CS 354 Transformation, Clipping, and Culling
CS 354 Transformation, Clipping, and CullingCS 354 Transformation, Clipping, and Culling
CS 354 Transformation, Clipping, and Culling
Mark Kilgard
 
DWT-DCT-SVD Based Semi Blind Image Watermarking Using Middle Frequency Band
DWT-DCT-SVD Based Semi Blind Image Watermarking Using Middle Frequency BandDWT-DCT-SVD Based Semi Blind Image Watermarking Using Middle Frequency Band
DWT-DCT-SVD Based Semi Blind Image Watermarking Using Middle Frequency Band
IOSR Journals
 
10CSL67 CG LAB PROGRAM 1
10CSL67 CG LAB PROGRAM 110CSL67 CG LAB PROGRAM 1
10CSL67 CG LAB PROGRAM 1
Vanishree Arun
 
Writing a Space Shooter with HTML5 Canvas
Writing a Space Shooter with HTML5 CanvasWriting a Space Shooter with HTML5 Canvas
Writing a Space Shooter with HTML5 Canvas
Steve Purkis
 
rgDefense
rgDefensergDefense
rgDefense
Rajesh Gandham
 
oop Lecture 5
oop Lecture 5oop Lecture 5
oop Lecture 5
Anwar Ul Haq
 
Kirti Kumawat
Kirti KumawatKirti Kumawat
Kirti Kumawat
dezyneecole
 
2.docx
2.docx2.docx
2.docx
LanceFerdo1
 
Interactive High-Dimensional Visualization of Social Graphs
Interactive High-Dimensional Visualization of Social GraphsInteractive High-Dimensional Visualization of Social Graphs
Interactive High-Dimensional Visualization of Social Graphs
Tokyo Tech (Tokyo Institute of Technology)
 
Automatically Describing Program Structure and Behavior (PhD Defense)
Automatically Describing Program Structure and Behavior (PhD Defense)Automatically Describing Program Structure and Behavior (PhD Defense)
Automatically Describing Program Structure and Behavior (PhD Defense)
Ray Buse
 
Mtech Communication Networks lab Manual.pdf
Mtech Communication Networks  lab Manual.pdfMtech Communication Networks  lab Manual.pdf
Mtech Communication Networks lab Manual.pdf
T.D. Shashikala
 
Implementation of graphs, adjaceny matrix
Implementation of graphs, adjaceny matrixImplementation of graphs, adjaceny matrix
Implementation of graphs, adjaceny matrix
VatsalSharma64
 
openFrameworks 007 - 3D
openFrameworks 007 - 3DopenFrameworks 007 - 3D
openFrameworks 007 - 3D
roxlu
 
Beginning direct3d gameprogramming06_firststepstoanimation_20161115_jintaeks
Beginning direct3d gameprogramming06_firststepstoanimation_20161115_jintaeksBeginning direct3d gameprogramming06_firststepstoanimation_20161115_jintaeks
Beginning direct3d gameprogramming06_firststepstoanimation_20161115_jintaeks
JinTaek Seo
 
Sachpazis_Consolidation Settlement Calculation Program-The Python Code and th...
Sachpazis_Consolidation Settlement Calculation Program-The Python Code and th...Sachpazis_Consolidation Settlement Calculation Program-The Python Code and th...
Sachpazis_Consolidation Settlement Calculation Program-The Python Code and th...
Dr.Costas Sachpazis
 
Lab Practices and Works Documentation / Report on Computer Graphics
Lab Practices and Works Documentation / Report on Computer GraphicsLab Practices and Works Documentation / Report on Computer Graphics
Lab Practices and Works Documentation / Report on Computer Graphics
Rup Chowdhury
 
CS 354 Transformation, Clipping, and Culling
CS 354 Transformation, Clipping, and CullingCS 354 Transformation, Clipping, and Culling
CS 354 Transformation, Clipping, and Culling
Mark Kilgard
 
DWT-DCT-SVD Based Semi Blind Image Watermarking Using Middle Frequency Band
DWT-DCT-SVD Based Semi Blind Image Watermarking Using Middle Frequency BandDWT-DCT-SVD Based Semi Blind Image Watermarking Using Middle Frequency Band
DWT-DCT-SVD Based Semi Blind Image Watermarking Using Middle Frequency Band
IOSR Journals
 
10CSL67 CG LAB PROGRAM 1
10CSL67 CG LAB PROGRAM 110CSL67 CG LAB PROGRAM 1
10CSL67 CG LAB PROGRAM 1
Vanishree Arun
 
Writing a Space Shooter with HTML5 Canvas
Writing a Space Shooter with HTML5 CanvasWriting a Space Shooter with HTML5 Canvas
Writing a Space Shooter with HTML5 Canvas
Steve Purkis
 

More from Piyush Mittal (20)

Power mock
Power mockPower mock
Power mock
Piyush Mittal
 
Design pattern tutorial
Design pattern tutorialDesign pattern tutorial
Design pattern tutorial
Piyush Mittal
 
Reflection
ReflectionReflection
Reflection
Piyush Mittal
 
Gpu archi
Gpu archiGpu archi
Gpu archi
Piyush Mittal
 
Cuda Architecture
Cuda ArchitectureCuda Architecture
Cuda Architecture
Piyush Mittal
 
Intel open mp
Intel open mpIntel open mp
Intel open mp
Piyush Mittal
 
Intro to parallel computing
Intro to parallel computingIntro to parallel computing
Intro to parallel computing
Piyush Mittal
 
Cuda toolkit reference manual
Cuda toolkit reference manualCuda toolkit reference manual
Cuda toolkit reference manual
Piyush Mittal
 
Channel coding
Channel codingChannel coding
Channel coding
Piyush Mittal
 
Basics of Coding Theory
Basics of Coding TheoryBasics of Coding Theory
Basics of Coding Theory
Piyush Mittal
 
Java cheat sheet
Java cheat sheetJava cheat sheet
Java cheat sheet
Piyush Mittal
 
Google app engine cheat sheet
Google app engine cheat sheetGoogle app engine cheat sheet
Google app engine cheat sheet
Piyush Mittal
 
Git cheat sheet
Git cheat sheetGit cheat sheet
Git cheat sheet
Piyush Mittal
 
Css cheat sheet
Css cheat sheetCss cheat sheet
Css cheat sheet
Piyush Mittal
 
Ubuntu cheat sheet
Ubuntu cheat sheetUbuntu cheat sheet
Ubuntu cheat sheet
Piyush Mittal
 
Php cheat sheet
Php cheat sheetPhp cheat sheet
Php cheat sheet
Piyush Mittal
 
oracle 9i cheat sheet
oracle 9i cheat sheetoracle 9i cheat sheet
oracle 9i cheat sheet
Piyush Mittal
 
Open ssh cheet sheat
Open ssh cheet sheatOpen ssh cheet sheat
Open ssh cheet sheat
Piyush Mittal
 
Ad

Recently uploaded (20)

Sinhala_Male_Names.pdf Sinhala_Male_Name
Sinhala_Male_Names.pdf Sinhala_Male_NameSinhala_Male_Names.pdf Sinhala_Male_Name
Sinhala_Male_Names.pdf Sinhala_Male_Name
keshanf79
 
World war-1(Causes & impacts at a glance) PPT by Simanchala Sarab(BABed,sem-4...
World war-1(Causes & impacts at a glance) PPT by Simanchala Sarab(BABed,sem-4...World war-1(Causes & impacts at a glance) PPT by Simanchala Sarab(BABed,sem-4...
World war-1(Causes & impacts at a glance) PPT by Simanchala Sarab(BABed,sem-4...
larencebapu132
 
Metamorphosis: Life's Transformative Journey
Metamorphosis: Life's Transformative JourneyMetamorphosis: Life's Transformative Journey
Metamorphosis: Life's Transformative Journey
Arshad Shaikh
 
Odoo Inventory Rules and Routes v17 - Odoo Slides
Odoo Inventory Rules and Routes v17 - Odoo SlidesOdoo Inventory Rules and Routes v17 - Odoo Slides
Odoo Inventory Rules and Routes v17 - Odoo Slides
Celine George
 
To study the nervous system of insect.pptx
To study the nervous system of insect.pptxTo study the nervous system of insect.pptx
To study the nervous system of insect.pptx
Arshad Shaikh
 
apa-style-referencing-visual-guide-2025.pdf
apa-style-referencing-visual-guide-2025.pdfapa-style-referencing-visual-guide-2025.pdf
apa-style-referencing-visual-guide-2025.pdf
Ishika Ghosh
 
How to Subscribe Newsletter From Odoo 18 Website
How to Subscribe Newsletter From Odoo 18 WebsiteHow to Subscribe Newsletter From Odoo 18 Website
How to Subscribe Newsletter From Odoo 18 Website
Celine George
 
SCI BIZ TECH QUIZ (OPEN) PRELIMS XTASY 2025.pptx
SCI BIZ TECH QUIZ (OPEN) PRELIMS XTASY 2025.pptxSCI BIZ TECH QUIZ (OPEN) PRELIMS XTASY 2025.pptx
SCI BIZ TECH QUIZ (OPEN) PRELIMS XTASY 2025.pptx
Ronisha Das
 
YSPH VMOC Special Report - Measles Outbreak Southwest US 5-3-2025.pptx
YSPH VMOC Special Report - Measles Outbreak  Southwest US 5-3-2025.pptxYSPH VMOC Special Report - Measles Outbreak  Southwest US 5-3-2025.pptx
YSPH VMOC Special Report - Measles Outbreak Southwest US 5-3-2025.pptx
Yale School of Public Health - The Virtual Medical Operations Center (VMOC)
 
Marie Boran Special Collections Librarian Hardiman Library, University of Gal...
Marie Boran Special Collections Librarian Hardiman Library, University of Gal...Marie Boran Special Collections Librarian Hardiman Library, University of Gal...
Marie Boran Special Collections Librarian Hardiman Library, University of Gal...
Library Association of Ireland
 
Quality Contril Analysis of Containers.pdf
Quality Contril Analysis of Containers.pdfQuality Contril Analysis of Containers.pdf
Quality Contril Analysis of Containers.pdf
Dr. Bindiya Chauhan
 
Stein, Hunt, Green letter to Congress April 2025
Stein, Hunt, Green letter to Congress April 2025Stein, Hunt, Green letter to Congress April 2025
Stein, Hunt, Green letter to Congress April 2025
Mebane Rash
 
Presentation on Tourism Product Development By Md Shaifullar Rabbi
Presentation on Tourism Product Development By Md Shaifullar RabbiPresentation on Tourism Product Development By Md Shaifullar Rabbi
Presentation on Tourism Product Development By Md Shaifullar Rabbi
Md Shaifullar Rabbi
 
Phoenix – A Collaborative Renewal of Children’s and Young People’s Services C...
Phoenix – A Collaborative Renewal of Children’s and Young People’s Services C...Phoenix – A Collaborative Renewal of Children’s and Young People’s Services C...
Phoenix – A Collaborative Renewal of Children’s and Young People’s Services C...
Library Association of Ireland
 
How to track Cost and Revenue using Analytic Accounts in odoo Accounting, App...
How to track Cost and Revenue using Analytic Accounts in odoo Accounting, App...How to track Cost and Revenue using Analytic Accounts in odoo Accounting, App...
How to track Cost and Revenue using Analytic Accounts in odoo Accounting, App...
Celine George
 
Niamh Lucey, Mary Dunne. Health Sciences Libraries Group (LAI). Lighting the ...
Niamh Lucey, Mary Dunne. Health Sciences Libraries Group (LAI). Lighting the ...Niamh Lucey, Mary Dunne. Health Sciences Libraries Group (LAI). Lighting the ...
Niamh Lucey, Mary Dunne. Health Sciences Libraries Group (LAI). Lighting the ...
Library Association of Ireland
 
pulse ppt.pptx Types of pulse , characteristics of pulse , Alteration of pulse
pulse  ppt.pptx Types of pulse , characteristics of pulse , Alteration of pulsepulse  ppt.pptx Types of pulse , characteristics of pulse , Alteration of pulse
pulse ppt.pptx Types of pulse , characteristics of pulse , Alteration of pulse
sushreesangita003
 
Presentation of the MIPLM subject matter expert Erdem Kaya
Presentation of the MIPLM subject matter expert Erdem KayaPresentation of the MIPLM subject matter expert Erdem Kaya
Presentation of the MIPLM subject matter expert Erdem Kaya
MIPLM
 
LDMMIA Reiki Master Spring 2025 Mini Updates
LDMMIA Reiki Master Spring 2025 Mini UpdatesLDMMIA Reiki Master Spring 2025 Mini Updates
LDMMIA Reiki Master Spring 2025 Mini Updates
LDM Mia eStudios
 
P-glycoprotein pamphlet: iteration 4 of 4 final
P-glycoprotein pamphlet: iteration 4 of 4 finalP-glycoprotein pamphlet: iteration 4 of 4 final
P-glycoprotein pamphlet: iteration 4 of 4 final
bs22n2s
 
Sinhala_Male_Names.pdf Sinhala_Male_Name
Sinhala_Male_Names.pdf Sinhala_Male_NameSinhala_Male_Names.pdf Sinhala_Male_Name
Sinhala_Male_Names.pdf Sinhala_Male_Name
keshanf79
 
World war-1(Causes & impacts at a glance) PPT by Simanchala Sarab(BABed,sem-4...
World war-1(Causes & impacts at a glance) PPT by Simanchala Sarab(BABed,sem-4...World war-1(Causes & impacts at a glance) PPT by Simanchala Sarab(BABed,sem-4...
World war-1(Causes & impacts at a glance) PPT by Simanchala Sarab(BABed,sem-4...
larencebapu132
 
Metamorphosis: Life's Transformative Journey
Metamorphosis: Life's Transformative JourneyMetamorphosis: Life's Transformative Journey
Metamorphosis: Life's Transformative Journey
Arshad Shaikh
 
Odoo Inventory Rules and Routes v17 - Odoo Slides
Odoo Inventory Rules and Routes v17 - Odoo SlidesOdoo Inventory Rules and Routes v17 - Odoo Slides
Odoo Inventory Rules and Routes v17 - Odoo Slides
Celine George
 
To study the nervous system of insect.pptx
To study the nervous system of insect.pptxTo study the nervous system of insect.pptx
To study the nervous system of insect.pptx
Arshad Shaikh
 
apa-style-referencing-visual-guide-2025.pdf
apa-style-referencing-visual-guide-2025.pdfapa-style-referencing-visual-guide-2025.pdf
apa-style-referencing-visual-guide-2025.pdf
Ishika Ghosh
 
How to Subscribe Newsletter From Odoo 18 Website
How to Subscribe Newsletter From Odoo 18 WebsiteHow to Subscribe Newsletter From Odoo 18 Website
How to Subscribe Newsletter From Odoo 18 Website
Celine George
 
SCI BIZ TECH QUIZ (OPEN) PRELIMS XTASY 2025.pptx
SCI BIZ TECH QUIZ (OPEN) PRELIMS XTASY 2025.pptxSCI BIZ TECH QUIZ (OPEN) PRELIMS XTASY 2025.pptx
SCI BIZ TECH QUIZ (OPEN) PRELIMS XTASY 2025.pptx
Ronisha Das
 
Marie Boran Special Collections Librarian Hardiman Library, University of Gal...
Marie Boran Special Collections Librarian Hardiman Library, University of Gal...Marie Boran Special Collections Librarian Hardiman Library, University of Gal...
Marie Boran Special Collections Librarian Hardiman Library, University of Gal...
Library Association of Ireland
 
Quality Contril Analysis of Containers.pdf
Quality Contril Analysis of Containers.pdfQuality Contril Analysis of Containers.pdf
Quality Contril Analysis of Containers.pdf
Dr. Bindiya Chauhan
 
Stein, Hunt, Green letter to Congress April 2025
Stein, Hunt, Green letter to Congress April 2025Stein, Hunt, Green letter to Congress April 2025
Stein, Hunt, Green letter to Congress April 2025
Mebane Rash
 
Presentation on Tourism Product Development By Md Shaifullar Rabbi
Presentation on Tourism Product Development By Md Shaifullar RabbiPresentation on Tourism Product Development By Md Shaifullar Rabbi
Presentation on Tourism Product Development By Md Shaifullar Rabbi
Md Shaifullar Rabbi
 
Phoenix – A Collaborative Renewal of Children’s and Young People’s Services C...
Phoenix – A Collaborative Renewal of Children’s and Young People’s Services C...Phoenix – A Collaborative Renewal of Children’s and Young People’s Services C...
Phoenix – A Collaborative Renewal of Children’s and Young People’s Services C...
Library Association of Ireland
 
How to track Cost and Revenue using Analytic Accounts in odoo Accounting, App...
How to track Cost and Revenue using Analytic Accounts in odoo Accounting, App...How to track Cost and Revenue using Analytic Accounts in odoo Accounting, App...
How to track Cost and Revenue using Analytic Accounts in odoo Accounting, App...
Celine George
 
Niamh Lucey, Mary Dunne. Health Sciences Libraries Group (LAI). Lighting the ...
Niamh Lucey, Mary Dunne. Health Sciences Libraries Group (LAI). Lighting the ...Niamh Lucey, Mary Dunne. Health Sciences Libraries Group (LAI). Lighting the ...
Niamh Lucey, Mary Dunne. Health Sciences Libraries Group (LAI). Lighting the ...
Library Association of Ireland
 
pulse ppt.pptx Types of pulse , characteristics of pulse , Alteration of pulse
pulse  ppt.pptx Types of pulse , characteristics of pulse , Alteration of pulsepulse  ppt.pptx Types of pulse , characteristics of pulse , Alteration of pulse
pulse ppt.pptx Types of pulse , characteristics of pulse , Alteration of pulse
sushreesangita003
 
Presentation of the MIPLM subject matter expert Erdem Kaya
Presentation of the MIPLM subject matter expert Erdem KayaPresentation of the MIPLM subject matter expert Erdem Kaya
Presentation of the MIPLM subject matter expert Erdem Kaya
MIPLM
 
LDMMIA Reiki Master Spring 2025 Mini Updates
LDMMIA Reiki Master Spring 2025 Mini UpdatesLDMMIA Reiki Master Spring 2025 Mini Updates
LDMMIA Reiki Master Spring 2025 Mini Updates
LDM Mia eStudios
 
P-glycoprotein pamphlet: iteration 4 of 4 final
P-glycoprotein pamphlet: iteration 4 of 4 finalP-glycoprotein pamphlet: iteration 4 of 4 final
P-glycoprotein pamphlet: iteration 4 of 4 final
bs22n2s
 
Ad

Matrix multiplication using CUDA

  • 1. Basic Example: Matrix Multiplication using CUDA General-purpose Programming of Massively Parallel Graphics Processors Shiraz University, Spring 2010 Instructor: Reza Azimi Some materials/slides are adapted from: Andreas Moshovos’ Course at the University of Toronto UIUC course by Wen-Mei Hwu and David Kirk   ( 6 07 4 7 6 5 4 32 1 0) 0 A @ 9 8 B A void MatrixMulOnHost( float* M, float* N, float* P, int Width) { for (int i = 0; i < Width; ++i) N for (int j = 0; j < Width; ++j) { float sum = 0; k for (int k = 0; k < Width; ++k) { WIDTH float a = M[i * Width + k]; j float b = N[k * Width + j]; sum += a * b; } P[i * Width + j] = sum; } } M P i WIDTH k Adapted From: © ¤ © § ¤© ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡ £ WIDTH ©¨© § % $ ! ¦ ©¤ # ! ! ¤ © WIDTH ' 2 David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC 1
  • 2. 60 IH 34 4 G F ED A 8P A A@ __global__ void MatrixMulKernel(float* d_M, d_N float* d_N, float* d_P, k int Width) { WIDTH int row = threadIdx.y; int col = threadIdx.x; col float P_val = 0; (threadIdx.x) for (int k = 0; k < Width; ++k) { float M_elem = d_M[row * Width + k]; float N_elem = d_N[k * Width + col]; P_val += M_elem * N_elem; } d_M d_P d_P[row*Width+col] = P_val; row } (threadIdx.y) WIDTH k © ¤ © § ¤© ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡ £ WIDTH ©¨© § % $ ! ¦ ©¤ # ! ! ¤ © WIDTH C 3 Adapted From: David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC 4W 7 60 6 33 ( I2 R @ AT @ V U AT A 8TP T8T S 8 A void MatrixMulOnDevice(float* M, float* N, float* P, int Width) { int matrix_size = Width * Width * sizeof(float); float *d_M, *d_N, *d_P; // Allocate and Load M and N to device memory cudaMalloc((void**)&d_M, matrix_size); cudaMemcpy(d_M, M, matrix_size, cudaMemcpyHostToDevice); cudaMalloc((void**)&d_N, matrix_size); cudaMemcpy(d_N, N, matrix_size, cudaMemcpyHostToDevice); // Allocate P on the device cudaMalloc((void**)&d_P, matrix_size); © ¤ © § ¤© ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡ Adapted From: £ ©¨© § % $ ! ¦ ©¤ # ! ! ¤ © Q 4 David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC 2
  • 3. 26 60 6 R 34 4G 3I7 4 7 a ` B U AT A 8TP YA A@ 8 // Setup the execution configuration dim3 dimGrid(1, 1); dim3 dimBlock(Width, Width); // Launch the device computation threads! MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, Width); // Copy back the results from device to host cudaMemcpy(P, d_P, matrix_size, cudaMemcpyDeviceToHost); // Free up the device memory matrices cudaFree(d_P); cudaFree(d_M); cudaFree(d_N); © David Kirk/NVIDIA and Wen-mei W. Hwu, 2007-2009 © ¤ © § ¤© ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡ £ ©¨© § % $ ! ¦ ©¤ # ! ! ¤ © X 5 ECE 498AL Spring 2010, University of Illinois, Urbana-Champaign Only One Thread Block Used c One Block of threads compute Grid 1 d_N matrix d_P Block 1 2 4 d Each thread e Loads a row of matrix d_M Thread (2, 2) 2 e Loads a column of matrix d_N 6 e Perform one multiply and addition for each pair of d_M and d_N elements e Computes one element of d_P 3 2 5 4 48 Size of matrix limited by the number of threads allowed in a thread block WIDTH d_P d_M Adapted From: © ¤ © § ¤© ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡ £ David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC ©¨© § % $ ! ¦ ©¤ # ! ! ¤ © b 6 3
  • 4. 6 0 I 36) 6r 4 6 4 p p i 4 0D A 8 s@ @ q UT @ V PT Y hg threadIdx.x TILE_WIDTH d_P TILE_ threadIdx.y 7 Each thread is assigned to a Tile of TILE_WIDTHxTILE_WIDTH entries © ¤ © § ¤© ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡ £ ©¨© § % $ ! ¦ ©¤ # ! ! ¤ © f Solution 1: Give Each Thread More Work __global__ void MatrixMulKernel(float* d_M, float* d_N, float* d_P, int Width) { int start_row = threadIdx.y * TILE_WIDTH; int end_row = start_row + TILE_WIDTH; int start_col = threadIdx.x * TILE_WIDTH; int end_col = start_col + TILE_WIDTH; for (int row = start_row; row < end_row; row++) { for(int col = start_col; col < end_col; col++) { float P_val = 0; for (int k = 0; k < Width; ++k) { float M_elem = d_M[row * Width + k]; float N_elem = d_N[k * Width + col]; P_val += M_elem * N_elem; } With one block we utilize d_P[row*Width+col] = P_val; only one multiprocessor! } © ¤ © § ¤© ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡ £ ©¨© § % $ ! ¦ ©¤ # ! ! ¤ © t } } 4
  • 5. 63 4 p 4 32 0 3I 47 F 6 0 I 36) 7 w UT @ V 8 q hv A 8 sP threadIdx.x blockIdx.x blockDim.x d_P blockDim.y blockIdx.y 9 assigned to a threadIdx.y thread assigned to a thread block © ¤ © § ¤© ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡ £ ©¨© § % $ ! ¦ ©¤ # ! ! ¤ © u Solution 2: Use Multiple Thread Blocks __global__ void MatrixMulKernel(float* d_M, float* d_N, float* d_P, int Width) { int row = blockIdx.y * blockDim.y + threadIdx.y; int col = blockIdx.x * blockDim.x + threadIdx.x; float P_val = 0; for (int k = 0; k < Width; ++k) { float M_elem = d_M[row * Width + k]; float N_elem = d_N[k * Width + col]; P_val += M_elem * N_elem; } d_P[row*Width+col] = P_val; } © ¤ © § ¤© ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡ £ ©¨© § % $ ! ¦ ©¤ # ! ! ¤ © x  5
  • 6. 26 60 6 R 34 4G 3I7 4 7 a ` B U AT A 8TP YA A@ 8 int block_size = 64; // Setup the execution configuration dim3 dimGrid(Width/block_size, Width/block_size); dim3 dimBlock(block_size, block_size); // Launch the device computation threads! MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, Width); … Size of matrix limited by the number of threads allowed on a device © ¤ © § ¤© ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡ £ ©¨© § % $ ! ¦ ©¤ # ! ! ¤ ©    11 60 01 0 4 p  D 7 A 8T8 ‚ UT @ V € v yV ƒ Max Number of Threads per Block: 512 ƒ Max Number of Blocks per Streaming Multiprocessor: 8 ƒ Number of Streaming Multiprocessors: 30 ƒ Total Number of Threads Available = 30 x 8 x 512 = 122880 Let me double-check this! © ¤ © § ¤© ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡ £ ©¨© § % $ ! ¦ ©¤ # ! ! ¤ © '  6
  • 7. 6 0 I 36) 6 4p 0 0„ 16 7 A 8 †V 8 …A A B threadIdx.x blockIdx.x blockDim.x d_P blockDim.y blockIdx.y TILE_WIDTH 13 threadIdx.y TILE_WIDT © ¤ © § ¤© ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡ £ ©¨© § % $ ! ¦ ©¤ # ! ! ¤ © C  6 0 I 36) 6 4p 0 0„ 16 7 A 8 †V 8 …A A B __global__ void MatrixMulKernel(float* d_M, float* d_N, float* d_P, int Width) { int start_row = blockDim.y * blockIdx.y + threadIdx.y * TILE_WIDTH; int end_row = start_row + TILE_WIDTH; int start_col = blockDim.x * blockIdx.x + threadIdx.x * TILE_WIDTH; int end_col = start_col + TILE_WIDTH; for (int row = start_row; row < end_row; row++) { for(int col = start_col; col < end_col; col++) { float P_val = 0; for (int k = 0; k < Width; ++k) { float M_elem = d_M[row * Width + k]; float N_elem = d_N[k * Width + col]; P_val += M_elem * N_elem; } d_P[row*Width+col] = P_val; } } } © ¤ © § ¤© ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡ £ ©¨© § % $ ! ¦ ©¤ # ! ! ¤ © Q  7