CUDA：使用 cuBLAS 的批量（batched）API 提高程序性能
以下示例代码使用 CUDA 实现矩阵乘法，并借助 cuBLAS 库提高程序性能：
#include <iostream>
#include <cuda_runtime_api.h>
#include <cublas_v2.h>
// Prints a rows x cols matrix held in a contiguous row-major buffer to
// std::cout. Every element is followed by one space, and every row is
// terminated with std::endl (newline plus flush).
//
// matrix: first element of the buffer; read only, never written.
// rows:   number of rows to print; nothing is printed when rows <= 0.
// cols:   number of elements per row; when cols <= 0 each row is just
//         the line terminator.
void printMatrix(float* matrix, int rows, int cols) {
    // A single read position walks the buffer in storage order, which
    // visits exactly the same elements as indexing with row * cols + col.
    const float* cursor = matrix;
    for (int row = 0; row < rows; ++row) {
        for (int col = 0; col < cols; ++col) {
            std::cout << *cursor << " ";
            ++cursor;
        }
        std::cout << std::endl;
    }
}
int main() {
const int m = 1024;
const int n = 1024;
const int k = 1024;
const int size_A = m * k;
const int size_B = k * n;
const int size_C = m * n;
float* h_A = new float[size_A];
float* h_B = new float[size_B];
float* h_C = new float[size_C];
// Initialize input matrices
for (int i = 0; i < size_A; ++i) {
h