CUDA:实现矩阵转置
以下是一个使用 CUDA 实现矩阵转置的示例代码。核函数借助共享内存分块(tile,边长为 TILE_DIM)来完成转置:
#include <iostream>
#include <cuda_runtime_api.h>
// Side length of the square tile staged in shared memory by the transpose
// kernel; block indices are scaled by this value to obtain matrix coordinates.
#define TILE_DIM 32
// NOTE(review): not referenced in the visible part of this file. Presumably
// the number of thread rows per block (blockDim.y) for a launch in which each
// thread handles several rows of a tile; confirm against the launch code.
#define BLOCK_ROWS 8
// CUDA核函数:矩阵转置
__global__ void transpose(float* in, float* out, int width, int height) {
__shared__ float tile[TILE_DIM][TILE_DIM + 1];
int blockIdx_x, blockIdx_y;
int tx, ty;
int row, col;
// 计算输入矩阵的索引
blockIdx_x = blockIdx.x;
blockIdx_y = blockIdx.y;
tx = threadIdx.x;
ty = threadIdx.y;
row = blockIdx_y * TILE_DIM + ty;
col = blockIdx_x * TILE_DIM + tx;
// 读取输入矩阵的数据到共享内存中
if (row < height && col < width) {
tile[ty][tx] = in[row * width + col];
}
__syncthreads();
// 计算输出矩阵的索引
int newRow = blockIdx_x * TILE_DIM + ty;
int newCol =