GPU为3060 12G, CPU 为 E5-2696V3 双路共36核72线程,Julia配置使用32线程
JULIA_NUM_THREADS=32
测试代码如下
Using CUDA
using BenchmarkTools
function testgpuAll(pic)
idx =(blockIdx().x - 1) * blockDim().x + threadIdx().x
idy =(blockIdx().y - 1) * blockDim().y + threadIdx().y
idz =(blockIdx().z - 1) * blockDim().z + threadIdx().z
gaptdX= gridDim().x * blockDim().x
gaptdY= gridDim().x * blockDim().x
gaptdZ= gridDim().z * blockDim().z
bbc=123
for
i in idx:gaptdX:10000,
j in idy:gaptdY:10000,
k in idz:gaptdZ:10000
tmp=i*j*k%(i+j)
bbc+=pic[i,j%1000+1,k%1000+1]
bbc%=10240
bbc*=tmp-k+j
end
return
end
function testgpuSgl(pic)
bbc=123
for
i in 1:100,
j in 1:100,
k in 1:100
tmp=i*j*k%(i+j)
bbc+=pic[i,j,k]
bbc%=10240
bbc*=tmp-k+j
end
return
end
function main()
# tdShape=(12,9,9)
tdShape=(20,6,6)
blkShape=(10,10,10)
smSize=10240
pic=CuArray{Int8}(undef,(10000,1000,1000))
pic.+=6
picCpu=Array(pic)
bbc=123
println("Test on multiple GPU with cube 10000")
@CUDA.time @cuda threads=tdShape blocks=blkShape shmem=smSize testgpuAll(pic)
println("Test on Single GPU with cube 100")
@CUDA.time @cuda threads=1 blocks=1 shmem=smSize testgpuSgl(pic)
println("Test on multiple CPU with cube 1000")
@time begin
Threads.@threads for i in 1:1000
Threads.@threads for j in 1:1000
Threads.@threads for k in 1:1000
tmp=i*j*k%(i+j)
bbc+=picCpu[i,j,k]*tmp
bbc%=10240
end
end
end
end
println("Test on Single CPU with cube 100")
@time for i in 1:100,j in 1:100, k in 1:100
tmp=i*j*k%(i+j)
bbc+=picCpu[i,j,k]*tmp
bbc%=10240
end
end
main()
测试结果如下:
Test on multiple GPU with cube 10000
3.544172 seconds (139.13 k CPU allocations: 7.233 MiB)
Test on Single GPU with cube 100
0.156663 seconds (48.15 k CPU allocations: 3.916 MiB)
Test on multiple CPU with cube 1000
86.705080 seconds (290.16 M allocations: 4.385 GiB, 32.44% gc time, 0.00% compilation time)
Test on Single CPU with cube 100
0.128581 seconds (2.43 M allocations: 37.010 MiB)