| //===---- Reduction.cpp - OpenMP device reduction implementation - C++ -*-===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file contains the implementation of reduction with KMPC interface. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "Debug.h" |
| #include "Interface.h" |
| #include "Mapping.h" |
| #include "State.h" |
| #include "Synchronization.h" |
| #include "Types.h" |
| #include "Utils.h" |
| |
| using namespace _OMP; |
| |
| namespace { |
| |
| #pragma omp begin declare target device_type(nohost) |
| |
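| // Reduce the value in reduce_data across all lanes of a full warp. The |
| // compiler-generated shuffle-and-reduce function is invoked with halving |
| // offsets (warp size / 2, ..., 1), so after log2(warp size) steps the |
| // warp's reduced value ends up in its first lane. |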
| void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) { |
| for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) { |
| shflFct(reduce_data, /*LaneId (unused)=*/0, |
| /*Offset=*/mask, /*AlgoVersion=*/0); |
| } |
| } |
| |
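| // Reduce across a partial warp in which only `size` contiguous lanes hold |
| // live values and `tid` is the calling lane's index within that group. The |
| // live count is rounded up when halved so non-power-of-two sizes are |
| // handled; the reduced value ends up in the group's first lane. |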
| void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct, |
| uint32_t size, uint32_t tid) { |
| uint32_t curr_size; |
| uint32_t mask; |
| curr_size = size; |
| mask = curr_size / 2; |
| while (mask > 0) { |
| shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1); |
| curr_size = (curr_size + 1) / 2; |
| mask = curr_size / 2; |
| } |
| } |
| |
| #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 |
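| // Reduce across dispersed (non-contiguous) active lanes of a warp, as used |
| // by L2 parallel regions on pre-Volta devices. Each live lane repeatedly |
| // combines its value with the next higher live lane; the function returns 1 |
| // in the lane that ends up holding the reduced value. |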
| static uint32_t gpu_irregular_simd_reduce(void *reduce_data, |
| ShuffleReductFnTy shflFct) { |
| uint32_t size, remote_id, physical_lane_id; |
| physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize(); |
| __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT(); |
| __kmpc_impl_lanemask_t Liveness = mapping::activemask(); |
| uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2; |
| __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT(); |
| do { |
| Liveness = mapping::activemask(); |
| remote_id = utils::ffs(Liveness & lanemask_gt); |
| size = utils::popc(Liveness); |
| logical_lane_id /= 2; |
| shflFct(reduce_data, /*LaneId =*/logical_lane_id, |
| /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2); |
| } while (logical_lane_id % 2 == 0 && size > 1); |
| return (logical_lane_id == 0); |
| } |
| #endif |
| |
| static int32_t nvptx_parallel_reduce_nowait(int32_t TId, int32_t num_vars, |
| uint64_t reduce_size, |
| void *reduce_data, |
| ShuffleReductFnTy shflFct, |
| InterWarpCopyFnTy cpyFct, |
| bool isSPMDExecutionMode, bool) { |
| uint32_t BlockThreadId = mapping::getThreadIdInBlock(); |
| if (mapping::isMainThreadInGenericMode(/* IsSPMD */ false)) |
| BlockThreadId = 0; |
| uint32_t NumThreads = omp_get_num_threads(); |
| if (NumThreads == 1) |
| return 1; |
| /* |
| * This reduce function handles reduction within a team. It handles |
| * parallel regions in both L1 and L2 parallelism levels. It also |
| * supports Generic, SPMD, and NoOMP modes. |
| * |
| * 1. Reduce within a warp. |
| * 2. Warp master copies value to warp 0 via shared memory. |
| * 3. Warp 0 reduces to a single value. |
| * 4. The reduced value is available in the thread that returns 1. |
| */ |
| |
| #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 |
| uint32_t WarpsNeeded = |
| (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize(); |
| uint32_t WarpId = mapping::getWarpId(); |
| |
| // Volta execution model: |
| // In the Generic execution mode a parallel region has either 1 thread or |
| // a multiple of the warp size (32) threads. In the SPMD execution mode it |
| // may have any number of threads. |
| if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1)) |
| gpu_regular_warp_reduce(reduce_data, shflFct); |
| else if (NumThreads > 1) // Only the SPMD execution mode reaches this case. |
| gpu_irregular_warp_reduce(reduce_data, shflFct, |
| /*LaneCount=*/NumThreads % mapping::getWarpSize(), |
| /*LaneId=*/mapping::getThreadIdInBlock() % |
| mapping::getWarpSize()); |
| |
| // When there are more than [mapping::getWarpSize()] threads, a block |
| // reduction is performed here. |
| // |
| // Only the L1 parallel region can enter this condition. |
| if (NumThreads > mapping::getWarpSize()) { |
| // Gather all the reduced values from each warp |
| // to the first warp. |
| cpyFct(reduce_data, WarpsNeeded); |
| |
| if (WarpId == 0) |
| gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, |
| BlockThreadId); |
| } |
| return BlockThreadId == 0; |
| #else |
| __kmpc_impl_lanemask_t Liveness = mapping::activemask(); |
| if (Liveness == lanes::All) // Full warp |
| gpu_regular_warp_reduce(reduce_data, shflFct); |
| else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes |
| gpu_irregular_warp_reduce(reduce_data, shflFct, |
| /*LaneCount=*/utils::popc(Liveness), |
| /*LaneId=*/mapping::getThreadIdInBlock() % |
| mapping::getWarpSize()); |
| else { // Dispersed lanes. Only threads in an L2 |
| // parallel region may enter here; return |
| // early. |
| return gpu_irregular_simd_reduce(reduce_data, shflFct); |
| } |
| |
| // When there are more than [mapping::getWarpSize()] threads, a block |
| // reduction is performed here. |
| // |
| // Only the L1 parallel region can enter this condition. |
| if (NumThreads > mapping::getWarpSize()) { |
| uint32_t WarpsNeeded = |
| (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize(); |
| // Gather all the reduced values from each warp |
| // to the first warp. |
| cpyFct(reduce_data, WarpsNeeded); |
| |
| uint32_t WarpId = BlockThreadId / mapping::getWarpSize(); |
| if (WarpId == 0) |
| gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, |
| BlockThreadId); |
| |
| return BlockThreadId == 0; |
| } |
| |
| // Get the OMP thread Id. This is different from BlockThreadId in the case of |
| // an L2 parallel region. |
| return TId == 0; |
| #endif // __CUDA_ARCH__ >= 700 |
| } |
| |
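| // Round s down to the nearest multiple of the warp size; values smaller |
| // than a warp yield 1 so at least one thread participates. |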
| uint32_t roundToWarpsize(uint32_t s) { |
| if (s < mapping::getWarpSize()) |
| return 1; |
| return (s & ~(unsigned)(mapping::getWarpSize() - 1)); |
| } |
| |
| uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; } |
| |
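| // Counters coordinating the teams reduction below: IterCnt marks the start |
| // of the window of teams currently allowed to write into the global buffer, |
| // and Cnt counts how many teams of the current chunk have already |
| // contributed their partial result. |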
| static volatile uint32_t IterCnt = 0; |
| static volatile uint32_t Cnt = 0; |
| |
| } // namespace |
| |
| extern "C" { |
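| // Entry point for the compiler-generated parallel reduction. Returns 1 in |
| // the single thread that holds the reduced value so the generated code can |
| // store it into the original reduction variable. |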
| int32_t __kmpc_nvptx_parallel_reduce_nowait_v2( |
| IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size, |
| void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct) { |
| FunctionTracingRAII(); |
| return nvptx_parallel_reduce_nowait(TId, num_vars, reduce_size, reduce_data, |
| shflFct, cpyFct, mapping::isSPMDMode(), |
| false); |
| } |
| |
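| // Entry point for the compiler-generated teams reduction. Team masters |
| // funnel their per-team partial results through the fixed-size GlobalBuffer: |
| // at most num_of_records teams write at a time, and the last team to |
| // contribute reduces the buffered values and returns 1 in a single thread |
| // so the final result can be written back. |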
| int32_t __kmpc_nvptx_teams_reduce_nowait_v2( |
| IdentTy *Loc, int32_t TId, void *GlobalBuffer, uint32_t num_of_records, |
| void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct, |
| ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct, |
| ListGlobalFnTy glredFct) { |
| FunctionTracingRAII(); |
| |
| // In generic (non-SPMD) mode only the main thread of the team takes part in |
| // the teams reduction; all other threads return immediately. |
| uint32_t ThreadId = mapping::getThreadIdInBlock(); |
| if (mapping::isGenericMode()) { |
| if (!mapping::isMainThreadInGenericMode()) |
| return 0; |
| ThreadId = 0; |
| } |
| |
| // In non-generic mode all workers participate in the teams reduction. |
| // In generic mode only the team master participates in the teams |
| // reduction because the workers are waiting for parallel work. |
| uint32_t NumThreads = omp_get_num_threads(); |
| uint32_t TeamId = omp_get_team_num(); |
| uint32_t NumTeams = omp_get_num_teams(); |
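| // Team-shared state written by the team master: Bound caches the IterCnt |
| // value observed while waiting for a buffer slot, and ChunkTeamCount records |
| // this team's position within the current chunk of num_of_records teams. |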
| static unsigned SHARED(Bound); |
| static unsigned SHARED(ChunkTeamCount); |
| |
| // Block progress for teams whose ID is beyond the current upper limit. At |
| // any time we only admit as many teams as there are slots (num_of_records) |
| // in the buffer. |
| bool IsMaster = (ThreadId == 0); |
| while (IsMaster) { |
| Bound = atomic::load((uint32_t *)&IterCnt, __ATOMIC_SEQ_CST); |
| if (TeamId < Bound + num_of_records) |
| break; |
| } |
| |
| if (IsMaster) { |
| int ModBlockId = TeamId % num_of_records; |
| if (TeamId < num_of_records) { |
| lgcpyFct(GlobalBuffer, ModBlockId, reduce_data); |
| } else |
| lgredFct(GlobalBuffer, ModBlockId, reduce_data); |
| |
| fence::system(__ATOMIC_SEQ_CST); |
| |
| // Increment team counter. |
| // This counter is incremented by all teams in the current |
| // chunk of num_of_records teams. |
| ChunkTeamCount = |
| atomic::inc((uint32_t *)&Cnt, num_of_records - 1u, __ATOMIC_SEQ_CST); |
| } |
| // Synchronize in SPMD mode so all threads of the team observe the master's |
| // updates to the global buffer and counters. |
| if (mapping::isSPMDMode()) |
| __kmpc_barrier(Loc, TId); |
| |
| // reduce_data is global or shared so before being reduced within the |
| // warp we need to bring it into local memory: |
| // local_reduce_data = reduce_data[i] |
| // |
| // Example for 3 reduction variables a, b, c (of potentially different |
| // types): |
| // |
| // buffer layout (struct of arrays): |
| // a, a, ..., a, b, b, ... b, c, c, ... c |
| // |__________| |
| // num_of_records |
| // |
| // local_reduce_data layout (struct): |
| // a, b, c |
| // |
| // Each thread will have a local struct containing the values to be |
| // reduced: |
| // 1. do reduction within each warp. |
| // 2. do reduction across warps. |
| // 3. write the final result to the main reduction variable |
| // by returning 1 in the thread holding the reduction result. |
| |
| // Check if this is the very last team to contribute to the global buffer. |
| unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records)); |
| if (ChunkTeamCount == NumTeams - Bound - 1) { |
| // |
| // Last team processing. |
| // |
| if (ThreadId >= NumRecs) |
| return 0; |
| NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs)); |
| if (ThreadId >= NumThreads) |
| return 0; |
| |
| // Load from buffer and reduce. |
| glcpyFct(GlobalBuffer, ThreadId, reduce_data); |
| for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads) |
| glredFct(GlobalBuffer, i, reduce_data); |
| |
| // Reduce across warps to the warp master. |
| if (NumThreads > 1) { |
| gpu_regular_warp_reduce(reduce_data, shflFct); |
| |
| // When there are more than [mapping::getWarpSize()] threads, a block |
| // reduction is performed here. |
| uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads); |
| if (ActiveThreads > mapping::getWarpSize()) { |
| uint32_t WarpsNeeded = (ActiveThreads + mapping::getWarpSize() - 1) / |
| mapping::getWarpSize(); |
| // Gather all the reduced values from each warp |
| // to the first warp. |
| cpyFct(reduce_data, WarpsNeeded); |
| |
| uint32_t WarpId = ThreadId / mapping::getWarpSize(); |
| if (WarpId == 0) |
| gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, |
| ThreadId); |
| } |
| } |
| |
| if (IsMaster) { |
| Cnt = 0; |
| IterCnt = 0; |
| return 1; |
| } |
| return 0; |
| } |
| if (IsMaster && ChunkTeamCount == num_of_records - 1) { |
| // Allow the next num_of_records teams to proceed with writing their |
| // intermediate results to the global buffer. |
| atomic::add((uint32_t *)&IterCnt, uint32_t(num_of_records), |
| __ATOMIC_SEQ_CST); |
| } |
| |
| return 0; |
| } |
| |
| void __kmpc_nvptx_end_reduce(int32_t TId) { FunctionTracingRAII(); } |
| |
| void __kmpc_nvptx_end_reduce_nowait(int32_t TId) { FunctionTracingRAII(); } |
| } |
| |
| #pragma omp end declare target |