[Libomptarget] Allow the device runtime to be compiled for the host

Currently the OpenMP offloading device runtime is only expected to be
compiled for the specific architecture it's targeting. This is
problematic if we want to make compiling the device runtime more general
via the standard `clang` driver rather than invoking the clang front-end
directly. This patch addresses this primarily by changing the declare
target device type to `nohost` so the host will not contain any of this code.
Additionally we forward declare the functions that are defined via
variants, otherwise these would cause problems on the host.

Reviewed By: jdoerfert, tianshilei1992

Differential Revision: https://reviews.llvm.org/D125260
diff --git a/openmp/libomptarget/DeviceRTL/include/Mapping.h b/openmp/libomptarget/DeviceRTL/include/Mapping.h
index 36cfae7..c9e07bab 100644
--- a/openmp/libomptarget/DeviceRTL/include/Mapping.h
+++ b/openmp/libomptarget/DeviceRTL/include/Mapping.h
@@ -18,7 +18,7 @@
 
 namespace mapping {
 
-#pragma omp declare target
+#pragma omp begin declare target device_type(nohost)
 
 inline constexpr uint32_t MaxThreadsPerTeam = 1024;
 
diff --git a/openmp/libomptarget/DeviceRTL/include/State.h b/openmp/libomptarget/DeviceRTL/include/State.h
index 2f9cbd4c..183b684 100644
--- a/openmp/libomptarget/DeviceRTL/include/State.h
+++ b/openmp/libomptarget/DeviceRTL/include/State.h
@@ -15,7 +15,7 @@
 #include "Debug.h"
 #include "Types.h"
 
-#pragma omp declare target
+#pragma omp begin declare target device_type(nohost)
 
 namespace _OMP {
 
diff --git a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
index e9cc9bb..b3d779a 100644
--- a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
@@ -18,7 +18,7 @@
 
 using namespace _OMP;
 
-#pragma omp declare target
+#pragma omp begin declare target device_type(nohost)
 
 // defined by CGOpenMPRuntimeGPU
 extern uint32_t __omp_rtl_debug_kind;
diff --git a/openmp/libomptarget/DeviceRTL/src/Debug.cpp b/openmp/libomptarget/DeviceRTL/src/Debug.cpp
index f458a1b..e97c77d 100644
--- a/openmp/libomptarget/DeviceRTL/src/Debug.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Debug.cpp
@@ -18,7 +18,7 @@
 
 using namespace _OMP;
 
-#pragma omp declare target
+#pragma omp begin declare target device_type(nohost)
 
 extern "C" {
 void __assert_assume(bool condition) { __builtin_assume(condition); }
@@ -30,6 +30,10 @@
   __builtin_trap();
 }
 
+namespace impl {
+int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t);
+}
+
 #pragma omp begin declare variant match(                                       \
     device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
 int32_t vprintf(const char *, void *);
@@ -55,8 +59,7 @@
 }
 
 /// Current indentation level for the function trace. Only accessed by thread 0.
-__attribute__((loader_uninitialized))
-static uint32_t Level;
+__attribute__((loader_uninitialized)) static uint32_t Level;
 #pragma omp allocate(Level) allocator(omp_pteam_mem_alloc)
 
 DebugEntryRAII::DebugEntryRAII(const char *File, const unsigned Line,
diff --git a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
index 8b7a8a2..74c22a6 100644
--- a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
@@ -19,7 +19,7 @@
 
 using namespace _OMP;
 
-#pragma omp declare target
+#pragma omp begin declare target device_type(nohost)
 
 static void inititializeRuntime(bool IsSPMD) {
   // Order is important here.
diff --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
index 21104be..48ca13a 100644
--- a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -15,7 +15,7 @@
 #include "Types.h"
 #include "Utils.h"
 
-#pragma omp declare target
+#pragma omp begin declare target device_type(nohost)
 
 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
 
@@ -24,6 +24,23 @@
 namespace _OMP {
 namespace impl {
 
+// Forward declarations of functions to be defined for AMDGCN and NVPTX.
+const llvm::omp::GV &getGridValue();
+uint32_t getGridDim(uint32_t n, uint16_t d);
+uint32_t getWorkgroupDim(uint32_t group_id, uint32_t grid_size,
+                         uint16_t group_size);
+uint32_t getNumHardwareThreadsInBlock();
+LaneMaskTy activemask();
+LaneMaskTy lanemaskLT();
+LaneMaskTy lanemaskGT();
+uint32_t getThreadIdInWarp();
+uint32_t getThreadIdInBlock();
+uint32_t getKernelSize();
+uint32_t getBlockId();
+uint32_t getNumberOfBlocks();
+uint32_t getWarpId();
+uint32_t getNumberOfWarpsInBlock();
+
 /// AMDGCN Implementation
 ///
 ///{
diff --git a/openmp/libomptarget/DeviceRTL/src/Misc.cpp b/openmp/libomptarget/DeviceRTL/src/Misc.cpp
index 7284be8..554a13a 100644
--- a/openmp/libomptarget/DeviceRTL/src/Misc.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Misc.cpp
@@ -13,11 +13,15 @@
 
 #include "Debug.h"
 
-#pragma omp declare target
+#pragma omp begin declare target device_type(nohost)
 
 namespace _OMP {
 namespace impl {
 
+double getWTick();
+
+double getWTime();
+
 /// AMDGCN Implementation
 ///
 ///{
diff --git a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
index 5584f34..fd419b8 100644
--- a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
@@ -42,7 +42,7 @@
 
 using namespace _OMP;
 
-#pragma omp declare target
+#pragma omp begin declare target device_type(nohost)
 
 namespace {
 
diff --git a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
index dd1d30d..516da6b 100644
--- a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
@@ -22,7 +22,7 @@
 
 namespace {
 
-#pragma omp declare target
+#pragma omp begin declare target device_type(nohost)
 
 void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) {
   for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) {
diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp
index a39d8d6..685c697 100644
--- a/openmp/libomptarget/DeviceRTL/src/State.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/State.cpp
@@ -19,7 +19,7 @@
 
 using namespace _OMP;
 
-#pragma omp declare target
+#pragma omp begin declare target device_type(nohost)
 
 /// Memory implementation
 ///
diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
index 6b4bab0..4327871 100644
--- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -19,7 +19,7 @@
 #include "Types.h"
 #include "Utils.h"
 
-#pragma omp declare target
+#pragma omp begin declare target device_type(nohost)
 
 using namespace _OMP;
 
@@ -63,6 +63,22 @@
 }
 ///}
 
+// Forward declarations of functions to be defined for AMDGCN and NVPTX.
+uint32_t atomicInc(uint32_t *A, uint32_t V, int Ordering);
+void namedBarrierInit();
+void namedBarrier();
+void fenceTeam(int Ordering);
+void fenceKernel(int Ordering);
+void fenceSystem(int Ordering);
+void syncWarp(__kmpc_impl_lanemask_t);
+void syncThreads();
+void syncThreadsAligned() { syncThreads(); }
+void unsetLock(omp_lock_t *);
+int testLock(omp_lock_t *);
+void initLock(omp_lock_t *);
+void destroyLock(omp_lock_t *);
+void setLock(omp_lock_t *);
+
 /// AMDGCN Implementation
 ///
 ///{
diff --git a/openmp/libomptarget/DeviceRTL/src/Tasking.cpp b/openmp/libomptarget/DeviceRTL/src/Tasking.cpp
index 2c80e71..06804e0 100644
--- a/openmp/libomptarget/DeviceRTL/src/Tasking.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Tasking.cpp
@@ -20,7 +20,7 @@
 
 using namespace _OMP;
 
-#pragma omp declare target
+#pragma omp begin declare target device_type(nohost)
 
 TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, uint32_t, int32_t,
                                         uint64_t TaskSizeInclPrivateValues,
diff --git a/openmp/libomptarget/DeviceRTL/src/Utils.cpp b/openmp/libomptarget/DeviceRTL/src/Utils.cpp
index 0816f07..e6bcba8 100644
--- a/openmp/libomptarget/DeviceRTL/src/Utils.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Utils.cpp
@@ -15,7 +15,7 @@
 #include "Interface.h"
 #include "Mapping.h"
 
-#pragma omp declare target
+#pragma omp begin declare target device_type(nohost)
 
 using namespace _OMP;
 
@@ -32,6 +32,9 @@
 
 namespace impl {
 
+void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits);
+uint64_t Pack(uint32_t LowBits, uint32_t HighBits);
+
 /// AMDGCN Implementation
 ///
 ///{
@@ -72,6 +75,10 @@
 
 #pragma omp end declare variant
 
+int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane);
+int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
+                    int32_t Width);
+
 /// AMDGCN Implementation
 ///
 ///{
diff --git a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
index 0bdbf30..81b3f6c 100644
--- a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
@@ -43,7 +43,7 @@
 #define NOT_FINISHED 1
 #define LAST_CHUNK 2
 
-#pragma omp declare target
+#pragma omp begin declare target device_type(nohost)
 
 // TODO: This variable is a hack inherited from the old runtime.
 static uint64_t SHARED(Cnt);