//===- SCFToGPU.cpp - Convert an affine loop nest to a GPU kernel --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This implements a straightforward conversion of a loop nest into a GPU
// kernel. The caller is expected to guarantee that the conversion is correct
// or to further transform the kernel to ensure correctness.
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/SCFToGPU/SCFToGPU.h"

#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/ParallelLoopMapper.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Builders.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "loops-to-gpu"

using namespace mlir;
using namespace mlir::scf;

// Name of internal attribute to mark visited operations during conversion.
//
// NOTE: The conversion originally used the following legality criteria:
//   `!parallelOp->hasAttr(gpu::getMappingAttrName())`
// But the provided pattern might reject some cases based on more detailed
// analysis of the `mapping` attribute.
// To avoid dialect conversion failure due to a non-converted illegal
// operation, we use this extra Unit attribute as a marker that the operation
// was checked by the pattern and should be considered legal in the following
// legality checks. The `finalizeParallelLoopToGPUConversion` function performs
// cleanup of these extra attributes and is supposed to be called after the
// dialect conversion.
//
// TODO: Implement a cleaner solution, factoring out the "matching" logic
// from the pattern and its callees into a separate function that can be called
// from both the pattern and the op legality check.
static constexpr StringLiteral kVisitedAttrName = "SCFToGPU_visited";

// Extract an indexed value from KernelDim3.
static Value getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) {
  switch (pos) {
  case 0:
    return dim3.x;
  case 1:
    return dim3.y;
  case 2:
    return dim3.z;
  default:
    llvm_unreachable("dim3 position out of bounds");
  }
  return nullptr;
}

// Get the lower bound-related operands of a loop operation.
static Operation::operand_range getLowerBoundOperands(AffineForOp forOp) {
  return forOp.getLowerBoundOperands();
}

// Get the upper bound-related operands of a loop operation.
static Operation::operand_range getUpperBoundOperands(AffineForOp forOp) {
  return forOp.getUpperBoundOperands();
}

// Get a Value that corresponds to the loop step. If the step is an attribute,
// materialize a corresponding constant using builder.
static Value getOrCreateStep(AffineForOp forOp, OpBuilder &builder) {
  return builder.create<arith::ConstantIndexOp>(forOp.getLoc(),
                                                forOp.getStep());
}

// Get a Value for the loop lower bound. If the value requires computation,
// materialize the instructions using builder.
static Value getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder) {
  return lowerAffineLowerBound(forOp, builder);
}

// Get a Value for the loop upper bound. If the value requires computation,
// materialize the instructions using builder.
static Value getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder) {
  return lowerAffineUpperBound(forOp, builder);
}

// Check the structure of the loop nest:
//   - there are enough loops to map to numDims;
//   - the loops are perfectly nested;
//   - the loop bounds can be computed above the outermost loop.
// This roughly corresponds to the "matcher" part of the pattern-based
// rewriting infrastructure.
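//
// For illustration, a nest like the following (operand names hypothetical)
// passes the check for numDims = 2: the two loops are perfectly nested and
// all bounds are computable above the outermost loop.
//
//   affine.for %i = 0 to 128 {
//     affine.for %j = 0 to 64 {
//       "some.op"(%i, %j) : (index, index) -> ()
//     }
//   }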
static LogicalResult checkAffineLoopNestMappableImpl(AffineForOp forOp,
                                                     unsigned numDims) {
  Region &limit = forOp.region();
  for (unsigned i = 0, e = numDims; i < e; ++i) {
    Operation *nested = &forOp.getBody()->front();
    if (!areValuesDefinedAbove(getLowerBoundOperands(forOp), limit) ||
        !areValuesDefinedAbove(getUpperBoundOperands(forOp), limit))
      return forOp.emitError(
          "loops with bounds depending on other mapped loops "
          "are not supported");

    // The innermost loop can have an arbitrary body; skip the perfect nesting
    // check for it.
    if (i == e - 1)
      break;

    auto begin = forOp.getBody()->begin(), end = forOp.getBody()->end();
    if (forOp.getBody()->empty() || std::next(begin, 2) != end)
      return forOp.emitError("expected perfectly nested loops in the body");

    if (!(forOp = dyn_cast<AffineForOp>(nested)))
      return nested->emitError("expected a nested loop");
  }
  return success();
}

static LogicalResult checkAffineLoopNestMappable(AffineForOp forOp,
                                                 unsigned numBlockDims,
                                                 unsigned numThreadDims) {
  if (numBlockDims < 1 || numThreadDims < 1) {
    LLVM_DEBUG(llvm::dbgs() << "nothing to map");
    return success();
  }

  if (numBlockDims > 3) {
    return forOp.emitError("cannot map to more than 3 block dimensions");
  }
  if (numThreadDims > 3) {
    return forOp.emitError("cannot map to more than 3 thread dimensions");
  }
  return checkAffineLoopNestMappableImpl(forOp, numBlockDims + numThreadDims);
}

namespace {
// Helper structure that holds common state of the loop to GPU kernel
// conversion.
struct AffineLoopToGpuConverter {
  Optional<AffineForOp> collectBounds(AffineForOp forOp, unsigned numLoops);

  void createLaunch(AffineForOp rootForOp, AffineForOp innermostForOp,
                    unsigned numBlockDims, unsigned numThreadDims);

  // Ranges of the loops mapped to blocks or threads.
  SmallVector<Value, 6> dims;
  // Lower bounds of the loops mapped to blocks or threads.
  SmallVector<Value, 6> lbs;
  // Induction variables of the loops mapped to blocks or threads.
  SmallVector<Value, 6> ivs;
  // Steps of the loops mapped to blocks or threads.
  SmallVector<Value, 6> steps;
};
} // namespace

// Return true if the value is obviously a constant "one".
static bool isConstantOne(Value value) {
  if (auto def = value.getDefiningOp<arith::ConstantIndexOp>())
    return def.value() == 1;
  return false;
}

// Collect ranges, bounds, steps and induction variables in preparation for
// mapping a loop nest of depth "numLoops" rooted at "forOp" to a GPU kernel.
// This may fail if the IR for computing loop bounds cannot be constructed, for
// example if an affine loop uses semi-affine maps. Return the last loop to be
// mapped on success, llvm::None on failure.
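//
// As a small illustration (names hypothetical), for a loop
//   affine.for %i = %lb to %ub step 4 { ... }
// this records range = (%ub - %lb) / 4, lower bound %lb, step 4 and induction
// variable %i before descending into the loop nested immediately below it.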
Optional<AffineForOp>
AffineLoopToGpuConverter::collectBounds(AffineForOp forOp, unsigned numLoops) {
  OpBuilder builder(forOp.getOperation());
  dims.reserve(numLoops);
  lbs.reserve(numLoops);
  ivs.reserve(numLoops);
  steps.reserve(numLoops);
  AffineForOp currentLoop = forOp;
  for (unsigned i = 0; i < numLoops; ++i) {
    Value lowerBound = getOrEmitLowerBound(currentLoop, builder);
    Value upperBound = getOrEmitUpperBound(currentLoop, builder);
    if (!lowerBound || !upperBound) {
      return llvm::None;
    }

    Value range = builder.create<arith::SubIOp>(currentLoop.getLoc(),
                                                upperBound, lowerBound);
    Value step = getOrCreateStep(currentLoop, builder);
    if (!isConstantOne(step))
      range = builder.create<arith::DivSIOp>(currentLoop.getLoc(), range, step);
    dims.push_back(range);

    lbs.push_back(lowerBound);
    ivs.push_back(currentLoop.getInductionVar());
    steps.push_back(step);

    if (i != numLoops - 1)
      currentLoop = cast<AffineForOp>(&currentLoop.getBody()->front());
  }
  return currentLoop;
}

// Replace the loop nest rooted at "rootForOp" with a GPU launch operation.
// This expects "innermostForOp" to point to the last loop to be transformed
// to the kernel, and to have (numBlockDims + numThreadDims) perfectly nested
// loops between "rootForOp" and "innermostForOp".
void AffineLoopToGpuConverter::createLaunch(AffineForOp rootForOp,
                                            AffineForOp innermostForOp,
                                            unsigned numBlockDims,
                                            unsigned numThreadDims) {
  OpBuilder builder(rootForOp.getOperation());
  // Prepare the grid and block sizes for the launch operation. If there is
  // no loop mapped to a specific dimension, use constant "1" as its size.
  Value constOne =
      (numBlockDims < 3 || numThreadDims < 3)
          ? builder.create<arith::ConstantIndexOp>(rootForOp.getLoc(), 1)
          : nullptr;
  Value gridSizeX = numBlockDims > 0 ? dims[0] : constOne;
  Value gridSizeY = numBlockDims > 1 ? dims[1] : constOne;
  Value gridSizeZ = numBlockDims > 2 ? dims[2] : constOne;
  Value blockSizeX = numThreadDims > 0 ? dims[numBlockDims] : constOne;
  Value blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne;
  Value blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne;

  // Create a launch op and move the body region of the innermost loop to the
  // launch op.
  auto launchOp = builder.create<gpu::LaunchOp>(
      rootForOp.getLoc(), gridSizeX, gridSizeY, gridSizeZ, blockSizeX,
      blockSizeY, blockSizeZ);

  // Replace the loop terminator (loops contain only a single block) with the
  // gpu terminator and move the operations from the loop body block to the gpu
  // launch body block. Do not move the entire block because of the difference
  // in block arguments.
  Operation &terminator = innermostForOp.getBody()->back();
  Location terminatorLoc = terminator.getLoc();
  terminator.erase();
  builder.setInsertionPointToEnd(innermostForOp.getBody());
  builder.create<gpu::TerminatorOp>(terminatorLoc, llvm::None);
  launchOp.body().front().getOperations().splice(
      launchOp.body().front().begin(),
      innermostForOp.getBody()->getOperations());

  // Remap the loop iterators to use block/thread identifiers instead. Loops
  // may iterate from LB with step S whereas GPU thread/block ids always iterate
  // from 0 to N with step 1. Therefore, loop induction variables are replaced
  // with (gpu-thread/block-id * S) + LB.
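  // As an illustration (values hypothetical): a loop starting at LB = 8 with
  // step S = 2 that is mapped to the x-dimension of blocks has every use of
  // its induction variable rewritten to (block-id.x * 2) + 8.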
  builder.setInsertionPointToStart(&launchOp.body().front());
  auto *lbArgumentIt = lbs.begin();
  auto *stepArgumentIt = steps.begin();
  for (const auto &en : llvm::enumerate(ivs)) {
    Value id =
        en.index() < numBlockDims
            ? getDim3Value(launchOp.getBlockIds(), en.index())
            : getDim3Value(launchOp.getThreadIds(), en.index() - numBlockDims);
    Value step = steps[en.index()];
    if (!isConstantOne(step))
      id = builder.create<arith::MulIOp>(rootForOp.getLoc(), step, id);

    Value ivReplacement =
        builder.create<arith::AddIOp>(rootForOp.getLoc(), *lbArgumentIt, id);
    en.value().replaceAllUsesWith(ivReplacement);
    std::advance(lbArgumentIt, 1);
    std::advance(stepArgumentIt, 1);
  }

  // We are done and can erase the original outermost loop.
  rootForOp.erase();
}

// Generic loop to GPU kernel conversion function.
static LogicalResult convertAffineLoopNestToGPULaunch(AffineForOp forOp,
                                                      unsigned numBlockDims,
                                                      unsigned numThreadDims) {
  if (failed(checkAffineLoopNestMappable(forOp, numBlockDims, numThreadDims)))
    return failure();

  AffineLoopToGpuConverter converter;
  auto maybeInnerLoop =
      converter.collectBounds(forOp, numBlockDims + numThreadDims);
  if (!maybeInnerLoop)
    return failure();
  converter.createLaunch(forOp, *maybeInnerLoop, numBlockDims, numThreadDims);

  return success();
}

LogicalResult mlir::convertAffineLoopNestToGPULaunch(AffineForOp forOp,
                                                     unsigned numBlockDims,
                                                     unsigned numThreadDims) {
  return ::convertAffineLoopNestToGPULaunch(forOp, numBlockDims, numThreadDims);
}

namespace {
struct ParallelToGpuLaunchLowering : public OpRewritePattern<ParallelOp> {
  using OpRewritePattern<ParallelOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(ParallelOp parallelOp,
                                PatternRewriter &rewriter) const override;
};
} // namespace

/// Tries to derive a static upper bound from the defining operation of
/// `upperBound`.
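///
/// For example (an illustrative case), if `upperBound` is defined by
///   %ub = affine.min affine_map<()[s0] -> (s0, 64)>()[%n]
/// the first constant result of the map, 64, is materialized as an
/// `arith.constant` and returned as the static upper bound.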
static Value deriveStaticUpperBound(Value upperBound,
                                    PatternRewriter &rewriter) {
  if (auto op = upperBound.getDefiningOp<arith::ConstantIndexOp>()) {
    return op;
  }

  if (auto minOp = upperBound.getDefiningOp<AffineMinOp>()) {
    for (const AffineExpr &result : minOp.map().getResults()) {
      if (auto constExpr = result.dyn_cast<AffineConstantExpr>()) {
        return rewriter.create<arith::ConstantIndexOp>(minOp.getLoc(),
                                                       constExpr.getValue());
      }
    }
  }

  if (auto multiplyOp = upperBound.getDefiningOp<arith::MulIOp>()) {
    if (auto lhs = dyn_cast_or_null<arith::ConstantIndexOp>(
            deriveStaticUpperBound(multiplyOp.getOperand(0), rewriter)
                .getDefiningOp()))
      if (auto rhs = dyn_cast_or_null<arith::ConstantIndexOp>(
              deriveStaticUpperBound(multiplyOp.getOperand(1), rewriter)
                  .getDefiningOp())) {
        // Assumptions about the upper bound of minimum computations no longer
        // work if multiplied by a negative value, so abort in this case.
        if (lhs.value() < 0 || rhs.value() < 0)
          return {};

        return rewriter.create<arith::ConstantIndexOp>(
            multiplyOp.getLoc(), lhs.value() * rhs.value());
      }
  }

  return {};
}

static bool isMappedToProcessor(gpu::Processor processor) {
  return processor != gpu::Processor::Sequential;
}

static unsigned getLaunchOpArgumentNum(gpu::Processor processor) {
  switch (processor) {
  case gpu::Processor::BlockX:
    return 0;
  case gpu::Processor::BlockY:
    return 1;
  case gpu::Processor::BlockZ:
    return 2;
  case gpu::Processor::ThreadX:
    return 3;
  case gpu::Processor::ThreadY:
    return 4;
  case gpu::Processor::ThreadZ:
    return 5;
  default:;
  }
  llvm_unreachable(
      "invalid processor type while retrieving launch op argument number");
}
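
// Note: the returned index is used for two distinct purposes in this file.
// For example, gpu::Processor::ThreadY yields 4, which addresses both the
// threadIdx.y region argument of the `gpu.launch` body (processParallelLoop)
// and the blockSizeY operand that receives the derived launch bound
// (matchAndRewrite below).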

/// Modifies the current transformation state to capture the effect of the
/// given `scf.parallel` operation on index substitutions and the operations
/// to be inserted.
/// Specifically, if a dimension of a parallel loop is mapped to a hardware id,
/// this function will
/// - compute the loop index based on the hardware id and affine map from the
///   mapping and update `cloningMap` to substitute all uses.
/// - derive a new upper bound for the hardware id and augment the provided
///   `gpu.launch` operation accordingly.
/// - if the upper bound is imprecise, insert a conditional in the `gpu.launch`
///   and update the rewriter to insert into the conditional's body.
/// If the dimension is mapped to sequential,
/// - insert a for loop into the body and update the rewriter to insert into
///   the for loop's body.
/// - update the `cloningMap` to replace uses of the index with the index of
///   the new for loop.
/// In either case,
/// - append the instructions from the loop's body to the worklist, in reverse
///   order.
/// To note the end of the current scope in case a loop or conditional was
/// inserted, a sentinel (the `gpu.launch` operation) is inserted into the
/// worklist. This signals the processor of the worklist to pop the rewriter
/// one scope-level up.
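///
/// For illustration, a mapping annotation for a single dimension might look
/// as follows (values hypothetical):
///   {processor = 0 : i64, map = affine_map<(d0) -> (d0)>,
///    bound = affine_map<(d0) -> (d0)>}
/// which maps that dimension to the x-dimension of blocks with identity index
/// and bound computations.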
static LogicalResult processParallelLoop(
    ParallelOp parallelOp, gpu::LaunchOp launchOp,
    BlockAndValueMapping &cloningMap, SmallVectorImpl<Operation *> &worklist,
    DenseMap<gpu::Processor, Value> &bounds, PatternRewriter &rewriter) {
  // TODO: Verify that this is a valid GPU mapping.
  // processor ids: 0-2 -> block [x/y/z], 3-5 -> thread [x/y/z], 6 -> sequential
  ArrayAttr mapping =
      parallelOp->getAttrOfType<ArrayAttr>(gpu::getMappingAttrName());

  // TODO: Support reductions.
  if (!mapping || parallelOp.getNumResults() != 0)
    return failure();

  Location loc = parallelOp.getLoc();

  auto launchIndependent = [&launchOp](Value val) {
    return val.getParentRegion()->isAncestor(launchOp->getParentRegion());
  };

  auto ensureLaunchIndependent = [&rewriter,
                                  launchIndependent](Value val) -> Value {
    if (launchIndependent(val))
      return val;
    if (auto constOp = val.getDefiningOp<arith::ConstantOp>())
      return rewriter.create<arith::ConstantOp>(constOp.getLoc(),
                                                constOp.getValue());
    return {};
  };

  for (auto config : llvm::zip(
           mapping, parallelOp.getInductionVars(), parallelOp.getLowerBound(),
           parallelOp.getUpperBound(), parallelOp.getStep())) {
    Attribute mappingAttribute;
    Value iv, lowerBound, upperBound, step;
    std::tie(mappingAttribute, iv, lowerBound, upperBound, step) = config;
    auto annotation = mappingAttribute.dyn_cast<gpu::ParallelLoopDimMapping>();
    if (!annotation)
      return parallelOp.emitOpError()
             << "expected mapping attribute for lowering to GPU";
    Value newIndex;
    gpu::Processor processor = gpu::getProcessor(annotation);

    if (isMappedToProcessor(processor)) {
      // Use the corresponding thread/grid index as replacement for the loop iv.
      Value operand =
          launchOp.body().getArgument(getLaunchOpArgumentNum(processor));
      // Take the index map and add the lower bound and step computations in.
      // This computes operand * step + lowerBound.
      // Use an affine map here so that it composes nicely with the provided
      // annotation.
      AffineMap lowerAndStep = AffineMap::get(
          1, 2,
          rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) +
              rewriter.getAffineSymbolExpr(1));
      newIndex = rewriter.create<AffineApplyOp>(
          loc, annotation.map().getValue().compose(lowerAndStep),
          ValueRange{operand, step, lowerBound});
      // If there was also a bound, insert that, too.
      // TODO: Check that we do not assign bounds twice.
      if (annotation.bound().getValue()) {
        // We pass as the single operand to the bound-map the number of
        // iterations, which is (upperBound - lowerBound) ceilDiv step. To
        // support inner loops with dynamic upper bounds (as generated by e.g.
        // tiling), try to derive a max for the bounds. If the used bound for
        // the hardware id is imprecise, wrap the contained code into a
        // conditional. If the lower-bound is constant or defined before the
        // launch, we can use it in the launch bounds. Otherwise fail.
        if (!launchIndependent(lowerBound) &&
            !isa_and_nonnull<arith::ConstantOp>(lowerBound.getDefiningOp()))
          return failure();
        // The step must also be constant or defined outside of the loop nest.
        if (!launchIndependent(step) &&
            !isa_and_nonnull<arith::ConstantOp>(step.getDefiningOp()))
          return failure();
        // If the upper-bound is constant or defined before the launch, we can
        // use it in the launch bounds directly. Otherwise try to derive a
        // bound.
        bool boundIsPrecise =
            launchIndependent(upperBound) ||
            isa_and_nonnull<arith::ConstantOp>(upperBound.getDefiningOp());
        {
          PatternRewriter::InsertionGuard guard(rewriter);
          rewriter.setInsertionPoint(launchOp);
          if (!boundIsPrecise) {
            upperBound = deriveStaticUpperBound(upperBound, rewriter);
            if (!upperBound) {
              return rewriter.notifyMatchFailure(
                  parallelOp,
                  "cannot derive loop-invariant upper bound for number of "
                  "iterations");
            }
          }
          // Compute the number of iterations needed. We compute this as the
          // affine expression (upperBound - lowerBound) ceilDiv step. We use
          // affine.apply here so that it composes nicely with the provided map.
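          // As a worked example (illustrative numbers): with upperBound 10,
          // lowerBound 0 and step 3, this yields ceilDiv(10 - 0, 3) = 4
          // hardware ids along this dimension.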
          AffineMap stepMap = AffineMap::get(
              1, 2,
              ((rewriter.getAffineDimExpr(0) - rewriter.getAffineSymbolExpr(0))
                   .ceilDiv(rewriter.getAffineSymbolExpr(1))));
          Value launchBound = rewriter.create<AffineApplyOp>(
              loc, annotation.bound().getValue().compose(stepMap),
              ValueRange{
                  ensureLaunchIndependent(
                      cloningMap.lookupOrDefault(upperBound)),
                  ensureLaunchIndependent(
                      cloningMap.lookupOrDefault(lowerBound)),
                  ensureLaunchIndependent(cloningMap.lookupOrDefault(step))});
          // TODO(herhut,ravishankarm): Update the behavior of setMappingAttr
          // when this condition is relaxed.
          if (bounds.find(processor) != bounds.end()) {
            return rewriter.notifyMatchFailure(
                parallelOp, "cannot redefine the bound for processor " +
                                Twine(static_cast<int64_t>(processor)));
          }
          bounds[processor] = launchBound;
        }
        if (!boundIsPrecise) {
          // We are using an approximation, create a surrounding conditional.
          Value originalBound = std::get<3>(config);
          arith::CmpIOp pred = rewriter.create<arith::CmpIOp>(
              loc, arith::CmpIPredicate::slt, newIndex,
              cloningMap.lookupOrDefault(originalBound));
          scf::IfOp ifOp = rewriter.create<scf::IfOp>(loc, pred, false);
          rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front());
          // Put a sentinel into the worklist so we know when to pop out of
          // the if body again. We use the launchOp here, as that cannot be
          // part of the body's instructions.
          worklist.push_back(launchOp.getOperation());
        }
      }
    } else {
      // Create a sequential for loop.
      auto loopOp = rewriter.create<scf::ForOp>(
          loc, cloningMap.lookupOrDefault(lowerBound),
          cloningMap.lookupOrDefault(upperBound),
          cloningMap.lookupOrDefault(step));
      newIndex = loopOp.getInductionVar();
      rewriter.setInsertionPointToStart(loopOp.getBody());
      // Put a sentinel into the worklist so we know when to pop out of the
      // loop body again. We use the launchOp here, as that cannot be part of
      // the body's instructions.
      worklist.push_back(launchOp.getOperation());
    }
    cloningMap.map(iv, newIndex);
  }

  // Propagate custom user-defined optional attributes that can be used at a
  // later stage, such as extension data for GPU kernel dispatch.
  for (const auto &namedAttr : parallelOp->getAttrs()) {
    if (namedAttr.getName() == gpu::getMappingAttrName() ||
        namedAttr.getName() == ParallelOp::getOperandSegmentSizeAttr())
      continue;
    launchOp->setAttr(namedAttr.getName(), namedAttr.getValue());
  }

  Block *body = parallelOp.getBody();
  worklist.reserve(worklist.size() + body->getOperations().size());
  for (Operation &op : llvm::reverse(body->without_terminator()))
    worklist.push_back(&op);
  return success();
}

/// Lower a `scf.parallel` operation into a corresponding `gpu.launch`
/// operation.
///
/// This essentially transforms a loop nest into a corresponding SIMT function.
/// The conversion is driven by mapping annotations on the `scf.parallel`
/// operations. The mapping is provided via a `DictionaryAttribute` named
/// `mapping`, which has three entries:
///  - processor: the hardware id to map to. 0-2 are block dimensions, 3-5 are
///               thread dimensions and 6 is sequential.
///  - map : An affine map that is used to pre-process hardware ids before
///          substitution.
///  - bound : An affine map that is used to compute the bound of the hardware
///            id based on an upper bound of the number of iterations.
/// If the `scf.parallel` contains nested `scf.parallel` operations, those
/// need to be annotated, as well. Structurally, the transformation works by
/// splicing all operations from nested `scf.parallel` operations into a single
/// sequence. Indices mapped to hardware ids are substituted with those ids,
/// whereas sequential mappings result in a sequential for-loop. To have more
/// flexibility when mapping code to hardware ids, the transform supports two
/// affine maps. The first `map` is used to compute the actual index for
/// substitution from the hardware id. The second `bound` is used to compute
/// the launch dimension for the hardware id from the number of iterations the
/// mapped loop is performing. Note that the number of iterations might be
/// imprecise if the corresponding loop-bounds are loop-dependent. In such
/// cases, the hardware id might iterate over additional indices. The
/// transformation caters for this by predicating the created sequence of
/// instructions on the actual loop bound. This only works if a static upper
/// bound for the dynamic loop bound can be derived, currently via analyzing
/// `affine.min` operations.
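///
/// As a minimal sketch (operand and attribute values hypothetical), an
/// annotated loop such as
///
///   scf.parallel (%i) = (%c0) to (%n) step (%c1) {
///     ...
///   } {mapping = [{processor = 0 : i64, map = affine_map<(d0) -> (d0)>,
///                  bound = affine_map<(d0) -> (d0)>}]}
///
/// becomes a `gpu.launch` whose x grid size is derived from %n and whose body
/// uses the x block id in place of %i.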
LogicalResult
ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
                                             PatternRewriter &rewriter) const {
  // Mark the operation as visited for recursive legality check.
  parallelOp->setAttr(kVisitedAttrName, rewriter.getUnitAttr());

  // We can only transform starting at the outer-most loop. Launches inside of
  // parallel loops are not supported.
  if (auto parentLoop = parallelOp->getParentOfType<ParallelOp>())
    return failure();
  // Create a launch operation. We start with bound one for all grid/block
  // sizes. Those will be refined later as we discover them from mappings.
  Location loc = parallelOp.getLoc();
  Value constantOne =
      rewriter.create<arith::ConstantIndexOp>(parallelOp.getLoc(), 1);
  gpu::LaunchOp launchOp = rewriter.create<gpu::LaunchOp>(
      parallelOp.getLoc(), constantOne, constantOne, constantOne, constantOne,
      constantOne, constantOne);
  rewriter.setInsertionPointToEnd(&launchOp.body().front());
  rewriter.create<gpu::TerminatorOp>(loc);
  rewriter.setInsertionPointToStart(&launchOp.body().front());

  BlockAndValueMapping cloningMap;
  llvm::DenseMap<gpu::Processor, Value> launchBounds;
  SmallVector<Operation *, 16> worklist;
  if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist,
                                 launchBounds, rewriter)))
    return failure();

  // Whether we have seen any side effects. Reset when leaving an inner scope.
  bool seenSideeffects = false;
  // Whether we have left a nesting scope (and hence are no longer innermost).
  bool leftNestingScope = false;
  while (!worklist.empty()) {
    Operation *op = worklist.pop_back_val();
    // Now walk over the body and clone it.
    // TODO: This is only correct if there either is no further scf.parallel
    // nested or this code is side-effect free. Otherwise we might need
    // predication. We are overly conservative for now and only allow
    // side effects in the innermost scope.
    if (auto nestedParallel = dyn_cast<ParallelOp>(op)) {
      // Before entering a nested scope, make sure there have been no
      // side effects until now.
      if (seenSideeffects)
        return failure();
      // A nested scf.parallel needs insertion of code to compute indices.
      // Insert that now. This will also update the worklist with the loop's
      // body.
      if (failed(processParallelLoop(nestedParallel, launchOp, cloningMap,
                                     worklist, launchBounds, rewriter)))
        return failure();
    } else if (op == launchOp.getOperation()) {
      // Found our sentinel value. We have finished the operations from one
      // nesting level, pop one level back up.
      auto *parent = rewriter.getInsertionPoint()->getParentOp();
      rewriter.setInsertionPointAfter(parent);
      leftNestingScope = true;
      seenSideeffects = false;
    } else {
      // Otherwise we copy it over.
      Operation *clone = rewriter.clone(*op, cloningMap);
      cloningMap.map(op->getResults(), clone->getResults());
      // Check for side effects.
      // TODO: Handle region side effects properly.
      seenSideeffects |= !MemoryEffectOpInterface::hasNoEffect(clone) ||
                         clone->getNumRegions() != 0;
      // If we are no longer in the innermost scope, side effects are
      // disallowed.
      if (seenSideeffects && leftNestingScope)
        return failure();
    }
  }

  // Now that we have succeeded in creating the launch operation, also update
  // the bounds.
  for (auto bound : launchBounds)
    launchOp.setOperand(getLaunchOpArgumentNum(std::get<0>(bound)),
                        std::get<1>(bound));

  rewriter.eraseOp(parallelOp);
  return success();
}

void mlir::populateParallelLoopToGPUPatterns(RewritePatternSet &patterns) {
  patterns.add<ParallelToGpuLaunchLowering>(patterns.getContext());
}

void mlir::configureParallelLoopToGPULegality(ConversionTarget &target) {
  target.addLegalDialect<memref::MemRefDialect>();
  target.addDynamicallyLegalOp<scf::ParallelOp>([](scf::ParallelOp parallelOp) {
    return !parallelOp->hasAttr(gpu::getMappingAttrName()) ||
           parallelOp->hasAttr(kVisitedAttrName);
  });
}
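
// A minimal sketch of the intended driver sequence (pass boilerplate and
// module setup assumed; the actual SCFToGPU pass is the authoritative user):
//
//   RewritePatternSet patterns(module.getContext());
//   populateParallelLoopToGPUPatterns(patterns);
//   ConversionTarget target(*module.getContext());
//   configureParallelLoopToGPULegality(target);
//   if (failed(applyPartialConversion(module, target, std::move(patterns))))
//     return signalPassFailure();
//   finalizeParallelLoopToGPUConversion(module);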

void mlir::finalizeParallelLoopToGPUConversion(Operation *op) {
  op->walk([](scf::ParallelOp parallelOp) {
    parallelOp->removeAttr(kVisitedAttrName);
  });
}