Blame - mlir/lib/Transforms/LoopFusion.cpp - external/github.com/llvm/llvm-project.git

blob: 7b7c0bb22bbbe6f609d0e452e54e0443ea2c25ed [file] [log] [blame]

MLIR Team	f28e4df	2018-11-01 14:26:00	[diff] [blame]	1	//===- LoopFusion.cpp - Code to perform loop fusion -----------------------===//
				2	//
				3	// Copyright 2019 The MLIR Authors.
				4	//
				5	// Licensed under the Apache License, Version 2.0 (the "License");
				6	// you may not use this file except in compliance with the License.
				7	// You may obtain a copy of the License at
				8	//
				9	// http://www.apache.org/licenses/LICENSE-2.0
				10	//
				11	// Unless required by applicable law or agreed to in writing, software
				12	// distributed under the License is distributed on an "AS IS" BASIS,
				13	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	// See the License for the specific language governing permissions and
				15	// limitations under the License.
				16	// =============================================================================
				17	//
				18	// This file implements loop fusion.
				19	//
				20	//===----------------------------------------------------------------------===//
				21
River Riddle	7555383	2019-01-29 05:23:53	[diff] [blame]	22	#include "mlir/AffineOps/AffineOps.h"
MLIR Team	f28e4df	2018-11-01 14:26:00	[diff] [blame]	23	#include "mlir/Analysis/AffineAnalysis.h"
Uday Bondhugula	dfe07b7	2019-02-23 00:51:08	[diff] [blame]	24	#include "mlir/Analysis/AffineStructures.h"
MLIR Team	f28e4df	2018-11-01 14:26:00	[diff] [blame]	25	#include "mlir/Analysis/LoopAnalysis.h"
MLIR Team	3b69230	2018-12-17 17:57:14	[diff] [blame]	26	#include "mlir/Analysis/Utils.h"
MLIR Team	f28e4df	2018-11-01 14:26:00	[diff] [blame]	27	#include "mlir/IR/AffineExpr.h"
				28	#include "mlir/IR/AffineMap.h"
				29	#include "mlir/IR/Builders.h"
				30	#include "mlir/IR/BuiltinOps.h"
River Riddle	48ccae2	2019-02-20 01:17:46	[diff] [blame]	31	#include "mlir/Pass/Pass.h"
Lei Zhang	85d9b6c	2019-03-01 21:48:24	[diff] [blame^]	32	#include "mlir/StandardOps/Ops.h"
MLIR Team	f28e4df	2018-11-01 14:26:00	[diff] [blame]	33	#include "mlir/Transforms/LoopUtils.h"
				34	#include "mlir/Transforms/Passes.h"
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	35	#include "mlir/Transforms/Utils.h"
MLIR Team	f28e4df	2018-11-01 14:26:00	[diff] [blame]	36	#include "llvm/ADT/DenseMap.h"
MLIR Team	3b69230	2018-12-17 17:57:14	[diff] [blame]	37	#include "llvm/ADT/DenseSet.h"
				38	#include "llvm/ADT/SetVector.h"
MLIR Team	4eef795	2018-12-21 19:06:23	[diff] [blame]	39	#include "llvm/Support/CommandLine.h"
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	40	#include "llvm/Support/Debug.h"
MLIR Team	3b69230	2018-12-17 17:57:14	[diff] [blame]	41	#include "llvm/Support/raw_ostream.h"
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	42	#include <iomanip>
MLIR Team	3b69230	2018-12-17 17:57:14	[diff] [blame]	43
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	44	#define DEBUG_TYPE "loop-fusion"
				45
MLIR Team	3b69230	2018-12-17 17:57:14	[diff] [blame]	46	using llvm::SetVector;
MLIR Team	f28e4df	2018-11-01 14:26:00	[diff] [blame]	47
				48	using namespace mlir;
				49
River Riddle	75c21e1	2019-01-26 06:14:04	[diff] [blame]	50	static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options");
				51
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	52	/// Disables fusion profitability check and fuses if valid.
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	53	static llvm::cl::opt<bool>
				54	clMaximalLoopFusion("fusion-maximal", llvm::cl::Hidden,
River Riddle	75c21e1	2019-01-26 06:14:04	[diff] [blame]	55	llvm::cl::desc("Enables maximal loop fusion"),
				56	llvm::cl::cat(clOptionsCategory));
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	57
				58	/// A threshold in percent of additional computation allowed when fusing.
				59	static llvm::cl::opt<double> clFusionAddlComputeTolerance(
				60	"fusion-compute-tolerance", llvm::cl::Hidden,
Uday Bondhugula	a1dad3a	2019-02-20 02:17:19	[diff] [blame]	61	llvm::cl::desc("Fractional increase in additional "
				62	"computation tolerated while fusing"),
River Riddle	75c21e1	2019-01-26 06:14:04	[diff] [blame]	63	llvm::cl::cat(clOptionsCategory));
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	64
Uday Bondhugula	8be2627	2019-02-02 01:06:22	[diff] [blame]	65	static llvm::cl::opt<unsigned> clFusionFastMemorySpace(
				66	"fusion-fast-mem-space", llvm::cl::Hidden,
				67	llvm::cl::desc("Faster memory space number to promote fusion buffers to"),
				68	llvm::cl::cat(clOptionsCategory));
				69
Uday Bondhugula	d4b3ff1	2019-02-27 00:10:19	[diff] [blame]	70	// A local buffer of size less than or equal to this size is promoted to fast
				71	// memory.
				72	static llvm::cl::opt<unsigned long long> clFusionLocalBufThreshold(
Uday Bondhugula	8be2627	2019-02-02 01:06:22	[diff] [blame]	73	"fusion-local-buf-threshold", llvm::cl::Hidden,
Uday Bondhugula	d4b3ff1	2019-02-27 00:10:19	[diff] [blame]	74	llvm::cl::desc("Threshold size (KiB) for promoting local buffers to fast "
Uday Bondhugula	8be2627	2019-02-02 01:06:22	[diff] [blame]	75	"memory space"),
				76	llvm::cl::cat(clOptionsCategory));
				77
MLIR Team	f28e4df	2018-11-01 14:26:00	[diff] [blame]	78	namespace {
				79
MLIR Team	3b69230	2018-12-17 17:57:14	[diff] [blame]	80	/// Loop fusion pass. This pass currently supports a greedy fusion policy,
				81	/// which fuses loop nests with single-writer/single-reader memref dependences
				82	/// with the goal of improving locality.
				83
				84	// TODO(andydavis) Support fusion of source loop nests which write to multiple
				85	// memrefs, where each memref can have multiple users (if profitable).
MLIR Team	f28e4df	2018-11-01 14:26:00	[diff] [blame]	86	// TODO(andydavis) Extend this pass to check for fusion preventing dependences,
				87	// and add support for more general loop fusion algorithms.
MLIR Team	3b69230	2018-12-17 17:57:14	[diff] [blame]	88
River Riddle	c6c5344	2019-02-27 18:59:29	[diff] [blame]	89	struct LoopFusion : public FunctionPass<LoopFusion> {
Uday Bondhugula	d4b3ff1	2019-02-27 00:10:19	[diff] [blame]	90	LoopFusion(unsigned fastMemorySpace = 0, uint64_t localBufSizeThreshold = 0)
River Riddle	c6c5344	2019-02-27 18:59:29	[diff] [blame]	91	: localBufSizeThreshold(localBufSizeThreshold),
Uday Bondhugula	d4b3ff1	2019-02-27 00:10:19	[diff] [blame]	92	fastMemorySpace(fastMemorySpace) {}
MLIR Team	f28e4df	2018-11-01 14:26:00	[diff] [blame]	93
River Riddle	ed5fe20	2019-02-28 22:50:42	[diff] [blame]	94	void runOnFunction() override;
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	95
Uday Bondhugula	d4b3ff1	2019-02-27 00:10:19	[diff] [blame]	96	// Any local buffers smaller than this size (in bytes) will be created in
Uday Bondhugula	8be2627	2019-02-02 01:06:22	[diff] [blame]	97	// `fastMemorySpace` if provided.
Uday Bondhugula	d4b3ff1	2019-02-27 00:10:19	[diff] [blame]	98	uint64_t localBufSizeThreshold;
Uday Bondhugula	8be2627	2019-02-02 01:06:22	[diff] [blame]	99	Optional<unsigned> fastMemorySpace = None;
				100
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	101	// The amount of additional computation that is tolerated while fusing
				102	// pair-wise as a fraction of the total computation.
				103	constexpr static double kComputeToleranceThreshold = 0.30f;
MLIR Team	f28e4df	2018-11-01 14:26:00	[diff] [blame]	104	};
				105
MLIR Team	f28e4df	2018-11-01 14:26:00	[diff] [blame]	106	} // end anonymous namespace
				107
River Riddle	c6c5344	2019-02-27 18:59:29	[diff] [blame]	108	FunctionPassBase *mlir::createLoopFusionPass(unsigned fastMemorySpace,
				109	uint64_t localBufSizeThreshold) {
Uday Bondhugula	d4b3ff1	2019-02-27 00:10:19	[diff] [blame]	110	return new LoopFusion(fastMemorySpace, localBufSizeThreshold);
				111	}
MLIR Team	f28e4df	2018-11-01 14:26:00	[diff] [blame]	112
MLIR Team	3b69230	2018-12-17 17:57:14	[diff] [blame]	113	namespace {
MLIR Team	f28e4df	2018-11-01 14:26:00	[diff] [blame]	114
MLIR Team	3b69230	2018-12-17 17:57:14	[diff] [blame]	115	// LoopNestStateCollector walks loop nests and collects load and store
Chris Lattner	456ad6a	2018-12-29 00:05:35	[diff] [blame]	116	// operations, and whether or not an IfInst was encountered in the loop nest.
River Riddle	bf9c381	2019-02-05 00:24:44	[diff] [blame]	117	struct LoopNestStateCollector {
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	118	SmallVector<OpPointer<AffineForOp>, 4> forOps;
River Riddle	b499277	2019-02-04 18:38:47	[diff] [blame]	119	SmallVector<Instruction *, 4> loadOpInsts;
				120	SmallVector<Instruction *, 4> storeOpInsts;
River Riddle	7555383	2019-01-29 05:23:53	[diff] [blame]	121	bool hasNonForRegion = false;
MLIR Team	3b69230	2018-12-17 17:57:14	[diff] [blame]	122
River Riddle	bf9c381	2019-02-05 00:24:44	[diff] [blame]	123	void collect(Instruction *instToWalk) {
				124	instToWalk->walk([&](Instruction *opInst) {
				125	if (opInst->isa<AffineForOp>())
				126	forOps.push_back(opInst->cast<AffineForOp>());
				127	else if (opInst->getNumBlockLists() != 0)
				128	hasNonForRegion = true;
				129	else if (opInst->isa<LoadOp>())
				130	loadOpInsts.push_back(opInst);
				131	else if (opInst->isa<StoreOp>())
				132	storeOpInsts.push_back(opInst);
				133	});
MLIR Team	3b69230	2018-12-17 17:57:14	[diff] [blame]	134	}
				135	};
				136
MLIR Team	71495d5	2019-01-22 21:23:37	[diff] [blame]	137	// TODO(b/117228571) Replace when this is modeled through side-effects/op traits
River Riddle	b499277	2019-02-04 18:38:47	[diff] [blame]	138	static bool isMemRefDereferencingOp(const Instruction &op) {
MLIR Team	71495d5	2019-01-22 21:23:37	[diff] [blame]	139	if (op.isa<LoadOp>() \|\| op.isa<StoreOp>() \|\| op.isa<DmaStartOp>() \|\|
				140	op.isa<DmaWaitOp>())
				141	return true;
				142	return false;
				143	}
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	144
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	145	// MemRefDependenceGraph is a graph data structure where graph nodes are
Chris Lattner	456ad6a	2018-12-29 00:05:35	[diff] [blame]	146	// top-level instructions in a Function which contain load/store ops, and edges
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	147	// are memref dependences between the nodes.
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	148	// TODO(andydavis) Add a more flexible dependence graph representation.
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	149	// TODO(andydavis) Add a depth parameter to dependence graph construction.
				150	struct MemRefDependenceGraph {
				151	public:
				152	// Node represents a node in the graph. A Node is either an entire loop nest
				153	// rooted at the top level which contains loads/stores, or a top level
				154	// load/store.
				155	struct Node {
				156	// The unique identifier of this node in the graph.
				157	unsigned id;
				158	// The top-level statment which is (or contains) loads/stores.
Chris Lattner	456ad6a	2018-12-29 00:05:35	[diff] [blame]	159	Instruction *inst;
Chris Lattner	5187cfc	2018-12-28 05:21:41	[diff] [blame]	160	// List of load operations.
River Riddle	b499277	2019-02-04 18:38:47	[diff] [blame]	161	SmallVector<Instruction *, 4> loads;
Chris Lattner	456ad6a	2018-12-29 00:05:35	[diff] [blame]	162	// List of store op insts.
River Riddle	b499277	2019-02-04 18:38:47	[diff] [blame]	163	SmallVector<Instruction *, 4> stores;
Chris Lattner	456ad6a	2018-12-29 00:05:35	[diff] [blame]	164	Node(unsigned id, Instruction *inst) : id(id), inst(inst) {}
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	165
				166	// Returns the load op count for 'memref'.
Chris Lattner	3f19031	2018-12-27 22:35:10	[diff] [blame]	167	unsigned getLoadOpCount(Value *memref) {
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	168	unsigned loadOpCount = 0;
Chris Lattner	456ad6a	2018-12-29 00:05:35	[diff] [blame]	169	for (auto *loadOpInst : loads) {
				170	if (memref == loadOpInst->cast<LoadOp>()->getMemRef())
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	171	++loadOpCount;
				172	}
				173	return loadOpCount;
				174	}
				175
				176	// Returns the store op count for 'memref'.
Chris Lattner	3f19031	2018-12-27 22:35:10	[diff] [blame]	177	unsigned getStoreOpCount(Value *memref) {
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	178	unsigned storeOpCount = 0;
Chris Lattner	456ad6a	2018-12-29 00:05:35	[diff] [blame]	179	for (auto *storeOpInst : stores) {
				180	if (memref == storeOpInst->cast<StoreOp>()->getMemRef())
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	181	++storeOpCount;
				182	}
				183	return storeOpCount;
				184	}
MLIR Team	58aa383	2019-02-16 01:12:19	[diff] [blame]	185
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	186	// Returns all store ops in 'storeOps' which access 'memref'.
MLIR Team	58aa383	2019-02-16 01:12:19	[diff] [blame]	187	void getStoreOpsForMemref(Value *memref,
				188	SmallVectorImpl<Instruction > storeOps) {
				189	for (auto *storeOpInst : stores) {
				190	if (memref == storeOpInst->cast<StoreOp>()->getMemRef())
				191	storeOps->push_back(storeOpInst);
				192	}
				193	}
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	194
				195	// Returns all load ops in 'loadOps' which access 'memref'.
				196	void getLoadOpsForMemref(Value *memref,
				197	SmallVectorImpl<Instruction > loadOps) {
				198	for (auto *loadOpInst : loads) {
				199	if (memref == loadOpInst->cast<LoadOp>()->getMemRef())
				200	loadOps->push_back(loadOpInst);
				201	}
				202	}
				203
				204	// Returns all memrefs in 'loadAndStoreMemrefSet' for which this node
				205	// has at least one load and store operation.
				206	void getLoadAndStoreMemrefSet(DenseSet<Value > loadAndStoreMemrefSet) {
				207	llvm::SmallDenseSet<Value *, 2> loadMemrefs;
				208	for (auto *loadOpInst : loads) {
				209	loadMemrefs.insert(loadOpInst->cast<LoadOp>()->getMemRef());
				210	}
				211	for (auto *storeOpInst : stores) {
				212	auto *memref = storeOpInst->cast<StoreOp>()->getMemRef();
				213	if (loadMemrefs.count(memref) > 0)
				214	loadAndStoreMemrefSet->insert(memref);
				215	}
				216	}
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	217	};
				218
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	219	// Edge represents a data dependece between nodes in the graph.
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	220	struct Edge {
				221	// The id of the node at the other end of the edge.
MLIR Team	1e85191	2019-01-31 00:01:46	[diff] [blame]	222	// If this edge is stored in Edge = Node.inEdges[i], then
				223	// 'Node.inEdges[i].id' is the identifier of the source node of the edge.
				224	// If this edge is stored in Edge = Node.outEdges[i], then
				225	// 'Node.outEdges[i].id' is the identifier of the dest node of the edge.
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	226	unsigned id;
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	227	// The SSA value on which this edge represents a dependence.
				228	// If the value is a memref, then the dependence is between graph nodes
				229	// which contain accesses to the same memref 'value'. If the value is a
				230	// non-memref value, then the dependence is between a graph node which
				231	// defines an SSA value and another graph node which uses the SSA value
				232	// (e.g. a constant instruction defining a value which is used inside a loop
				233	// nest).
				234	Value *value;
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	235	};
				236
				237	// Map from node id to Node.
				238	DenseMap<unsigned, Node> nodes;
				239	// Map from node id to list of input edges.
				240	DenseMap<unsigned, SmallVector<Edge, 2>> inEdges;
				241	// Map from node id to list of output edges.
				242	DenseMap<unsigned, SmallVector<Edge, 2>> outEdges;
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	243	// Map from memref to a count on the dependence edges associated with that
				244	// memref.
				245	DenseMap<Value *, unsigned> memrefEdgeCount;
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	246	// The next unique identifier to use for newly created graph nodes.
				247	unsigned nextNodeId = 0;
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	248
				249	MemRefDependenceGraph() {}
				250
				251	// Initializes the dependence graph based on operations in 'f'.
				252	// Returns true on success, false otherwise.
Chris Lattner	69d9e99	2018-12-28 16:48:09	[diff] [blame]	253	bool init(Function *f);
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	254
				255	// Returns the graph node for 'id'.
				256	Node *getNode(unsigned id) {
				257	auto it = nodes.find(id);
				258	assert(it != nodes.end());
				259	return &it->second;
				260	}
				261
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	262	// Adds a node with 'inst' to the graph and returns its unique identifier.
				263	unsigned addNode(Instruction *inst) {
				264	Node node(nextNodeId++, inst);
				265	nodes.insert({node.id, node});
				266	return node.id;
				267	}
				268
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	269	// Remove node 'id' (and its associated edges) from graph.
				270	void removeNode(unsigned id) {
				271	// Remove each edge in 'inEdges[id]'.
				272	if (inEdges.count(id) > 0) {
				273	SmallVector<Edge, 2> oldInEdges = inEdges[id];
				274	for (auto &inEdge : oldInEdges) {
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	275	removeEdge(inEdge.id, id, inEdge.value);
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	276	}
				277	}
				278	// Remove each edge in 'outEdges[id]'.
				279	if (outEdges.count(id) > 0) {
				280	SmallVector<Edge, 2> oldOutEdges = outEdges[id];
				281	for (auto &outEdge : oldOutEdges) {
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	282	removeEdge(id, outEdge.id, outEdge.value);
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	283	}
				284	}
				285	// Erase remaining node state.
				286	inEdges.erase(id);
				287	outEdges.erase(id);
				288	nodes.erase(id);
				289	}
				290
MLIR Team	d7c8244	2019-01-30 23:53:41	[diff] [blame]	291	// Returns true if node 'id' writes to any memref which escapes (or is an
				292	// argument to) the function/block. Returns false otherwise.
				293	bool writesToLiveInOrEscapingMemrefs(unsigned id) {
MLIR Team	71495d5	2019-01-22 21:23:37	[diff] [blame]	294	Node *node = getNode(id);
				295	for (auto *storeOpInst : node->stores) {
				296	auto *memref = storeOpInst->cast<StoreOp>()->getMemRef();
				297	auto *inst = memref->getDefiningInst();
MLIR Team	58aa383	2019-02-16 01:12:19	[diff] [blame]	298	// Return true if 'memref' is a block argument.
River Riddle	b499277	2019-02-04 18:38:47	[diff] [blame]	299	if (!inst)
MLIR Team	d7c8244	2019-01-30 23:53:41	[diff] [blame]	300	return true;
MLIR Team	58aa383	2019-02-16 01:12:19	[diff] [blame]	301	// Return true if any use of 'memref' escapes the function.
River Riddle	b499277	2019-02-04 18:38:47	[diff] [blame]	302	for (auto &use : memref->getUses())
				303	if (!isMemRefDereferencingOp(*use.getOwner()))
MLIR Team	d7c8244	2019-01-30 23:53:41	[diff] [blame]	304	return true;
MLIR Team	d7c8244	2019-01-30 23:53:41	[diff] [blame]	305	}
				306	return false;
				307	}
				308
				309	// Returns true if node 'id' can be removed from the graph. Returns false
				310	// otherwise. A node can be removed from the graph iff the following
				311	// conditions are met:
				312	// *) The node does not write to any memref which escapes (or is a
				313	// function/block argument).
				314	// *) The node has no successors in the dependence graph.
				315	bool canRemoveNode(unsigned id) {
				316	if (writesToLiveInOrEscapingMemrefs(id))
				317	return false;
				318	Node *node = getNode(id);
				319	for (auto *storeOpInst : node->stores) {
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	320	// Return false if there exist out edges from 'id' on 'memref'.
MLIR Team	d7c8244	2019-01-30 23:53:41	[diff] [blame]	321	if (getOutEdgeCount(id, storeOpInst->cast<StoreOp>()->getMemRef()) > 0)
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	322	return false;
MLIR Team	71495d5	2019-01-22 21:23:37	[diff] [blame]	323	}
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	324	return true;
MLIR Team	71495d5	2019-01-22 21:23:37	[diff] [blame]	325	}
				326
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	327	// Returns true iff there is an edge from node 'srcId' to node 'dstId' which
				328	// is for 'value' if non-null, or for any value otherwise. Returns false
				329	// otherwise.
				330	bool hasEdge(unsigned srcId, unsigned dstId, Value *value = nullptr) {
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	331	if (outEdges.count(srcId) == 0 \|\| inEdges.count(dstId) == 0) {
				332	return false;
				333	}
				334	bool hasOutEdge = llvm::any_of(outEdges[srcId], [=](Edge &edge) {
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	335	return edge.id == dstId && (!value \|\| edge.value == value);
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	336	});
				337	bool hasInEdge = llvm::any_of(inEdges[dstId], [=](Edge &edge) {
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	338	return edge.id == srcId && (!value \|\| edge.value == value);
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	339	});
				340	return hasOutEdge && hasInEdge;
				341	}
				342
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	343	// Adds an edge from node 'srcId' to node 'dstId' for 'value'.
				344	void addEdge(unsigned srcId, unsigned dstId, Value *value) {
				345	if (!hasEdge(srcId, dstId, value)) {
				346	outEdges[srcId].push_back({dstId, value});
				347	inEdges[dstId].push_back({srcId, value});
				348	if (value->getType().isa<MemRefType>())
				349	memrefEdgeCount[value]++;
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	350	}
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	351	}
				352
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	353	// Removes an edge from node 'srcId' to node 'dstId' for 'value'.
				354	void removeEdge(unsigned srcId, unsigned dstId, Value *value) {
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	355	assert(inEdges.count(dstId) > 0);
				356	assert(outEdges.count(srcId) > 0);
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	357	if (value->getType().isa<MemRefType>()) {
				358	assert(memrefEdgeCount.count(value) > 0);
				359	memrefEdgeCount[value]--;
				360	}
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	361	// Remove 'srcId' from 'inEdges[dstId]'.
				362	for (auto it = inEdges[dstId].begin(); it != inEdges[dstId].end(); ++it) {
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	363	if ((it).id == srcId && (it).value == value) {
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	364	inEdges[dstId].erase(it);
				365	break;
				366	}
				367	}
				368	// Remove 'dstId' from 'outEdges[srcId]'.
				369	for (auto it = outEdges[srcId].begin(); it != outEdges[srcId].end(); ++it) {
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	370	if ((it).id == dstId && (it).value == value) {
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	371	outEdges[srcId].erase(it);
				372	break;
				373	}
				374	}
				375	}
				376
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	377	// Returns true if there is a path in the dependence graph from node 'srcId'
				378	// to node 'dstId'. Returns false otherwise.
				379	bool hasDependencePath(unsigned srcId, unsigned dstId) {
				380	// Worklist state is: <node-id, next-output-edge-index-to-visit>
				381	SmallVector<std::pair<unsigned, unsigned>, 4> worklist;
				382	worklist.push_back({srcId, 0});
				383	// Run DFS traversal to see if 'dstId' is reachable from 'srcId'.
				384	while (!worklist.empty()) {
				385	auto &idAndIndex = worklist.back();
				386	// Return true if we have reached 'dstId'.
				387	if (idAndIndex.first == dstId)
				388	return true;
				389	// Pop and continue if node has no out edges, or if all out edges have
				390	// already been visited.
				391	if (outEdges.count(idAndIndex.first) == 0 \|\|
				392	idAndIndex.second == outEdges[idAndIndex.first].size()) {
				393	worklist.pop_back();
				394	continue;
				395	}
				396	// Get graph edge to traverse.
				397	Edge edge = outEdges[idAndIndex.first][idAndIndex.second];
				398	// Increment next output edge index for 'idAndIndex'.
				399	++idAndIndex.second;
				400	// Add node at 'edge.id' to worklist.
				401	worklist.push_back({edge.id, 0});
				402	}
				403	return false;
				404	}
				405
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	406	// Returns the input edge count for node 'id' and 'memref' from src nodes
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	407	// which access 'memref' with a store operation.
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	408	unsigned getIncomingMemRefAccesses(unsigned id, Value *memref) {
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	409	unsigned inEdgeCount = 0;
				410	if (inEdges.count(id) > 0)
				411	for (auto &inEdge : inEdges[id])
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	412	if (inEdge.value == memref) {
				413	Node *srcNode = getNode(inEdge.id);
				414	// Only count in edges from 'srcNode' if 'srcNode' accesses 'memref'
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	415	if (srcNode->getStoreOpCount(memref) > 0)
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	416	++inEdgeCount;
				417	}
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	418	return inEdgeCount;
				419	}
				420
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	421	// Returns the output edge count for node 'id' and 'memref' (if non-null),
				422	// otherwise returns the total output edge count from node 'id'.
				423	unsigned getOutEdgeCount(unsigned id, Value *memref = nullptr) {
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	424	unsigned outEdgeCount = 0;
				425	if (outEdges.count(id) > 0)
				426	for (auto &outEdge : outEdges[id])
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	427	if (!memref \|\| outEdge.value == memref)
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	428	++outEdgeCount;
				429	return outEdgeCount;
				430	}
				431
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	432	// Computes and returns an insertion point instruction, before which the
				433	// the fused <srcId, dstId> loop nest can be inserted while preserving
				434	// dependences. Returns nullptr if no such insertion point is found.
MLIR Team	a78edcd	2019-02-05 14:57:08	[diff] [blame]	435	Instruction *getFusedLoopNestInsertionPoint(unsigned srcId, unsigned dstId) {
MLIR Team	5c5739d	2019-01-25 06:27:40	[diff] [blame]	436	if (outEdges.count(srcId) == 0)
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	437	return getNode(dstId)->inst;
				438
				439	// Build set of insts in range (srcId, dstId) which depend on 'srcId'.
				440	SmallPtrSet<Instruction *, 2> srcDepInsts;
				441	for (auto &outEdge : outEdges[srcId])
MLIR Team	a78edcd	2019-02-05 14:57:08	[diff] [blame]	442	if (outEdge.id != dstId)
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	443	srcDepInsts.insert(getNode(outEdge.id)->inst);
				444
				445	// Build set of insts in range (srcId, dstId) on which 'dstId' depends.
				446	SmallPtrSet<Instruction *, 2> dstDepInsts;
				447	for (auto &inEdge : inEdges[dstId])
MLIR Team	a78edcd	2019-02-05 14:57:08	[diff] [blame]	448	if (inEdge.id != srcId)
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	449	dstDepInsts.insert(getNode(inEdge.id)->inst);
				450
				451	Instruction *srcNodeInst = getNode(srcId)->inst;
				452	Instruction *dstNodeInst = getNode(dstId)->inst;
				453
				454	// Computing insertion point:
				455	// *) Walk all instruction positions in Block instruction list in the
				456	// range (src, dst). For each instruction 'inst' visited in this search:
				457	// *) Store in 'firstSrcDepPos' the first position where 'inst' has a
				458	// dependence edge from 'srcNode'.
				459	// *) Store in 'lastDstDepPost' the last position where 'inst' has a
				460	// dependence edge to 'dstNode'.
				461	// *) Compare 'firstSrcDepPos' and 'lastDstDepPost' to determine the
				462	// instruction insertion point (or return null pointer if no such
				463	// insertion point exists: 'firstSrcDepPos' <= 'lastDstDepPos').
				464	SmallVector<Instruction *, 2> depInsts;
				465	Optional<unsigned> firstSrcDepPos;
				466	Optional<unsigned> lastDstDepPos;
				467	unsigned pos = 0;
				468	for (Block::iterator it = std::next(Block::iterator(srcNodeInst));
				469	it != Block::iterator(dstNodeInst); ++it) {
				470	Instruction inst = &(it);
				471	if (srcDepInsts.count(inst) > 0 && firstSrcDepPos == None)
				472	firstSrcDepPos = pos;
				473	if (dstDepInsts.count(inst) > 0)
				474	lastDstDepPos = pos;
				475	depInsts.push_back(inst);
				476	++pos;
MLIR Team	5c5739d	2019-01-25 06:27:40	[diff] [blame]	477	}
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	478
				479	if (firstSrcDepPos.hasValue()) {
				480	if (lastDstDepPos.hasValue()) {
				481	if (firstSrcDepPos.getValue() <= lastDstDepPos.getValue()) {
				482	// No valid insertion point exists which preserves dependences.
				483	return nullptr;
				484	}
				485	}
				486	// Return the insertion point at 'firstSrcDepPos'.
				487	return depInsts[firstSrcDepPos.getValue()];
				488	}
				489	// No dependence targets in range (or only dst deps in range), return
				490	// 'dstNodInst' insertion point.
				491	return dstNodeInst;
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	492	}
				493
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	494	// Updates edge mappings from node 'srcId' to node 'dstId' after 'oldMemRef'
				495	// has been replaced in node at 'dstId' by a private memref.
				496	void updateEdges(unsigned srcId, unsigned dstId, Value *oldMemRef) {
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	497	// For each edge in 'inEdges[srcId]': add new edge remaping to 'dstId'.
				498	if (inEdges.count(srcId) > 0) {
				499	SmallVector<Edge, 2> oldInEdges = inEdges[srcId];
				500	for (auto &inEdge : oldInEdges) {
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	501	// Add edge from 'inEdge.id' to 'dstId' if not for 'oldMemRef'.
				502	if (inEdge.value != oldMemRef)
				503	addEdge(inEdge.id, dstId, inEdge.value);
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	504	}
				505	}
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	506	// For each edge in 'outEdges[srcId]': remove edge from 'srcId' to 'dstId'.
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	507	if (outEdges.count(srcId) > 0) {
				508	SmallVector<Edge, 2> oldOutEdges = outEdges[srcId];
				509	for (auto &outEdge : oldOutEdges) {
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	510	// Remove any out edges from 'srcId' to 'dstId' across memrefs.
				511	if (outEdge.id == dstId)
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	512	removeEdge(srcId, outEdge.id, outEdge.value);
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	513	}
				514	}
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	515	// Remove any edges in 'inEdges[dstId]' on 'oldMemRef' (which is being
				516	// replaced by a private memref). These edges could come from nodes
				517	// other than 'srcId' which were removed in the previous step.
				518	if (inEdges.count(dstId) > 0) {
				519	SmallVector<Edge, 2> oldInEdges = inEdges[dstId];
				520	for (auto &inEdge : oldInEdges)
				521	if (inEdge.value == oldMemRef)
				522	removeEdge(inEdge.id, dstId, inEdge.value);
				523	}
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	524	}
				525
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	526	// Update edge mappings for nodes 'sibId' and 'dstId' to reflect fusion
				527	// of sibling node 'sidId' into node 'dstId'.
				528	void updateEdges(unsigned sibId, unsigned dstId) {
				529	// For each edge in 'inEdges[sibId]':
				530	// *) Add new edge from source node 'inEdge.id' to 'dstNode'.
				531	// *) Remove edge from source node 'inEdge.id' to 'sibNode'.
				532	if (inEdges.count(sibId) > 0) {
				533	SmallVector<Edge, 2> oldInEdges = inEdges[sibId];
				534	for (auto &inEdge : oldInEdges) {
				535	addEdge(inEdge.id, dstId, inEdge.value);
				536	removeEdge(inEdge.id, sibId, inEdge.value);
				537	}
				538	}
				539
				540	// For each edge in 'outEdges[sibId]' to node 'id'
				541	// *) Add new edge from 'dstId' to 'outEdge.id'.
				542	// *) Remove edge from 'sibId' to 'outEdge.id'.
				543	if (outEdges.count(sibId) > 0) {
				544	SmallVector<Edge, 2> oldOutEdges = outEdges[sibId];
				545	for (auto &outEdge : oldOutEdges) {
				546	addEdge(dstId, outEdge.id, outEdge.value);
				547	removeEdge(sibId, outEdge.id, outEdge.value);
				548	}
				549	}
				550	}
				551
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	552	// Adds ops in 'loads' and 'stores' to node at 'id'.
River Riddle	b499277	2019-02-04 18:38:47	[diff] [blame]	553	void addToNode(unsigned id, const SmallVectorImpl<Instruction *> &loads,
				554	const SmallVectorImpl<Instruction *> &stores) {
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	555	Node *node = getNode(id);
Chris Lattner	456ad6a	2018-12-29 00:05:35	[diff] [blame]	556	for (auto *loadOpInst : loads)
				557	node->loads.push_back(loadOpInst);
				558	for (auto *storeOpInst : stores)
				559	node->stores.push_back(storeOpInst);
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	560	}
				561
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	562	void clearNodeLoadAndStores(unsigned id) {
				563	Node *node = getNode(id);
				564	node->loads.clear();
				565	node->stores.clear();
				566	}
				567
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	568	// Calls 'callback' for each input edge incident to node 'id' which carries a
				569	// memref dependence.
				570	void forEachMemRefInputEdge(unsigned id,
				571	const std::function<void(Edge)> &callback) {
				572	if (inEdges.count(id) > 0)
				573	forEachMemRefEdge(inEdges[id], callback);
				574	}
				575	// Calls 'callback' for each output edge from node 'id' which carries a
				576	// memref dependence.
				577	void forEachMemRefOutputEdge(unsigned id,
				578	const std::function<void(Edge)> &callback) {
				579	if (outEdges.count(id) > 0)
				580	forEachMemRefEdge(outEdges[id], callback);
				581	}
				582	// Calls 'callback' for each edge in 'edges' which carries a memref
				583	// dependence.
				584	void forEachMemRefEdge(ArrayRef<Edge> edges,
				585	const std::function<void(Edge)> &callback) {
				586	for (auto &edge : edges) {
				587	// Skip if 'edge' is not a memref dependence edge.
				588	if (!edge.value->getType().isa<MemRefType>())
				589	continue;
				590	assert(nodes.count(edge.id) > 0);
				591	// Skip if 'edge.id' is not a loop nest.
				592	if (!getNode(edge.id)->inst->isa<AffineForOp>())
				593	continue;
				594	// Visit current input edge 'edge'.
				595	callback(edge);
				596	}
				597	}
				598
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	599	void print(raw_ostream &os) const {
				600	os << "\nMemRefDependenceGraph\n";
				601	os << "\nNodes:\n";
				602	for (auto &idAndNode : nodes) {
				603	os << "Node: " << idAndNode.first << "\n";
				604	auto it = inEdges.find(idAndNode.first);
				605	if (it != inEdges.end()) {
				606	for (const auto &e : it->second)
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	607	os << " InEdge: " << e.id << " " << e.value << "\n";
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	608	}
				609	it = outEdges.find(idAndNode.first);
				610	if (it != outEdges.end()) {
				611	for (const auto &e : it->second)
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	612	os << " OutEdge: " << e.id << " " << e.value << "\n";
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	613	}
				614	}
				615	}
				616	void dump() const { print(llvm::errs()); }
				617	};
				618
Chris Lattner	456ad6a	2018-12-29 00:05:35	[diff] [blame]	619	// Intializes the data dependence graph by walking instructions in 'f'.
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	620	// Assigns each node in the graph a node id based on program order in 'f'.
Chris Lattner	315a466	2018-12-28 21:07:39	[diff] [blame]	621	// TODO(andydavis) Add support for taking a Block arg to construct the
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	622	// dependence graph at a different depth.
Chris Lattner	69d9e99	2018-12-28 16:48:09	[diff] [blame]	623	bool MemRefDependenceGraph::init(Function *f) {
Chris Lattner	3f19031	2018-12-27 22:35:10	[diff] [blame]	624	DenseMap<Value *, SetVector<unsigned>> memrefAccesses;
Chris Lattner	dffc589	2018-12-29 23:33:43	[diff] [blame]	625
				626	// TODO: support multi-block functions.
				627	if (f->getBlocks().size() != 1)
				628	return false;
				629
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	630	DenseMap<Instruction *, unsigned> forToNodeMap;
Chris Lattner	dffc589	2018-12-29 23:33:43	[diff] [blame]	631	for (auto &inst : f->front()) {
River Riddle	b499277	2019-02-04 18:38:47	[diff] [blame]	632	if (auto forOp = inst.dyn_cast<AffineForOp>()) {
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	633	// Create graph node 'id' to represent top-level 'forOp' and record
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	634	// all loads and store accesses it contains.
				635	LoopNestStateCollector collector;
River Riddle	bf9c381	2019-02-05 00:24:44	[diff] [blame]	636	collector.collect(&inst);
Uday Bondhugula	4ba8c91	2019-02-07 05:54:18	[diff] [blame]	637	// Return false if a non 'for' region was found (not currently supported).
River Riddle	7555383	2019-01-29 05:23:53	[diff] [blame]	638	if (collector.hasNonForRegion)
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	639	return false;
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	640	Node node(nextNodeId++, &inst);
Chris Lattner	456ad6a	2018-12-29 00:05:35	[diff] [blame]	641	for (auto *opInst : collector.loadOpInsts) {
				642	node.loads.push_back(opInst);
				643	auto *memref = opInst->cast<LoadOp>()->getMemRef();
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	644	memrefAccesses[memref].insert(node.id);
				645	}
Chris Lattner	456ad6a	2018-12-29 00:05:35	[diff] [blame]	646	for (auto *opInst : collector.storeOpInsts) {
				647	node.stores.push_back(opInst);
				648	auto *memref = opInst->cast<StoreOp>()->getMemRef();
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	649	memrefAccesses[memref].insert(node.id);
				650	}
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	651	forToNodeMap[&inst] = node.id;
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	652	nodes.insert({node.id, node});
River Riddle	b499277	2019-02-04 18:38:47	[diff] [blame]	653	} else if (auto loadOp = inst.dyn_cast<LoadOp>()) {
				654	// Create graph node for top-level load op.
				655	Node node(nextNodeId++, &inst);
				656	node.loads.push_back(&inst);
				657	auto *memref = inst.cast<LoadOp>()->getMemRef();
				658	memrefAccesses[memref].insert(node.id);
				659	nodes.insert({node.id, node});
				660	} else if (auto storeOp = inst.dyn_cast<StoreOp>()) {
				661	// Create graph node for top-level store op.
				662	Node node(nextNodeId++, &inst);
				663	node.stores.push_back(&inst);
				664	auto *memref = inst.cast<StoreOp>()->getMemRef();
				665	memrefAccesses[memref].insert(node.id);
				666	nodes.insert({node.id, node});
				667	} else if (inst.getNumBlockLists() != 0) {
				668	// Return false if another region is found (not currently supported).
				669	return false;
				670	} else if (inst.getNumResults() > 0 && !inst.use_empty()) {
				671	// Create graph node for top-level producer of SSA values, which
				672	// could be used by loop nest nodes.
				673	Node node(nextNodeId++, &inst);
				674	nodes.insert({node.id, node});
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	675	}
				676	}
				677
				678	// Add dependence edges between nodes which produce SSA values and their
				679	// users.
				680	for (auto &idAndNode : nodes) {
				681	const Node &node = idAndNode.second;
				682	if (!node.loads.empty() \|\| !node.stores.empty())
				683	continue;
River Riddle	b499277	2019-02-04 18:38:47	[diff] [blame]	684	auto *opInst = node.inst;
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	685	for (auto *value : opInst->getResults()) {
				686	for (auto &use : value->getUses()) {
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	687	SmallVector<OpPointer<AffineForOp>, 4> loops;
River Riddle	b499277	2019-02-04 18:38:47	[diff] [blame]	688	getLoopIVs(*use.getOwner(), &loops);
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	689	if (loops.empty())
				690	continue;
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	691	assert(forToNodeMap.count(loops[0]->getInstruction()) > 0);
				692	unsigned userLoopNestId = forToNodeMap[loops[0]->getInstruction()];
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	693	addEdge(node.id, userLoopNestId, value);
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	694	}
				695	}
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	696	}
				697
				698	// Walk memref access lists and add graph edges between dependent nodes.
				699	for (auto &memrefAndList : memrefAccesses) {
				700	unsigned n = memrefAndList.second.size();
				701	for (unsigned i = 0; i < n; ++i) {
				702	unsigned srcId = memrefAndList.second[i];
				703	bool srcHasStore =
				704	getNode(srcId)->getStoreOpCount(memrefAndList.first) > 0;
				705	for (unsigned j = i + 1; j < n; ++j) {
				706	unsigned dstId = memrefAndList.second[j];
				707	bool dstHasStore =
				708	getNode(dstId)->getStoreOpCount(memrefAndList.first) > 0;
				709	if (srcHasStore \|\| dstHasStore)
				710	addEdge(srcId, dstId, memrefAndList.first);
				711	}
				712	}
				713	}
				714	return true;
				715	}
				716
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	717	namespace {
				718
				719	// LoopNestStats aggregates various per-loop statistics (eg. loop trip count
				720	// and operation count) for a loop nest up until the innermost loop body.
				721	struct LoopNestStats {
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	722	// Map from AffineForOp to immediate child AffineForOps in its loop body.
				723	DenseMap<Instruction *, SmallVector<OpPointer<AffineForOp>, 2>> loopMap;
				724	// Map from AffineForOp to count of operations in its loop body.
				725	DenseMap<Instruction *, uint64_t> opCountMap;
				726	// Map from AffineForOp to its constant trip count.
				727	DenseMap<Instruction *, uint64_t> tripCountMap;
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	728	};
				729
				730	// LoopNestStatsCollector walks a single loop nest and gathers per-loop
				731	// trip count and operation count statistics and records them in 'stats'.
River Riddle	bf9c381	2019-02-05 00:24:44	[diff] [blame]	732	struct LoopNestStatsCollector {
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	733	LoopNestStats *stats;
				734	bool hasLoopWithNonConstTripCount = false;
				735
				736	LoopNestStatsCollector(LoopNestStats *stats) : stats(stats) {}
				737
River Riddle	bf9c381	2019-02-05 00:24:44	[diff] [blame]	738	void collect(Instruction *inst) {
				739	inst->walk<AffineForOp>([&](OpPointer<AffineForOp> forOp) {
				740	auto *forInst = forOp->getInstruction();
				741	auto *parentInst = forOp->getInstruction()->getParentInst();
				742	if (parentInst != nullptr) {
				743	assert(parentInst->isa<AffineForOp>() && "Expected parent AffineForOp");
				744	// Add mapping to 'forOp' from its parent AffineForOp.
				745	stats->loopMap[parentInst].push_back(forOp);
				746	}
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	747
River Riddle	bf9c381	2019-02-05 00:24:44	[diff] [blame]	748	// Record the number of op instructions in the body of 'forOp'.
				749	unsigned count = 0;
				750	stats->opCountMap[forInst] = 0;
				751	for (auto &inst : *forOp->getBody()) {
Uday Bondhugula	d4b3ff1	2019-02-27 00:10:19	[diff] [blame]	752	if (!inst.isa<AffineForOp>() && !inst.isa<AffineIfOp>())
River Riddle	bf9c381	2019-02-05 00:24:44	[diff] [blame]	753	++count;
				754	}
				755	stats->opCountMap[forInst] = count;
				756	// Record trip count for 'forOp'. Set flag if trip count is not
				757	// constant.
				758	Optional<uint64_t> maybeConstTripCount = getConstantTripCount(forOp);
				759	if (!maybeConstTripCount.hasValue()) {
				760	hasLoopWithNonConstTripCount = true;
				761	return;
				762	}
				763	stats->tripCountMap[forInst] = maybeConstTripCount.getValue();
				764	});
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	765	}
				766	};
				767
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	768	// Computes the total cost of the loop nest rooted at 'forOp'.
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	769	// Currently, the total cost is computed by counting the total operation
				770	// instance count (i.e. total number of operations in the loop bodyloop
				771	// operation count * loop trip count) for the entire loop nest.
				772	// If 'tripCountOverrideMap' is non-null, overrides the trip count for loops
				773	// specified in the map when computing the total op instance count.
				774	// NOTE: this is used to compute the cost of computation slices, which are
				775	// sliced along the iteration dimension, and thus reduce the trip count.
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	776	// If 'computeCostMap' is non-null, the total op count for forOps specified
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	777	// in the map is increased (not overridden) by adding the op count from the
				778	// map to the existing op count for the for loop. This is done before
				779	// multiplying by the loop's trip count, and is used to model the cost of
				780	// inserting a sliced loop nest of known cost into the loop's body.
				781	// NOTE: this is used to compute the cost of fusing a slice of some loop nest
				782	// within another loop.
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	783	static int64_t getComputeCost(
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	784	Instruction forInst, LoopNestStats stats,
				785	llvm::SmallDenseMap<Instruction , uint64_t, 8> tripCountOverrideMap,
				786	DenseMap<Instruction , int64_t> computeCostMap) {
				787	// 'opCount' is the total number operations in one iteration of 'forOp' body
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	788	int64_t opCount = stats->opCountMap[forInst];
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	789	if (stats->loopMap.count(forInst) > 0) {
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	790	for (auto childForOp : stats->loopMap[forInst]) {
				791	opCount += getComputeCost(childForOp->getInstruction(), stats,
				792	tripCountOverrideMap, computeCostMap);
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	793	}
				794	}
				795	// Add in additional op instances from slice (if specified in map).
				796	if (computeCostMap != nullptr) {
				797	auto it = computeCostMap->find(forInst);
				798	if (it != computeCostMap->end()) {
				799	opCount += it->second;
				800	}
				801	}
				802	// Override trip count (if specified in map).
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	803	int64_t tripCount = stats->tripCountMap[forInst];
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	804	if (tripCountOverrideMap != nullptr) {
				805	auto it = tripCountOverrideMap->find(forInst);
				806	if (it != tripCountOverrideMap->end()) {
				807	tripCount = it->second;
				808	}
				809	}
				810	// Returns the total number of dynamic instances of operations in loop body.
				811	return tripCount * opCount;
				812	}
				813
				814	} // end anonymous namespace
				815
Uday Bondhugula	7aa60a3	2019-02-27 01:32:47	[diff] [blame]	816	// TODO(andydavis,b/126426796): extend this to handle multiple result maps.
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	817	static Optional<uint64_t> getConstDifference(AffineMap lbMap, AffineMap ubMap) {
Uday Bondhugula	c1ca23e	2019-01-16 21:13:00	[diff] [blame]	818	assert(lbMap.getNumResults() == 1 && "expected single result bound map");
				819	assert(ubMap.getNumResults() == 1 && "expected single result bound map");
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	820	assert(lbMap.getNumDims() == ubMap.getNumDims());
				821	assert(lbMap.getNumSymbols() == ubMap.getNumSymbols());
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	822	AffineExpr lbExpr(lbMap.getResult(0));
				823	AffineExpr ubExpr(ubMap.getResult(0));
				824	auto loopSpanExpr = simplifyAffineExpr(ubExpr - lbExpr, lbMap.getNumDims(),
				825	lbMap.getNumSymbols());
				826	auto cExpr = loopSpanExpr.dyn_cast<AffineConstantExpr>();
				827	if (!cExpr)
				828	return None;
				829	return cExpr.getValue();
				830	}
				831
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	832	// Builds a map 'tripCountMap' from AffineForOp to constant trip count for loop
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	833	// nest surrounding 'srcAccess' utilizing slice loop bounds in 'sliceState'.
				834	// Returns true on success, false otherwise (if a non-constant trip count
				835	// was encountered).
				836	// TODO(andydavis) Make this work with non-unit step loops.
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	837	static bool buildSliceTripCountMap(
River Riddle	b499277	2019-02-04 18:38:47	[diff] [blame]	838	Instruction srcOpInst, ComputationSliceState sliceState,
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	839	llvm::SmallDenseMap<Instruction , uint64_t, 8> tripCountMap) {
				840	SmallVector<OpPointer<AffineForOp>, 4> srcLoopIVs;
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	841	getLoopIVs(*srcOpInst, &srcLoopIVs);
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	842	unsigned numSrcLoopIVs = srcLoopIVs.size();
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	843	// Populate map from AffineForOp -> trip count
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	844	for (unsigned i = 0; i < numSrcLoopIVs; ++i) {
				845	AffineMap lbMap = sliceState->lbs[i];
				846	AffineMap ubMap = sliceState->ubs[i];
Nicolas Vasilache	0e7a8a9	2019-01-26 18:41:17	[diff] [blame]	847	if (lbMap == AffineMap() \|\| ubMap == AffineMap()) {
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	848	// The iteration of src loop IV 'i' was not sliced. Use full loop bounds.
				849	if (srcLoopIVs[i]->hasConstantLowerBound() &&
				850	srcLoopIVs[i]->hasConstantUpperBound()) {
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	851	(*tripCountMap)[srcLoopIVs[i]->getInstruction()] =
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	852	srcLoopIVs[i]->getConstantUpperBound() -
				853	srcLoopIVs[i]->getConstantLowerBound();
				854	continue;
				855	}
				856	return false;
				857	}
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	858	Optional<uint64_t> tripCount = getConstDifference(lbMap, ubMap);
				859	if (!tripCount.hasValue())
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	860	return false;
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	861	(*tripCountMap)[srcLoopIVs[i]->getInstruction()] = tripCount.getValue();
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	862	}
				863	return true;
				864	}
				865
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	866	// Removes load operations from 'srcLoads' which operate on 'memref', and
				867	// adds them to 'dstLoads'.
				868	static void
				869	moveLoadsAccessingMemrefTo(Value *memref,
River Riddle	b499277	2019-02-04 18:38:47	[diff] [blame]	870	SmallVectorImpl<Instruction > srcLoads,
				871	SmallVectorImpl<Instruction > dstLoads) {
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	872	dstLoads->clear();
River Riddle	b499277	2019-02-04 18:38:47	[diff] [blame]	873	SmallVector<Instruction *, 4> srcLoadsToKeep;
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	874	for (auto load : srcLoads) {
				875	if (load->cast<LoadOp>()->getMemRef() == memref)
				876	dstLoads->push_back(load);
				877	else
				878	srcLoadsToKeep.push_back(load);
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	879	}
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	880	srcLoads->swap(srcLoadsToKeep);
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	881	}
				882
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	883	// Returns the innermost common loop depth for the set of operations in 'ops'.
River Riddle	b499277	2019-02-04 18:38:47	[diff] [blame]	884	static unsigned getInnermostCommonLoopDepth(ArrayRef<Instruction *> ops) {
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	885	unsigned numOps = ops.size();
				886	assert(numOps > 0);
				887
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	888	std::vector<SmallVector<OpPointer<AffineForOp>, 4>> loops(numOps);
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	889	unsigned loopDepthLimit = std::numeric_limits<unsigned>::max();
				890	for (unsigned i = 0; i < numOps; ++i) {
				891	getLoopIVs(*ops[i], &loops[i]);
				892	loopDepthLimit =
				893	std::min(loopDepthLimit, static_cast<unsigned>(loops[i].size()));
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	894	}
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	895
				896	unsigned loopDepth = 0;
				897	for (unsigned d = 0; d < loopDepthLimit; ++d) {
				898	unsigned i;
				899	for (i = 1; i < numOps; ++i) {
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	900	if (loops[i - 1][d] != loops[i][d])
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	901	break;
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	902	}
				903	if (i != numOps)
				904	break;
				905	++loopDepth;
				906	}
				907	return loopDepth;
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	908	}
				909
MLIR Team	d7c8244	2019-01-30 23:53:41	[diff] [blame]	910	// Returns the maximum loop depth at which no dependences between 'loadOpInsts'
				911	// and 'storeOpInsts' are satisfied.
River Riddle	b499277	2019-02-04 18:38:47	[diff] [blame]	912	static unsigned getMaxLoopDepth(ArrayRef<Instruction *> loadOpInsts,
				913	ArrayRef<Instruction *> storeOpInsts) {
MLIR Team	d7c8244	2019-01-30 23:53:41	[diff] [blame]	914	// Merge loads and stores into the same array.
River Riddle	b499277	2019-02-04 18:38:47	[diff] [blame]	915	SmallVector<Instruction *, 2> ops(loadOpInsts.begin(), loadOpInsts.end());
MLIR Team	d7c8244	2019-01-30 23:53:41	[diff] [blame]	916	ops.append(storeOpInsts.begin(), storeOpInsts.end());
				917
				918	// Compute the innermost common loop depth for loads and stores.
				919	unsigned loopDepth = getInnermostCommonLoopDepth(ops);
				920
				921	// Return common loop depth for loads if there are no store ops.
				922	if (storeOpInsts.empty())
				923	return loopDepth;
				924
				925	// Check dependences on all pairs of ops in 'ops' and store the minimum
				926	// loop depth at which a dependence is satisfied.
				927	for (unsigned i = 0, e = ops.size(); i < e; ++i) {
				928	auto *srcOpInst = ops[i];
				929	MemRefAccess srcAccess(srcOpInst);
				930	for (unsigned j = 0; j < e; ++j) {
				931	auto *dstOpInst = ops[j];
				932	MemRefAccess dstAccess(dstOpInst);
				933
				934	unsigned numCommonLoops =
				935	getNumCommonSurroundingLoops(srcOpInst, dstOpInst);
				936	for (unsigned d = 1; d <= numCommonLoops + 1; ++d) {
				937	FlatAffineConstraints dependenceConstraints;
				938	// TODO(andydavis) Cache dependence analysis results, check cache here.
				939	if (checkMemrefAccessDependence(srcAccess, dstAccess, d,
				940	&dependenceConstraints,
				941	/dependenceComponents=/nullptr)) {
				942	// Store minimum loop depth and break because we want the min 'd' at
				943	// which there is a dependence.
				944	loopDepth = std::min(loopDepth, d - 1);
				945	break;
				946	}
				947	}
				948	}
				949	}
				950	return loopDepth;
				951	}
				952
MLIR Team	8f5f2c7	2019-02-15 17:32:18	[diff] [blame]	953	// Compute loop interchange permutation:
				954	// *) Computes dependence components between all op pairs in 'ops' for loop
				955	// depths in range [1, 'maxLoopDepth'].
				956	// *) Classifies the outermost 'maxLoopDepth' loops surrounding 'ops' as either
				957	// parallel or sequential.
				958	// *) Computes the loop permutation which sinks sequential loops deeper into
				959	// the loop nest, while preserving the relative order between other loops.
				960	// *) Checks each dependence component against the permutation to see if the
				961	// desired loop interchange would violated dependences by making the a
				962	// dependence componenent lexicographically negative.
				963	// TODO(andydavis) Move this function to LoopUtils.
				964	static bool
				965	computeLoopInterchangePermutation(ArrayRef<Instruction *> ops,
				966	unsigned maxLoopDepth,
				967	SmallVectorImpl<unsigned> *loopPermMap) {
				968	// Gather dependence components for dependences between all ops in 'ops'
				969	// at loop depths in range [1, maxLoopDepth].
				970	// TODO(andydavis) Refactor this loop into a LoopUtil utility function:
				971	// mlir::getDependenceComponents().
				972	// TODO(andydavis) Split this loop into two: first check all dependences,
				973	// and construct dep vectors. Then, scan through them to detect the parallel
				974	// ones.
				975	std::vector<llvm::SmallVector<DependenceComponent, 2>> depCompsVec;
				976	llvm::SmallVector<bool, 8> isParallelLoop(maxLoopDepth, true);
				977	unsigned numOps = ops.size();
				978	for (unsigned d = 1; d <= maxLoopDepth; ++d) {
				979	for (unsigned i = 0; i < numOps; ++i) {
				980	auto *srcOpInst = ops[i];
				981	MemRefAccess srcAccess(srcOpInst);
				982	for (unsigned j = 0; j < numOps; ++j) {
				983	auto *dstOpInst = ops[j];
				984	MemRefAccess dstAccess(dstOpInst);
				985
				986	FlatAffineConstraints dependenceConstraints;
				987	llvm::SmallVector<DependenceComponent, 2> depComps;
				988	// TODO(andydavis,bondhugula) Explore whether it would be profitable
				989	// to pre-compute and store deps instead of repeatidly checking.
				990	if (checkMemrefAccessDependence(srcAccess, dstAccess, d,
				991	&dependenceConstraints, &depComps)) {
				992	isParallelLoop[d - 1] = false;
				993	depCompsVec.push_back(depComps);
				994	}
				995	}
				996	}
				997	}
				998	// Count the number of parallel loops.
				999	unsigned numParallelLoops = 0;
				1000	for (unsigned i = 0, e = isParallelLoop.size(); i < e; ++i)
				1001	if (isParallelLoop[i])
				1002	++numParallelLoops;
				1003
				1004	// Compute permutation of loops that sinks sequential loops (and thus raises
				1005	// parallel loops) while preserving relative order.
				1006	llvm::SmallVector<unsigned, 4> loopPermMapInv;
				1007	loopPermMapInv.resize(maxLoopDepth);
				1008	loopPermMap->resize(maxLoopDepth);
				1009	unsigned nextSequentialLoop = numParallelLoops;
				1010	unsigned nextParallelLoop = 0;
				1011	for (unsigned i = 0; i < maxLoopDepth; ++i) {
				1012	if (isParallelLoop[i]) {
				1013	(*loopPermMap)[i] = nextParallelLoop;
				1014	loopPermMapInv[nextParallelLoop++] = i;
				1015	} else {
				1016	(*loopPermMap)[i] = nextSequentialLoop;
				1017	loopPermMapInv[nextSequentialLoop++] = i;
				1018	}
				1019	}
				1020
				1021	// Check each dependence component against the permutation to see if the
				1022	// desired loop interchange permutation would make the dependence vectors
				1023	// lexicographically negative.
				1024	// Example 1: [-1, 1][0, 0]
				1025	// Example 2: [0, 0][-1, 1]
				1026	for (unsigned i = 0, e = depCompsVec.size(); i < e; ++i) {
				1027	llvm::SmallVector<DependenceComponent, 2> &depComps = depCompsVec[i];
				1028	assert(depComps.size() >= maxLoopDepth);
				1029	// Check if the first non-zero dependence component is positive.
				1030	for (unsigned j = 0; j < maxLoopDepth; ++j) {
				1031	unsigned permIndex = loopPermMapInv[j];
				1032	assert(depComps[permIndex].lb.hasValue());
				1033	int64_t depCompLb = depComps[permIndex].lb.getValue();
				1034	if (depCompLb > 0)
				1035	break;
				1036	if (depCompLb < 0)
				1037	return false;
				1038	}
				1039	}
				1040	return true;
				1041	}
				1042
				1043	// Sinks all sequential loops to the innermost levels (while preserving
				1044	// relative order among them) and moves all parallel loops to the
				1045	// outermost (while again preserving relative order among them).
				1046	// This can increase the loop depth at which we can fuse a slice, since we are
				1047	// pushing loop carried dependence to a greater depth in the loop nest.
				1048	static void sinkSequentialLoops(MemRefDependenceGraph::Node *node) {
				1049	assert(node->inst->isa<AffineForOp>());
				1050	// Get perfectly nested sequence of loops starting at root of loop nest.
				1051	// TODO(andydavis,bondhugula) Share this with similar code in loop tiling.
				1052	SmallVector<OpPointer<AffineForOp>, 4> loops;
				1053	OpPointer<AffineForOp> curr = node->inst->cast<AffineForOp>();
				1054	loops.push_back(curr);
				1055	auto *currBody = curr->getBody();
				1056	while (!currBody->empty() &&
				1057	std::next(currBody->begin()) == currBody->end() &&
				1058	(curr = curr->getBody()->front().dyn_cast<AffineForOp>())) {
				1059	loops.push_back(curr);
				1060	currBody = curr->getBody();
				1061	}
				1062	if (loops.size() < 2)
				1063	return;
				1064
				1065	// Merge loads and stores into the same array.
				1066	SmallVector<Instruction *, 2> memOps(node->loads.begin(), node->loads.end());
				1067	memOps.append(node->stores.begin(), node->stores.end());
				1068
				1069	// Compute loop permutation in 'loopPermMap'.
				1070	llvm::SmallVector<unsigned, 4> loopPermMap;
				1071	if (!computeLoopInterchangePermutation(memOps, loops.size(), &loopPermMap))
				1072	return;
				1073
				1074	int loopNestRootIndex = -1;
				1075	for (int i = loops.size() - 1; i >= 0; --i) {
				1076	int permIndex = static_cast<int>(loopPermMap[i]);
				1077	// Store the index of the for loop which will be the new loop nest root.
				1078	if (permIndex == 0)
				1079	loopNestRootIndex = i;
				1080	if (permIndex > i) {
				1081	// Sink loop 'i' by 'permIndex - i' levels deeper into the loop nest.
				1082	sinkLoop(loops[i], permIndex - i);
				1083	}
				1084	}
				1085	assert(loopNestRootIndex != -1 && "invalid root index");
				1086	node->inst = loops[loopNestRootIndex]->getInstruction();
				1087	}
				1088
Uday Bondhugula	c1ca23e	2019-01-16 21:13:00	[diff] [blame]	1089	// Returns the slice union of 'sliceStateA' and 'sliceStateB' in 'sliceStateB'
				1090	// using a rectangular bounding box.
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	1091	// TODO(andydavis) This function assumes that lower bounds for 'sliceStateA'
				1092	// and 'sliceStateB' are aligned.
				1093	// Specifically, when taking the union of overlapping intervals, it assumes
				1094	// that both intervals start at zero. Support needs to be added to take into
				1095	// account interval start offset when computing the union.
				1096	// TODO(andydavis) Move this function to an analysis library.
Uday Bondhugula	c1ca23e	2019-01-16 21:13:00	[diff] [blame]	1097	static bool getSliceUnion(const ComputationSliceState &sliceStateA,
				1098	ComputationSliceState *sliceStateB) {
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	1099	assert(sliceStateA.lbs.size() == sliceStateB->lbs.size());
				1100	assert(sliceStateA.ubs.size() == sliceStateB->ubs.size());
				1101
				1102	for (unsigned i = 0, e = sliceStateA.lbs.size(); i < e; ++i) {
				1103	AffineMap lbMapA = sliceStateA.lbs[i];
				1104	AffineMap ubMapA = sliceStateA.ubs[i];
Nicolas Vasilache	0e7a8a9	2019-01-26 18:41:17	[diff] [blame]	1105	if (lbMapA == AffineMap()) {
				1106	assert(ubMapA == AffineMap());
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	1107	continue;
				1108	}
Uday Bondhugula	c1ca23e	2019-01-16 21:13:00	[diff] [blame]	1109	assert(ubMapA && "expected non-null ub map");
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	1110
				1111	AffineMap lbMapB = sliceStateB->lbs[i];
				1112	AffineMap ubMapB = sliceStateB->ubs[i];
Nicolas Vasilache	0e7a8a9	2019-01-26 18:41:17	[diff] [blame]	1113	if (lbMapB == AffineMap()) {
				1114	assert(ubMapB == AffineMap());
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	1115	// Union 'sliceStateB' does not have a bound for 'i' so copy from A.
				1116	sliceStateB->lbs[i] = lbMapA;
				1117	sliceStateB->ubs[i] = ubMapA;
				1118	continue;
				1119	}
Uday Bondhugula	c1ca23e	2019-01-16 21:13:00	[diff] [blame]	1120
				1121	// TODO(andydavis) Change this code to take the min across all lower bounds
				1122	// and max across all upper bounds for each dimension. This code can for
				1123	// cases where a unique min or max could not be statically determined.
				1124
				1125	// Assumption: both lower bounds are the same.
				1126	if (lbMapA != lbMapB)
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	1127	return false;
				1128
				1129	// Add bound with the largest trip count to union.
				1130	Optional<uint64_t> tripCountA = getConstDifference(lbMapA, ubMapA);
				1131	Optional<uint64_t> tripCountB = getConstDifference(lbMapB, ubMapB);
				1132	if (!tripCountA.hasValue() \|\| !tripCountB.hasValue())
				1133	return false;
Uday Bondhugula	c1ca23e	2019-01-16 21:13:00	[diff] [blame]	1134
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	1135	if (tripCountA.getValue() > tripCountB.getValue()) {
				1136	sliceStateB->lbs[i] = lbMapA;
				1137	sliceStateB->ubs[i] = ubMapA;
				1138	}
				1139	}
				1140	return true;
				1141	}
				1142
Uday Bondhugula	8be2627	2019-02-02 01:06:22	[diff] [blame]	1143	// TODO(mlir-team): improve/complete this when we have target data.
				1144	unsigned getMemRefEltSizeInBytes(MemRefType memRefType) {
				1145	auto elementType = memRefType.getElementType();
				1146
				1147	unsigned sizeInBits;
				1148	if (elementType.isIntOrFloat()) {
				1149	sizeInBits = elementType.getIntOrFloatBitWidth();
				1150	} else {
				1151	auto vectorType = elementType.cast<VectorType>();
				1152	sizeInBits =
				1153	vectorType.getElementTypeBitWidth() * vectorType.getNumElements();
				1154	}
				1155	return llvm::divideCeil(sizeInBits, 8);
				1156	}
				1157
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	1158	// Creates and returns a private (single-user) memref for fused loop rooted
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	1159	// at 'forOp', with (potentially reduced) memref size based on the
Uday Bondhugula	94a03f8	2019-01-22 21:58:52	[diff] [blame]	1160	// MemRefRegion written to by 'srcStoreOpInst' at depth 'dstLoopDepth'.
				1161	// TODO(bondhugula): consider refactoring the common code from generateDma and
				1162	// this one.
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	1163	static Value *createPrivateMemRef(OpPointer<AffineForOp> forOp,
River Riddle	b499277	2019-02-04 18:38:47	[diff] [blame]	1164	Instruction *srcStoreOpInst,
Uday Bondhugula	8be2627	2019-02-02 01:06:22	[diff] [blame]	1165	unsigned dstLoopDepth,
				1166	Optional<unsigned> fastMemorySpace,
Uday Bondhugula	d4b3ff1	2019-02-27 00:10:19	[diff] [blame]	1167	uint64_t localBufSizeThreshold) {
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	1168	auto *forInst = forOp->getInstruction();
				1169
				1170	// Create builder to insert alloc op just before 'forOp'.
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	1171	FuncBuilder b(forInst);
				1172	// Builder to create constants at the top level.
				1173	FuncBuilder top(forInst->getFunction());
				1174	// Create new memref type based on slice bounds.
				1175	auto *oldMemRef = srcStoreOpInst->cast<StoreOp>()->getMemRef();
				1176	auto oldMemRefType = oldMemRef->getType().cast<MemRefType>();
				1177	unsigned rank = oldMemRefType.getRank();
				1178
Uday Bondhugula	94a03f8	2019-01-22 21:58:52	[diff] [blame]	1179	// Compute MemRefRegion for 'srcStoreOpInst' at depth 'dstLoopDepth'.
Uday Bondhugula	0f50414	2019-02-04 21:48:44	[diff] [blame]	1180	MemRefRegion region(srcStoreOpInst->getLoc());
				1181	region.compute(srcStoreOpInst, dstLoopDepth);
River Riddle	6859f33	2019-01-23 22:39:45	[diff] [blame]	1182	SmallVector<int64_t, 4> newShape;
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	1183	std::vector<SmallVector<int64_t, 4>> lbs;
Uday Bondhugula	94a03f8	2019-01-22 21:58:52	[diff] [blame]	1184	SmallVector<int64_t, 8> lbDivisors;
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	1185	lbs.reserve(rank);
				1186	// Query 'region' for 'newShape' and lower bounds of MemRefRegion accessed
Uday Bondhugula	94a03f8	2019-01-22 21:58:52	[diff] [blame]	1187	// by 'srcStoreOpInst' at depth 'dstLoopDepth'.
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	1188	Optional<int64_t> numElements =
Uday Bondhugula	0f50414	2019-02-04 21:48:44	[diff] [blame]	1189	region.getConstantBoundingSizeAndShape(&newShape, &lbs, &lbDivisors);
Uday Bondhugula	8be2627	2019-02-02 01:06:22	[diff] [blame]	1190	assert(numElements.hasValue() &&
				1191	"non-constant number of elts in local buffer");
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	1192
Uday Bondhugula	0f50414	2019-02-04 21:48:44	[diff] [blame]	1193	const FlatAffineConstraints *cst = region.getConstraints();
Uday Bondhugula	94a03f8	2019-01-22 21:58:52	[diff] [blame]	1194	// 'outerIVs' holds the values that this memory region is symbolic/paramteric
				1195	// on; this would correspond to loop IVs surrounding the level at which the
				1196	// slice is being materialized.
				1197	SmallVector<Value *, 8> outerIVs;
				1198	cst->getIdValues(rank, cst->getNumIds(), &outerIVs);
				1199
				1200	// Build 'rank' AffineExprs from MemRefRegion 'lbs'
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	1201	SmallVector<AffineExpr, 4> offsets;
				1202	offsets.reserve(rank);
				1203	for (unsigned d = 0; d < rank; ++d) {
Uday Bondhugula	94a03f8	2019-01-22 21:58:52	[diff] [blame]	1204	assert(lbs[d].size() == cst->getNumCols() - rank && "incorrect bound size");
				1205
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	1206	AffineExpr offset = top.getAffineConstantExpr(0);
				1207	for (unsigned j = 0, e = cst->getNumCols() - rank - 1; j < e; j++) {
				1208	offset = offset + lbs[d][j] * top.getAffineDimExpr(j);
				1209	}
Uday Bondhugula	94a03f8	2019-01-22 21:58:52	[diff] [blame]	1210	assert(lbDivisors[d] > 0);
				1211	offset =
				1212	(offset + lbs[d][cst->getNumCols() - 1 - rank]).floorDiv(lbDivisors[d]);
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	1213	offsets.push_back(offset);
				1214	}
				1215
				1216	// Create 'newMemRefType' using 'newShape' from MemRefRegion accessed
				1217	// by 'srcStoreOpInst'.
Uday Bondhugula	8be2627	2019-02-02 01:06:22	[diff] [blame]	1218	uint64_t bufSize =
				1219	getMemRefEltSizeInBytes(oldMemRefType) * numElements.getValue();
				1220	unsigned newMemSpace;
Uday Bondhugula	d4b3ff1	2019-02-27 00:10:19	[diff] [blame]	1221	if (bufSize <= localBufSizeThreshold && fastMemorySpace.hasValue()) {
Uday Bondhugula	8be2627	2019-02-02 01:06:22	[diff] [blame]	1222	newMemSpace = fastMemorySpace.getValue();
				1223	} else {
				1224	newMemSpace = oldMemRefType.getMemorySpace();
				1225	}
				1226	auto newMemRefType = top.getMemRefType(
				1227	newShape, oldMemRefType.getElementType(), {}, newMemSpace);
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	1228	// Gather alloc operands for the dynamic dimensions of the memref.
				1229	SmallVector<Value *, 4> allocOperands;
				1230	unsigned dynamicDimCount = 0;
				1231	for (auto dimSize : oldMemRefType.getShape()) {
				1232	if (dimSize == -1)
				1233	allocOperands.push_back(
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	1234	top.create<DimOp>(forOp->getLoc(), oldMemRef, dynamicDimCount++));
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	1235	}
				1236
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	1237	// Create new private memref for fused loop 'forOp'.
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	1238	// TODO(andydavis) Create/move alloc ops for private memrefs closer to their
				1239	// consumer loop nests to reduce their live range. Currently they are added
				1240	// at the beginning of the function, because loop nests can be reordered
				1241	// during the fusion pass.
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	1242	Value *newMemRef =
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	1243	top.create<AllocOp>(forOp->getLoc(), newMemRefType, allocOperands);
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	1244
				1245	// Build an AffineMap to remap access functions based on lower bound offsets.
				1246	SmallVector<AffineExpr, 4> remapExprs;
				1247	remapExprs.reserve(rank);
				1248	unsigned zeroOffsetCount = 0;
				1249	for (unsigned i = 0; i < rank; i++) {
				1250	if (auto constExpr = offsets[i].dyn_cast<AffineConstantExpr>())
				1251	if (constExpr.getValue() == 0)
				1252	++zeroOffsetCount;
Uday Bondhugula	94a03f8	2019-01-22 21:58:52	[diff] [blame]	1253	auto dimExpr = b.getAffineDimExpr(outerIVs.size() + i);
				1254
				1255	auto remapExpr =
				1256	simplifyAffineExpr(dimExpr - offsets[i], outerIVs.size() + rank, 0);
				1257	remapExprs.push_back(remapExpr);
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	1258	}
Uday Bondhugula	94a03f8	2019-01-22 21:58:52	[diff] [blame]	1259	auto indexRemap =
				1260	zeroOffsetCount == rank
Nicolas Vasilache	0e7a8a9	2019-01-26 18:41:17	[diff] [blame]	1261	? AffineMap()
Uday Bondhugula	94a03f8	2019-01-22 21:58:52	[diff] [blame]	1262	: b.getAffineMap(outerIVs.size() + rank, 0, remapExprs, {});
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	1263	// Replace all users of 'oldMemRef' with 'newMemRef'.
Uday Bondhugula	94a03f8	2019-01-22 21:58:52	[diff] [blame]	1264	bool ret =
				1265	replaceAllMemRefUsesWith(oldMemRef, newMemRef, {}, indexRemap,
				1266	/extraOperands=/outerIVs,
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	1267	/domInstFilter=/&*forOp->getBody()->begin());
Uday Bondhugula	94a03f8	2019-01-22 21:58:52	[diff] [blame]	1268	assert(ret && "replaceAllMemrefUsesWith should always succeed here");
MLIR Team	71495d5	2019-01-22 21:23:37	[diff] [blame]	1269	(void)ret;
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	1270	return newMemRef;
				1271	}
				1272
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	1273	// Does the slice have a single iteration?
				1274	static uint64_t getSliceIterationCount(
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	1275	const llvm::SmallDenseMap<Instruction *, uint64_t, 8> &sliceTripCountMap) {
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	1276	uint64_t iterCount = 1;
				1277	for (const auto &count : sliceTripCountMap) {
				1278	iterCount *= count.second;
				1279	}
				1280	return iterCount;
				1281	}
				1282
MLIR Team	58aa383	2019-02-16 01:12:19	[diff] [blame]	1283	// Checks if node 'srcId' (which writes to a live out memref), can be safely
				1284	// fused into node 'dstId'. Returns true if the following conditions are met:
				1285	// *) 'srcNode' writes only writes to live out 'memref'.
				1286	// *) 'srcNode' has exaclty one output edge on 'memref' (which is to 'dstId').
				1287	// *) 'dstNode' does write to 'memref'.
				1288	// *) 'dstNode's write region to 'memref' is a super set of 'srcNode's write
				1289	// region to 'memref'.
				1290	// TODO(andydavis) Generalize this to handle more live in/out cases.
				1291	static bool canFuseSrcWhichWritesToLiveOut(unsigned srcId, unsigned dstId,
				1292	Value *memref,
				1293	MemRefDependenceGraph *mdg) {
				1294	auto *srcNode = mdg->getNode(srcId);
				1295	auto *dstNode = mdg->getNode(dstId);
				1296
				1297	// Return false if any of the following are true:
				1298	// *) 'srcNode' writes to a live in/out memref other than 'memref'.
				1299	// *) 'srcNode' has more than one output edge on 'memref'.
				1300	// *) 'dstNode' does not write to 'memref'.
				1301	if (srcNode->getStoreOpCount(memref) != 1 \|\|
				1302	mdg->getOutEdgeCount(srcNode->id, memref) != 1 \|\|
				1303	dstNode->getStoreOpCount(memref) == 0)
				1304	return false;
				1305	// Compute MemRefRegion 'srcWriteRegion' for 'srcStoreOpInst' on 'memref'.
				1306	auto *srcStoreOpInst = srcNode->stores.front();
				1307	MemRefRegion srcWriteRegion(srcStoreOpInst->getLoc());
				1308	srcWriteRegion.compute(srcStoreOpInst, /loopDepth=/0);
				1309	SmallVector<int64_t, 4> srcShape;
				1310	// Query 'srcWriteRegion' for 'srcShape' and 'srcNumElements'.
				1311	// by 'srcStoreOpInst' at depth 'dstLoopDepth'.
				1312	Optional<int64_t> srcNumElements =
				1313	srcWriteRegion.getConstantBoundingSizeAndShape(&srcShape);
				1314	if (!srcNumElements.hasValue())
				1315	return false;
				1316
				1317	// Compute MemRefRegion 'dstWriteRegion' for 'dstStoreOpInst' on 'memref'.
				1318	SmallVector<Instruction *, 2> dstStoreOps;
				1319	dstNode->getStoreOpsForMemref(memref, &dstStoreOps);
				1320	assert(dstStoreOps.size() == 1);
				1321	auto *dstStoreOpInst = dstStoreOps[0];
				1322	MemRefRegion dstWriteRegion(dstStoreOpInst->getLoc());
				1323	dstWriteRegion.compute(dstStoreOpInst, /loopDepth=/0);
				1324	SmallVector<int64_t, 4> dstShape;
				1325	// Query 'dstWriteRegion' for 'dstShape' and 'dstNumElements'.
				1326	// by 'dstStoreOpInst' at depth 'dstLoopDepth'.
				1327	Optional<int64_t> dstNumElements =
				1328	dstWriteRegion.getConstantBoundingSizeAndShape(&dstShape);
				1329	if (!dstNumElements.hasValue())
				1330	return false;
				1331
				1332	// Return false if write region is not a superset of 'srcNodes' write
				1333	// region to 'memref'.
				1334	// TODO(andydavis) Check the shape and lower bounds here too.
				1335	if (srcNumElements != dstNumElements)
				1336	return false;
				1337	return true;
				1338	}
				1339
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	1340	// Checks the profitability of fusing a backwards slice of the loop nest
MLIR Team	d7c8244	2019-01-30 23:53:41	[diff] [blame]	1341	// surrounding 'srcOpInst' into the loop nest surrounding 'dstLoadOpInsts'.
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	1342	// The argument 'srcStoreOpInst' is used to calculate the storage reduction on
				1343	// the memref being produced and consumed, which is an input to the cost model.
				1344	// For producer-consumer fusion, 'srcStoreOpInst' will be the same as
				1345	// 'srcOpInst', as we are slicing w.r.t to that producer.
				1346	// For input-reuse fusion, 'srcOpInst' will be the src loop nest LoadOp which
				1347	// reads from the same memref as dst loop nest load ops, and 'srcStoreOpInst'
				1348	// will be the unique store op in the src node, which will be used to check
				1349	// that the write region is the same after input-reuse fusion.
Uday Bondhugula	b4a1443	2019-01-26 00:00:50	[diff] [blame]	1350	// Returns true if it is profitable to fuse the candidate loop nests. Returns
				1351	// false otherwise. `dstLoopDepth` is set to the most profitable depth at which
				1352	// to materialize the source loop nest slice.
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	1353	// The profitability model executes the following steps:
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	1354	// *) Computes the backward computation slice at 'srcOpInst'. This
				1355	// computation slice of the loop nest surrounding 'srcOpInst' is
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	1356	// represented by modified src loop bounds in 'sliceState', which are
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	1357	// functions of loop IVs in the loop nest surrounding 'srcOpInst'.
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	1358	// *) Computes the cost of unfused src/dst loop nests (currently the cost of a
				1359	// loop nest is the total number of dynamic operation instances in the loop
				1360	// nest).
				1361	// *) Computes the cost of fusing a slice of the src loop nest into the dst
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	1362	// loop nest at various values of dst loop depth, attempting to fuse
				1363	// the largest computation slice at the maximal dst loop depth (closest to the
				1364	// load) to minimize reuse distance and potentially enable subsequent
				1365	// load/store forwarding.
MLIR Team	d7c8244	2019-01-30 23:53:41	[diff] [blame]	1366	// NOTE: If the dst loop nest includes multiple loads in 'dstLoadOpInsts' for
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	1367	// the same memref as is written by 'srcOpInst', then the union of slice
				1368	// loop bounds is used to compute the slice and associated slice cost.
Uday Bondhugula	b4a1443	2019-01-26 00:00:50	[diff] [blame]	1369	// NOTE: 'dstLoopDepth' refers to the loop depth within the destination loop
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	1370	// nest, at which the src computation slice is inserted/fused.
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	1371	// NOTE: We attempt to maximize the dst loop depth, but there are cases
				1372	// where a particular setting for 'dstLoopNest' might fuse an unsliced
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	1373	// loop (within the src computation slice) at a depth which results in
				1374	// excessive recomputation (see unit tests for examples).
				1375	// *) Compares the total cost of the unfused loop nests to the min cost fused
				1376	// loop nest computed in the previous step, and returns true if the latter
				1377	// is lower.
River Riddle	b499277	2019-02-04 18:38:47	[diff] [blame]	1378	static bool isFusionProfitable(Instruction *srcOpInst,
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	1379	Instruction *srcStoreOpInst,
River Riddle	b499277	2019-02-04 18:38:47	[diff] [blame]	1380	ArrayRef<Instruction *> dstLoadOpInsts,
				1381	ArrayRef<Instruction *> dstStoreOpInsts,
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	1382	ComputationSliceState *sliceState,
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	1383	unsigned *dstLoopDepth) {
Uday Bondhugula	06d21d9	2019-01-25 01:01:49	[diff] [blame]	1384	LLVM_DEBUG({
				1385	llvm::dbgs() << "Checking whether fusion is profitable between:\n";
Uday Bondhugula	a1dad3a	2019-02-20 02:17:19	[diff] [blame]	1386	llvm::dbgs() << " " << *srcOpInst << " and \n";
MLIR Team	d7c8244	2019-01-30 23:53:41	[diff] [blame]	1387	for (auto dstOpInst : dstLoadOpInsts) {
Uday Bondhugula	a1dad3a	2019-02-20 02:17:19	[diff] [blame]	1388	llvm::dbgs() << " " << *dstOpInst << "\n";
Uday Bondhugula	06d21d9	2019-01-25 01:01:49	[diff] [blame]	1389	};
				1390	});
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	1391
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	1392	// Compute cost of sliced and unsliced src loop nest.
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	1393	SmallVector<OpPointer<AffineForOp>, 4> srcLoopIVs;
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	1394	getLoopIVs(*srcOpInst, &srcLoopIVs);
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	1395	unsigned numSrcLoopIVs = srcLoopIVs.size();
				1396
				1397	// Walk src loop nest and collect stats.
				1398	LoopNestStats srcLoopNestStats;
				1399	LoopNestStatsCollector srcStatsCollector(&srcLoopNestStats);
River Riddle	bf9c381	2019-02-05 00:24:44	[diff] [blame]	1400	srcStatsCollector.collect(srcLoopIVs[0]->getInstruction());
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	1401	// Currently only constant trip count loop nests are supported.
				1402	if (srcStatsCollector.hasLoopWithNonConstTripCount)
				1403	return false;
				1404
				1405	// Compute cost of dst loop nest.
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	1406	SmallVector<OpPointer<AffineForOp>, 4> dstLoopIVs;
MLIR Team	d7c8244	2019-01-30 23:53:41	[diff] [blame]	1407	getLoopIVs(*dstLoadOpInsts[0], &dstLoopIVs);
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	1408
				1409	LoopNestStats dstLoopNestStats;
				1410	LoopNestStatsCollector dstStatsCollector(&dstLoopNestStats);
River Riddle	bf9c381	2019-02-05 00:24:44	[diff] [blame]	1411	dstStatsCollector.collect(dstLoopIVs[0]->getInstruction());
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	1412	// Currently only constant trip count loop nests are supported.
				1413	if (dstStatsCollector.hasLoopWithNonConstTripCount)
				1414	return false;
				1415
MLIR Team	d7c8244	2019-01-30 23:53:41	[diff] [blame]	1416	// Compute the maximum loop depth at which we can can insert the src slice
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	1417	// and still satisfy dest loop nest dependences, for producer-consumer fusion.
				1418	unsigned maxDstLoopDepth =
				1419	(srcOpInst == srcStoreOpInst)
				1420	? getMaxLoopDepth(dstLoadOpInsts, dstStoreOpInsts)
				1421	: dstLoopIVs.size();
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	1422	if (maxDstLoopDepth == 0)
				1423	return false;
				1424
				1425	// Search for min cost value for 'dstLoopDepth'. At each value of
				1426	// 'dstLoopDepth' from 'maxDstLoopDepth' to '1', compute computation slice
				1427	// bounds between 'srcOpInst' and each op in 'dstOpinsts' (taking the union
				1428	// of these bounds). Next the union slice bounds are used to calculate
				1429	// the cost of the slice and the cost of the slice inserted into the dst
				1430	// loop nest at 'dstLoopDepth'.
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	1431	uint64_t minFusedLoopNestComputeCost = std::numeric_limits<uint64_t>::max();
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	1432	double maxStorageReduction = 0.0;
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	1433	Optional<uint64_t> sliceMemEstimate = None;
				1434
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	1435	SmallVector<ComputationSliceState, 4> sliceStates;
				1436	sliceStates.resize(maxDstLoopDepth);
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	1437	// The best loop depth at which to materialize the slice.
				1438	Optional<unsigned> bestDstLoopDepth = None;
				1439
				1440	// Compute op instance count for the src loop nest without iteration slicing.
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	1441	uint64_t srcLoopNestCost =
				1442	getComputeCost(srcLoopIVs[0]->getInstruction(), &srcLoopNestStats,
				1443	/tripCountOverrideMap=/nullptr,
				1444	/computeCostMap=/nullptr);
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	1445
MLIR Team	b9dde91	2019-02-06 19:01:10	[diff] [blame]	1446	// Compute src loop nest write region size.
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	1447	MemRefRegion srcWriteRegion(srcStoreOpInst->getLoc());
				1448	srcWriteRegion.compute(srcStoreOpInst, /loopDepth=/0);
MLIR Team	b9dde91	2019-02-06 19:01:10	[diff] [blame]	1449	Optional<int64_t> maybeSrcWriteRegionSizeBytes =
				1450	srcWriteRegion.getRegionSize();
				1451	if (!maybeSrcWriteRegionSizeBytes.hasValue())
				1452	return false;
				1453	int64_t srcWriteRegionSizeBytes = maybeSrcWriteRegionSizeBytes.getValue();
				1454
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	1455	// Compute op instance count for the src loop nest.
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	1456	uint64_t dstLoopNestCost =
				1457	getComputeCost(dstLoopIVs[0]->getInstruction(), &dstLoopNestStats,
				1458	/tripCountOverrideMap=/nullptr,
				1459	/computeCostMap=/nullptr);
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	1460
MLIR Team	b9dde91	2019-02-06 19:01:10	[diff] [blame]	1461	// Evaluate all depth choices for materializing the slice in the destination
				1462	// loop nest.
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	1463	llvm::SmallDenseMap<Instruction *, uint64_t, 8> sliceTripCountMap;
				1464	DenseMap<Instruction *, int64_t> computeCostMap;
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	1465	for (unsigned i = maxDstLoopDepth; i >= 1; --i) {
				1466	MemRefAccess srcAccess(srcOpInst);
				1467	// Handle the common case of one dst load without a copy.
				1468	if (!mlir::getBackwardComputationSliceState(
MLIR Team	d7c8244	2019-01-30 23:53:41	[diff] [blame]	1469	srcAccess, MemRefAccess(dstLoadOpInsts[0]), i, &sliceStates[i - 1]))
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	1470	return false;
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	1471
MLIR Team	d7c8244	2019-01-30 23:53:41	[diff] [blame]	1472	// Compute the union of slice bound of all ops in 'dstLoadOpInsts'.
				1473	for (int j = 1, e = dstLoadOpInsts.size(); j < e; ++j) {
				1474	MemRefAccess dstAccess(dstLoadOpInsts[j]);
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	1475	ComputationSliceState tmpSliceState;
				1476	if (!mlir::getBackwardComputationSliceState(srcAccess, dstAccess, i,
				1477	&tmpSliceState))
				1478	return false;
				1479	// Compute slice boun dunion of 'tmpSliceState' and 'sliceStates[i - 1]'.
Uday Bondhugula	c1ca23e	2019-01-16 21:13:00	[diff] [blame]	1480	getSliceUnion(tmpSliceState, &sliceStates[i - 1]);
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	1481	}
Uday Bondhugula	b4a1443	2019-01-26 00:00:50	[diff] [blame]	1482	// Build trip count map for computation slice. We'll skip cases where the
				1483	// trip count was non-constant.
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	1484	sliceTripCountMap.clear();
				1485	if (!buildSliceTripCountMap(srcOpInst, &sliceStates[i - 1],
				1486	&sliceTripCountMap))
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	1487	continue;
				1488
				1489	// Checks whether a store to load forwarding will happen.
				1490	int64_t sliceIterationCount = getSliceIterationCount(sliceTripCountMap);
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	1491	assert(sliceIterationCount > 0);
Uday Bondhugula	b4a1443	2019-01-26 00:00:50	[diff] [blame]	1492	bool storeLoadFwdGuaranteed = (sliceIterationCount == 1);
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	1493
				1494	// Compute cost of fusion for this dest loop depth.
				1495
				1496	computeCostMap.clear();
				1497
				1498	// The store and loads to this memref will disappear.
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	1499	// TODO(andydavis) Add load coalescing to memref data flow opt pass.
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	1500	if (storeLoadFwdGuaranteed) {
				1501	// A single store disappears: -1 for that.
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	1502	computeCostMap[srcLoopIVs[numSrcLoopIVs - 1]->getInstruction()] = -1;
MLIR Team	d7c8244	2019-01-30 23:53:41	[diff] [blame]	1503	for (auto *loadOp : dstLoadOpInsts) {
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	1504	auto *parentInst = loadOp->getParentInst();
River Riddle	b499277	2019-02-04 18:38:47	[diff] [blame]	1505	if (parentInst && parentInst->isa<AffineForOp>())
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	1506	computeCostMap[parentInst] = -1;
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	1507	}
				1508	}
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	1509
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	1510	// Compute op instance count for the src loop nest with iteration slicing.
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	1511	int64_t sliceComputeCost =
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	1512	getComputeCost(srcLoopIVs[0]->getInstruction(), &srcLoopNestStats,
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	1513	/tripCountOverrideMap=/&sliceTripCountMap,
				1514	/computeCostMap=/&computeCostMap);
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	1515
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	1516	// Compute cost of fusion for this depth.
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	1517	computeCostMap[dstLoopIVs[i - 1]->getInstruction()] = sliceComputeCost;
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	1518
				1519	int64_t fusedLoopNestComputeCost =
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	1520	getComputeCost(dstLoopIVs[0]->getInstruction(), &dstLoopNestStats,
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	1521	/tripCountOverrideMap=/nullptr, &computeCostMap);
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	1522
				1523	double additionalComputeFraction =
				1524	fusedLoopNestComputeCost /
				1525	(static_cast<double>(srcLoopNestCost) + dstLoopNestCost) -
				1526	1;
				1527
MLIR Team	b9dde91	2019-02-06 19:01:10	[diff] [blame]	1528	// Compute what the slice write MemRefRegion would be, if the src loop
				1529	// nest slice 'sliceStates[i - 1]' were to be inserted into the dst loop
				1530	// nest at loop depth 'i'
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	1531	MemRefRegion sliceWriteRegion(srcStoreOpInst->getLoc());
				1532	sliceWriteRegion.compute(srcStoreOpInst, /loopDepth=/0,
				1533	&sliceStates[i - 1]);
MLIR Team	b9dde91	2019-02-06 19:01:10	[diff] [blame]	1534	Optional<int64_t> maybeSliceWriteRegionSizeBytes =
				1535	sliceWriteRegion.getRegionSize();
				1536	if (!maybeSliceWriteRegionSizeBytes.hasValue() \|\|
				1537	maybeSliceWriteRegionSizeBytes.getValue() == 0)
				1538	continue;
				1539	int64_t sliceWriteRegionSizeBytes =
				1540	maybeSliceWriteRegionSizeBytes.getValue();
				1541
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	1542	// If we are fusing for reuse, check that write regions remain the same.
				1543	// TODO(andydavis) Write region check should check sizes and offsets in
				1544	// each dimension, so that we are sure they are covering the same memref
				1545	// region. Also, move this out to a isMemRefRegionSuperSet helper function.
				1546	if (srcOpInst != srcStoreOpInst &&
				1547	sliceWriteRegionSizeBytes != srcWriteRegionSizeBytes)
				1548	continue;
				1549
MLIR Team	b9dde91	2019-02-06 19:01:10	[diff] [blame]	1550	double storageReduction = static_cast<double>(srcWriteRegionSizeBytes) /
				1551	static_cast<double>(sliceWriteRegionSizeBytes);
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	1552
Uday Bondhugula	06d21d9	2019-01-25 01:01:49	[diff] [blame]	1553	LLVM_DEBUG({
				1554	std::stringstream msg;
				1555	msg << " evaluating fusion profitability at depth : " << i << "\n"
Uday Bondhugula	d4b3ff1	2019-02-27 00:10:19	[diff] [blame]	1556	<< std::fixed << std::setprecision(2)
				1557	<< " additional compute fraction: "
Uday Bondhugula	06d21d9	2019-01-25 01:01:49	[diff] [blame]	1558	<< 100.0 * additionalComputeFraction << "%\n"
				1559	<< " storage reduction factor: " << storageReduction << "x\n"
				1560	<< " fused nest cost: " << fusedLoopNestComputeCost << "\n"
Uday Bondhugula	a1dad3a	2019-02-20 02:17:19	[diff] [blame]	1561	<< " slice iteration count: " << sliceIterationCount << "\n"
				1562	<< " src write region size: " << srcWriteRegionSizeBytes << "\n"
				1563	<< " slice write region size: " << sliceWriteRegionSizeBytes
				1564	<< "\n";
Uday Bondhugula	06d21d9	2019-01-25 01:01:49	[diff] [blame]	1565	llvm::dbgs() << msg.str();
				1566	});
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	1567
				1568	double computeToleranceThreshold =
				1569	clFusionAddlComputeTolerance.getNumOccurrences() > 0
				1570	? clFusionAddlComputeTolerance
				1571	: LoopFusion::kComputeToleranceThreshold;
				1572
				1573	// TODO(b/123247369): This is a placeholder cost model.
				1574	// Among all choices that add an acceptable amount of redundant computation
				1575	// (as per computeToleranceThreshold), we will simply pick the one that
				1576	// reduces the intermediary size the most.
				1577	if ((storageReduction > maxStorageReduction) &&
				1578	(clMaximalLoopFusion \|\|
				1579	(additionalComputeFraction < computeToleranceThreshold))) {
				1580	maxStorageReduction = storageReduction;
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	1581	bestDstLoopDepth = i;
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	1582	minFusedLoopNestComputeCost = fusedLoopNestComputeCost;
MLIR Team	b9dde91	2019-02-06 19:01:10	[diff] [blame]	1583	sliceMemEstimate = sliceWriteRegionSizeBytes;
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	1584	}
				1585	}
				1586
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	1587	// A simple cost model: fuse if it reduces the memory footprint. If
				1588	// -maximal-fusion is set, fuse nevertheless.
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	1589
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	1590	if (!clMaximalLoopFusion && !bestDstLoopDepth.hasValue()) {
Uday Bondhugula	a1dad3a	2019-02-20 02:17:19	[diff] [blame]	1591	LLVM_DEBUG(
				1592	llvm::dbgs()
				1593	<< "All fusion choices involve more than the threshold amount of "
				1594	"redundant computation; NOT fusing.\n");
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	1595	return false;
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	1596	}
				1597
				1598	assert(bestDstLoopDepth.hasValue() &&
				1599	"expected to have a value per logic above");
				1600
				1601	// Set dstLoopDepth based on best values from search.
				1602	*dstLoopDepth = bestDstLoopDepth.getValue();
				1603
				1604	LLVM_DEBUG(
Uday Bondhugula	06d21d9	2019-01-25 01:01:49	[diff] [blame]	1605	llvm::dbgs() << " LoopFusion fusion stats:"
				1606	<< "\n best loop depth: " << bestDstLoopDepth
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	1607	<< "\n src loop nest compute cost: " << srcLoopNestCost
				1608	<< "\n dst loop nest compute cost: " << dstLoopNestCost
				1609	<< "\n fused loop nest compute cost: "
				1610	<< minFusedLoopNestComputeCost << "\n");
				1611
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	1612	auto dstMemSize = getMemoryFootprintBytes(dstLoopIVs[0]);
				1613	auto srcMemSize = getMemoryFootprintBytes(srcLoopIVs[0]);
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	1614
				1615	Optional<double> storageReduction = None;
				1616
				1617	if (!clMaximalLoopFusion) {
				1618	if (!dstMemSize.hasValue() \|\| !srcMemSize.hasValue()) {
				1619	LLVM_DEBUG(
				1620	llvm::dbgs()
				1621	<< " fusion memory benefit cannot be evaluated; NOT fusing.\n");
				1622	return false;
				1623	}
				1624
				1625	auto srcMemSizeVal = srcMemSize.getValue();
				1626	auto dstMemSizeVal = dstMemSize.getValue();
				1627
				1628	assert(sliceMemEstimate.hasValue() && "expected value");
				1629	// This is an inaccurate estimate since sliceMemEstimate is inaccurate.
				1630	auto fusedMem = dstMemSizeVal + sliceMemEstimate.getValue();
				1631
				1632	LLVM_DEBUG(llvm::dbgs() << " src mem: " << srcMemSizeVal << "\n"
				1633	<< " dst mem: " << dstMemSizeVal << "\n"
				1634	<< " fused mem: " << fusedMem << "\n"
				1635	<< " slice mem: " << sliceMemEstimate << "\n");
				1636
				1637	if (fusedMem > srcMemSizeVal + dstMemSizeVal) {
				1638	LLVM_DEBUG(llvm::dbgs() << "Fusion is not profitable; NOT fusing.\n");
				1639	return false;
				1640	}
				1641	storageReduction =
				1642	100.0 *
				1643	(1.0 - fusedMem / (static_cast<double>(srcMemSizeVal) + dstMemSizeVal));
				1644	}
				1645
				1646	double additionalComputeFraction =
				1647	100.0 * (minFusedLoopNestComputeCost /
				1648	(static_cast<double>(srcLoopNestCost) + dstLoopNestCost) -
				1649	1);
MLIR Team	5c5739d	2019-01-25 06:27:40	[diff] [blame]	1650	(void)additionalComputeFraction;
Uday Bondhugula	06d21d9	2019-01-25 01:01:49	[diff] [blame]	1651	LLVM_DEBUG({
				1652	std::stringstream msg;
				1653	msg << " fusion is most profitable at depth " << *dstLoopDepth << " with "
MLIR Team	8564b27	2019-02-22 15:48:59	[diff] [blame]	1654	<< std::setprecision(2) << additionalComputeFraction
Uday Bondhugula	06d21d9	2019-01-25 01:01:49	[diff] [blame]	1655	<< "% redundant computation and a ";
				1656	msg << (storageReduction.hasValue()
				1657	? std::to_string(storageReduction.getValue())
				1658	: "<unknown>");
				1659	msg << "% storage reduction.\n";
				1660	llvm::dbgs() << msg.str();
				1661	});
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	1662
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	1663	// Update return parameter 'sliceState' with 'bestSliceState'.
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	1664	ComputationSliceState bestSliceState = &sliceStates[dstLoopDepth - 1];
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	1665	sliceState->lbs = bestSliceState->lbs;
				1666	sliceState->ubs = bestSliceState->ubs;
				1667	sliceState->lbOperands = bestSliceState->lbOperands;
				1668	sliceState->ubOperands = bestSliceState->ubOperands;
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	1669
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	1670	// Canonicalize slice bound affine maps.
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	1671	for (unsigned i = 0; i < numSrcLoopIVs; ++i) {
Nicolas Vasilache	0e7a8a9	2019-01-26 18:41:17	[diff] [blame]	1672	if (sliceState->lbs[i] != AffineMap()) {
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	1673	canonicalizeMapAndOperands(&sliceState->lbs[i],
				1674	&sliceState->lbOperands[i]);
				1675	}
Nicolas Vasilache	0e7a8a9	2019-01-26 18:41:17	[diff] [blame]	1676	if (sliceState->ubs[i] != AffineMap()) {
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	1677	canonicalizeMapAndOperands(&sliceState->ubs[i],
				1678	&sliceState->ubOperands[i]);
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	1679	}
				1680	}
				1681	return true;
				1682	}
				1683
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	1684	// GreedyFusion greedily fuses loop nests which have a producer/consumer or
				1685	// input-reuse relationship on a memref, with the goal of improving locality.
MLIR Team	f28e4df	2018-11-01 14:26:00	[diff] [blame]	1686	//
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	1687	// The steps of the producer-consumer fusion algorithm are as follows:
MLIR Team	3b69230	2018-12-17 17:57:14	[diff] [blame]	1688	//
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	1689	// *) A worklist is initialized with node ids from the dependence graph.
				1690	// *) For each node id in the worklist:
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	1691	// *) Pop an AffineForOp off the worklist. This 'dstAffineForOp' will be a
				1692	// candidate destination AffineForOp into which fusion will be attempted.
				1693	// *) Add each LoadOp currently in 'dstAffineForOp' into list 'dstLoadOps'.
MLIR Team	3b69230	2018-12-17 17:57:14	[diff] [blame]	1694	// *) For each LoadOp in 'dstLoadOps' do:
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	1695	// *) Lookup dependent loop nests which have a single store op to the same
				1696	// memref.
				1697	// *) Check if dependences would be violated by the fusion.
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	1698	// *) Get a computation slice of 'srcLoopNest', which adjusts its loop
MLIR Team	3b69230	2018-12-17 17:57:14	[diff] [blame]	1699	// bounds to be functions of 'dstLoopNest' IVs and symbols.
				1700	// *) Fuse the 'srcLoopNest' computation slice into the 'dstLoopNest',
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	1701	// at a loop depth determined by the cost model in 'isFusionProfitable'.
Chris Lattner	456ad6a	2018-12-29 00:05:35	[diff] [blame]	1702	// *) Add the newly fused load/store operation instructions to the state,
MLIR Team	3b69230	2018-12-17 17:57:14	[diff] [blame]	1703	// and also add newly fused load ops to 'dstLoadOps' to be considered
				1704	// as fusion dst load ops in another iteration.
				1705	// *) Remove old src loop nest and its associated state.
				1706	//
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	1707	// The steps of the input-reuse fusion algorithm are as follows:
				1708	//
				1709	// *) Initialize 'worklist' with node ids from the dependence graph.
				1710	// *) For each 'dstNode' in the worklist:
				1711	// *) Find a candidate sibling node 'sibNode' to fuse with 'dstNode' which
				1712	// loads from the same memref, but which has no dependence paths to/from.
				1713	// *) Get a computation slice of 'sibLoopNest', which adjusts its loop
				1714	// bounds to be functions of 'dstLoopNest' IVs and symbols.
				1715	// *) Fuse the 'sibLoopNest' computation slice into the 'dstLoopNest',
				1716	// at a loop depth determined by the cost model in 'isFusionProfitable'.
				1717	// This function also checks that the memref write region of 'sibLoopNest',
				1718	// is preserved in the fused loop nest.
				1719	// *) Update graph state to reflect the fusion of 'sibNode' into 'dstNode'.
				1720	//
Chris Lattner	456ad6a	2018-12-29 00:05:35	[diff] [blame]	1721	// Given a graph where top-level instructions are vertices in the set 'V' and
MLIR Team	3b69230	2018-12-17 17:57:14	[diff] [blame]	1722	// edges in the set 'E' are dependences between vertices, this algorithm
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	1723	// takes O(V) time for initialization, and has runtime O(V + E).
MLIR Team	3b69230	2018-12-17 17:57:14	[diff] [blame]	1724	//
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	1725	// This greedy algorithm is not 'maximal' due to the current restriction of
				1726	// fusing along single producer consumer edges, but there is a TODO to fix this.
MLIR Team	3b69230	2018-12-17 17:57:14	[diff] [blame]	1727	//
				1728	// TODO(andydavis) Experiment with other fusion policies.
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	1729	struct GreedyFusion {
				1730	public:
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	1731	// The data dependence graph to traverse during fusion.
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	1732	MemRefDependenceGraph *mdg;
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	1733	// Worklist of graph nodes visited during the fusion pass.
MLIR Team	a78edcd	2019-02-05 14:57:08	[diff] [blame]	1734	SmallVector<unsigned, 8> worklist;
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	1735	// Set of graph nodes which are present on the worklist.
MLIR Team	a78edcd	2019-02-05 14:57:08	[diff] [blame]	1736	llvm::SmallDenseSet<unsigned, 16> worklistSet;
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	1737	// Parameter for local buffer size threshold.
				1738	unsigned localBufSizeThreshold;
				1739	// Parameter for fast memory space.
				1740	Optional<unsigned> fastMemorySpace;
MLIR Team	f28e4df	2018-11-01 14:26:00	[diff] [blame]	1741
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	1742	using Node = MemRefDependenceGraph::Node;
				1743
				1744	GreedyFusion(MemRefDependenceGraph *mdg, unsigned localBufSizeThreshold,
				1745	Optional<unsigned> fastMemorySpace)
				1746	: mdg(mdg), localBufSizeThreshold(localBufSizeThreshold),
				1747	fastMemorySpace(fastMemorySpace) {}
				1748
				1749	// Initializes 'worklist' with nodes from 'mdg'
				1750	void init() {
MLIR Team	a78edcd	2019-02-05 14:57:08	[diff] [blame]	1751	// TODO(andydavis) Add a priority queue for prioritizing nodes by different
				1752	// metrics (e.g. arithmetic intensity/flops-to-bytes ratio).
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	1753	worklist.clear();
				1754	worklistSet.clear();
				1755	for (auto &idAndNode : mdg->nodes) {
				1756	const Node &node = idAndNode.second;
				1757	worklist.push_back(node.id);
				1758	worklistSet.insert(node.id);
				1759	}
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	1760	}
MLIR Team	3b69230	2018-12-17 17:57:14	[diff] [blame]	1761
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	1762	// Run the GreedyFusion pass.
				1763	// *) First pass through the nodes fuses single-use producer nodes into their
				1764	// unique consumer.
				1765	// *) Second pass fuses sibling nodes which share no dependence edges.
				1766	// *) Third pass fuses any remaining producer nodes into their users.
				1767	void run() {
				1768	fuseProducerConsumerNodes(/maxSrcUserCount=/1);
				1769	fuseSiblingNodes();
				1770	fuseProducerConsumerNodes(
				1771	/maxSrcUserCount=/std::numeric_limits<unsigned>::max());
				1772	eraseUnusedMemRefAllocations();
				1773	}
				1774
				1775	void fuseProducerConsumerNodes(unsigned maxSrcUserCount) {
				1776	init();
MLIR Team	3b69230	2018-12-17 17:57:14	[diff] [blame]	1777	while (!worklist.empty()) {
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	1778	unsigned dstId = worklist.back();
MLIR Team	3b69230	2018-12-17 17:57:14	[diff] [blame]	1779	worklist.pop_back();
MLIR Team	a78edcd	2019-02-05 14:57:08	[diff] [blame]	1780	worklistSet.erase(dstId);
				1781
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	1782	// Skip if this node was removed (fused into another node).
				1783	if (mdg->nodes.count(dstId) == 0)
MLIR Team	3b69230	2018-12-17 17:57:14	[diff] [blame]	1784	continue;
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	1785	// Get 'dstNode' into which to attempt fusion.
				1786	auto *dstNode = mdg->getNode(dstId);
				1787	// Skip if 'dstNode' is not a loop nest.
River Riddle	b499277	2019-02-04 18:38:47	[diff] [blame]	1788	if (!dstNode->inst->isa<AffineForOp>())
MLIR Team	3b69230	2018-12-17 17:57:14	[diff] [blame]	1789	continue;
MLIR Team	8f5f2c7	2019-02-15 17:32:18	[diff] [blame]	1790	// Sink sequential loops in 'dstNode' (and thus raise parallel loops)
				1791	// while preserving relative order. This can increase the maximum loop
				1792	// depth at which we can fuse a slice of a producer loop nest into a
				1793	// consumer loop nest.
				1794	sinkSequentialLoops(dstNode);
MLIR Team	3b69230	2018-12-17 17:57:14	[diff] [blame]	1795
River Riddle	b499277	2019-02-04 18:38:47	[diff] [blame]	1796	SmallVector<Instruction *, 4> loads = dstNode->loads;
				1797	SmallVector<Instruction *, 4> dstLoadOpInsts;
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	1798	DenseSet<Value *> visitedMemrefs;
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	1799	while (!loads.empty()) {
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	1800	// Get memref of load on top of the stack.
				1801	auto *memref = loads.back()->cast<LoadOp>()->getMemRef();
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	1802	if (visitedMemrefs.count(memref) > 0)
				1803	continue;
				1804	visitedMemrefs.insert(memref);
MLIR Team	27d067e	2019-01-16 17:55:02	[diff] [blame]	1805	// Move all loads in 'loads' accessing 'memref' to 'dstLoadOpInsts'.
				1806	moveLoadsAccessingMemrefTo(memref, &loads, &dstLoadOpInsts);
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	1807	// Skip if no input edges along which to fuse.
				1808	if (mdg->inEdges.count(dstId) == 0)
MLIR Team	3b69230	2018-12-17 17:57:14	[diff] [blame]	1809	continue;
MLIR Team	1e85191	2019-01-31 00:01:46	[diff] [blame]	1810	// Iterate through in edges for 'dstId' and src node id for any
				1811	// edges on 'memref'.
				1812	SmallVector<unsigned, 2> srcNodeIds;
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	1813	for (auto &srcEdge : mdg->inEdges[dstId]) {
				1814	// Skip 'srcEdge' if not for 'memref'.
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	1815	if (srcEdge.value != memref)
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	1816	continue;
MLIR Team	1e85191	2019-01-31 00:01:46	[diff] [blame]	1817	srcNodeIds.push_back(srcEdge.id);
				1818	}
				1819	for (unsigned srcId : srcNodeIds) {
				1820	// Skip if this node was removed (fused into another node).
				1821	if (mdg->nodes.count(srcId) == 0)
				1822	continue;
				1823	// Get 'srcNode' from which to attempt fusion into 'dstNode'.
				1824	auto *srcNode = mdg->getNode(srcId);
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	1825	// Skip if 'srcNode' is not a loop nest.
River Riddle	b499277	2019-02-04 18:38:47	[diff] [blame]	1826	if (!srcNode->inst->isa<AffineForOp>())
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	1827	continue;
MLIR Team	b28009b	2019-01-23 19:11:43	[diff] [blame]	1828	// Skip if 'srcNode' has more than one store to any memref.
				1829	// TODO(andydavis) Support fusing multi-output src loop nests.
				1830	if (srcNode->stores.size() != 1)
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	1831	continue;
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	1832
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	1833	// Skip 'srcNode' if it has in edges on 'memref'.
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	1834	// TODO(andydavis) Track dependence type with edges, and just check
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	1835	// for WAW dependence edge here. Note that this check is overly
				1836	// conservative and will be removed in the future.
				1837	if (mdg->getIncomingMemRefAccesses(srcNode->id, memref) != 0)
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	1838	continue;
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	1839
MLIR Team	58aa383	2019-02-16 01:12:19	[diff] [blame]	1840	// Skip if 'srcNode' writes to any live in or escaping memrefs,
				1841	// and cannot be fused.
				1842	bool writesToLiveInOrOut =
				1843	mdg->writesToLiveInOrEscapingMemrefs(srcNode->id);
				1844	if (writesToLiveInOrOut &&
				1845	!canFuseSrcWhichWritesToLiveOut(srcId, dstId, memref, mdg))
MLIR Team	d7c8244	2019-01-30 23:53:41	[diff] [blame]	1846	continue;
				1847
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	1848	// Skip if 'srcNode' out edge count on 'memref' > 'maxSrcUserCount'.
				1849	if (mdg->getOutEdgeCount(srcNode->id, memref) > maxSrcUserCount)
				1850	continue;
				1851
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	1852	// Compute an instruction list insertion point for the fused loop
				1853	// nest which preserves dependences.
MLIR Team	a78edcd	2019-02-05 14:57:08	[diff] [blame]	1854	Instruction *insertPointInst =
				1855	mdg->getFusedLoopNestInsertionPoint(srcNode->id, dstNode->id);
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	1856	if (insertPointInst == nullptr)
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	1857	continue;
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	1858
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	1859	// Get unique 'srcNode' store op.
Chris Lattner	456ad6a	2018-12-29 00:05:35	[diff] [blame]	1860	auto *srcStoreOpInst = srcNode->stores.front();
MLIR Team	d7c8244	2019-01-30 23:53:41	[diff] [blame]	1861	// Gather 'dstNode' store ops to 'memref'.
River Riddle	b499277	2019-02-04 18:38:47	[diff] [blame]	1862	SmallVector<Instruction *, 2> dstStoreOpInsts;
MLIR Team	d7c8244	2019-01-30 23:53:41	[diff] [blame]	1863	for (auto *storeOpInst : dstNode->stores)
				1864	if (storeOpInst->cast<StoreOp>()->getMemRef() == memref)
				1865	dstStoreOpInsts.push_back(storeOpInst);
				1866
Uday Bondhugula	b4a1443	2019-01-26 00:00:50	[diff] [blame]	1867	unsigned bestDstLoopDepth;
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	1868	mlir::ComputationSliceState sliceState;
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	1869	// Check if fusion would be profitable.
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	1870	if (!isFusionProfitable(srcStoreOpInst, srcStoreOpInst,
				1871	dstLoadOpInsts, dstStoreOpInsts, &sliceState,
Uday Bondhugula	b4a1443	2019-01-26 00:00:50	[diff] [blame]	1872	&bestDstLoopDepth))
MLIR Team	38c2fe3	2019-01-14 19:26:25	[diff] [blame]	1873	continue;
Uday Bondhugula	864d9e0	2019-01-23 17:16:24	[diff] [blame]	1874
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	1875	// Fuse computation slice of 'srcLoopNest' into 'dstLoopNest'.
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	1876	auto sliceLoopNest = mlir::insertBackwardComputationSlice(
Uday Bondhugula	b4a1443	2019-01-26 00:00:50	[diff] [blame]	1877	srcStoreOpInst, dstLoadOpInsts[0], bestDstLoopDepth, &sliceState);
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	1878	if (sliceLoopNest != nullptr) {
Uday Bondhugula	a1dad3a	2019-02-20 02:17:19	[diff] [blame]	1879	LLVM_DEBUG(llvm::dbgs()
				1880	<< "\tslice loop nest:\n"
				1881	<< *sliceLoopNest->getInstruction() << "\n");
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	1882	// Move 'dstAffineForOp' before 'insertPointInst' if needed.
River Riddle	b499277	2019-02-04 18:38:47	[diff] [blame]	1883	auto dstAffineForOp = dstNode->inst->cast<AffineForOp>();
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	1884	if (insertPointInst != dstAffineForOp->getInstruction()) {
				1885	dstAffineForOp->getInstruction()->moveBefore(insertPointInst);
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	1886	}
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	1887	// Update edges between 'srcNode' and 'dstNode'.
MLIR Team	a0f3db40	2019-01-29 17:36:41	[diff] [blame]	1888	mdg->updateEdges(srcNode->id, dstNode->id, memref);
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	1889
				1890	// Collect slice loop stats.
				1891	LoopNestStateCollector sliceCollector;
River Riddle	bf9c381	2019-02-05 00:24:44	[diff] [blame]	1892	sliceCollector.collect(sliceLoopNest->getInstruction());
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	1893	// Promote single iteration slice loops to single IV value.
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	1894	for (auto forOp : sliceCollector.forOps) {
				1895	promoteIfSingleIteration(forOp);
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	1896	}
MLIR Team	58aa383	2019-02-16 01:12:19	[diff] [blame]	1897	if (!writesToLiveInOrOut) {
				1898	// Create private memref for 'memref' in 'dstAffineForOp'.
				1899	SmallVector<Instruction *, 4> storesForMemref;
				1900	for (auto *storeOpInst : sliceCollector.storeOpInsts) {
				1901	if (storeOpInst->cast<StoreOp>()->getMemRef() == memref)
				1902	storesForMemref.push_back(storeOpInst);
				1903	}
				1904	assert(storesForMemref.size() == 1);
				1905	auto *newMemRef = createPrivateMemRef(
				1906	dstAffineForOp, storesForMemref[0], bestDstLoopDepth,
				1907	fastMemorySpace, localBufSizeThreshold);
				1908	visitedMemrefs.insert(newMemRef);
				1909	// Create new node in dependence graph for 'newMemRef' alloc op.
				1910	unsigned newMemRefNodeId =
				1911	mdg->addNode(newMemRef->getDefiningInst());
				1912	// Add edge from 'newMemRef' node to dstNode.
				1913	mdg->addEdge(newMemRefNodeId, dstId, newMemRef);
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	1914	}
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	1915
				1916	// Collect dst loop stats after memref privatizaton transformation.
				1917	LoopNestStateCollector dstLoopCollector;
River Riddle	bf9c381	2019-02-05 00:24:44	[diff] [blame]	1918	dstLoopCollector.collect(dstAffineForOp->getInstruction());
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	1919
				1920	// Add new load ops to current Node load op list 'loads' to
				1921	// continue fusing based on new operands.
				1922	for (auto *loadOpInst : dstLoopCollector.loadOpInsts) {
				1923	auto *loadMemRef = loadOpInst->cast<LoadOp>()->getMemRef();
				1924	if (visitedMemrefs.count(loadMemRef) == 0)
				1925	loads.push_back(loadOpInst);
				1926	}
				1927
				1928	// Clear and add back loads and stores
				1929	mdg->clearNodeLoadAndStores(dstNode->id);
				1930	mdg->addToNode(dstId, dstLoopCollector.loadOpInsts,
				1931	dstLoopCollector.storeOpInsts);
MLIR Team	71495d5	2019-01-22 21:23:37	[diff] [blame]	1932	// Remove old src loop nest if it no longer has outgoing dependence
				1933	// edges, and it does not write to a memref which escapes the
MLIR Team	58aa383	2019-02-16 01:12:19	[diff] [blame]	1934	// function. If 'writesToLiveInOrOut' is true, then 'srcNode' has
				1935	// been fused into 'dstNode' and write region of 'dstNode' covers
				1936	// the write region of 'srcNode', and 'srcNode' has no other users
				1937	// so it is safe to remove.
				1938	if (writesToLiveInOrOut \|\| mdg->canRemoveNode(srcNode->id)) {
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	1939	mdg->removeNode(srcNode->id);
River Riddle	5052bd8	2019-02-02 00:42:18	[diff] [blame]	1940	srcNode->inst->erase();
MLIR Team	a78edcd	2019-02-05 14:57:08	[diff] [blame]	1941	} else {
				1942	// Add remaining users of 'oldMemRef' back on the worklist (if not
				1943	// already there), as its replacement with a local/private memref
				1944	// has reduced dependences on 'oldMemRef' which may have created
				1945	// new fusion opportunities.
				1946	if (mdg->outEdges.count(srcNode->id) > 0) {
				1947	SmallVector<MemRefDependenceGraph::Edge, 2> oldOutEdges =
				1948	mdg->outEdges[srcNode->id];
				1949	for (auto &outEdge : oldOutEdges) {
				1950	if (outEdge.value == memref &&
				1951	worklistSet.count(outEdge.id) == 0) {
				1952	worklist.push_back(outEdge.id);
				1953	worklistSet.insert(outEdge.id);
				1954	}
				1955	}
				1956	}
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	1957	}
MLIR Team	3b69230	2018-12-17 17:57:14	[diff] [blame]	1958	}
MLIR Team	3b69230	2018-12-17 17:57:14	[diff] [blame]	1959	}
				1960	}
				1961	}
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	1962	}
				1963
				1964	// Visits each node in the graph, and for each node, attempts to fuse it with
				1965	// its sibling nodes (nodes which share a parent, but no dependence edges).
				1966	void fuseSiblingNodes() {
				1967	init();
				1968	while (!worklist.empty()) {
				1969	unsigned dstId = worklist.back();
				1970	worklist.pop_back();
				1971	worklistSet.erase(dstId);
				1972
				1973	// Skip if this node was removed (fused into another node).
				1974	if (mdg->nodes.count(dstId) == 0)
				1975	continue;
				1976	// Get 'dstNode' into which to attempt fusion.
				1977	auto *dstNode = mdg->getNode(dstId);
				1978	// Skip if 'dstNode' is not a loop nest.
				1979	if (!dstNode->inst->isa<AffineForOp>())
				1980	continue;
				1981	// Attempt to fuse 'dstNode' with its sibling nodes in the graph.
				1982	fuseWithSiblingNodes(dstNode);
				1983	}
				1984	}
				1985
				1986	// Attempt to fuse 'dstNode' with sibling nodes in the graph.
				1987	void fuseWithSiblingNodes(Node *dstNode) {
				1988	DenseSet<unsigned> visitedSibNodeIds;
				1989	std::pair<unsigned, Value *> idAndMemref;
				1990	while (findSiblingNodeToFuse(dstNode, &visitedSibNodeIds, &idAndMemref)) {
				1991	unsigned sibId = idAndMemref.first;
				1992	Value *memref = idAndMemref.second;
				1993	// TODO(andydavis) Check that 'sibStoreOpInst' post-dominates all other
				1994	// stores to the same memref in 'sibNode' loop nest.
				1995	auto *sibNode = mdg->getNode(sibId);
				1996	// Compute an instruction list insertion point for the fused loop
				1997	// nest which preserves dependences.
				1998	assert(sibNode->inst->getBlock() == dstNode->inst->getBlock());
				1999	Instruction *insertPointInst =
				2000	sibNode->inst->isBeforeInBlock(dstNode->inst)
				2001	? mdg->getFusedLoopNestInsertionPoint(sibNode->id, dstNode->id)
				2002	: mdg->getFusedLoopNestInsertionPoint(dstNode->id, sibNode->id);
				2003	if (insertPointInst == nullptr)
				2004	continue;
				2005
				2006	// Check if fusion would be profitable and at what depth.
				2007
				2008	// Get unique 'sibNode' load op to 'memref'.
				2009	SmallVector<Instruction *, 2> sibLoadOpInsts;
				2010	sibNode->getLoadOpsForMemref(memref, &sibLoadOpInsts);
				2011	// Currently findSiblingNodeToFuse searches for siblings with one load.
				2012	assert(sibLoadOpInsts.size() == 1);
				2013	Instruction *sibLoadOpInst = sibLoadOpInsts[0];
				2014	assert(!sibNode->stores.empty());
				2015	// TODO(andydavis) Choose the store which postdominates all other stores.
				2016	auto *sibStoreOpInst = sibNode->stores.back();
				2017
				2018	// Gather 'dstNode' load ops to 'memref'.
				2019	SmallVector<Instruction *, 2> dstLoadOpInsts;
				2020	dstNode->getLoadOpsForMemref(memref, &dstLoadOpInsts);
				2021
				2022	// Gather 'dstNode' store ops to 'memref'.
				2023	SmallVector<Instruction *, 2> dstStoreOpInsts;
				2024	dstNode->getStoreOpsForMemref(memref, &dstStoreOpInsts);
				2025
				2026	unsigned bestDstLoopDepth;
				2027	mlir::ComputationSliceState sliceState;
				2028
				2029	// Check if fusion would be profitable.
				2030	if (!isFusionProfitable(sibLoadOpInst, sibStoreOpInst, dstLoadOpInsts,
				2031	dstStoreOpInsts, &sliceState, &bestDstLoopDepth))
				2032	continue;
				2033
				2034	// Fuse computation slice of 'sibLoopNest' into 'dstLoopNest'.
				2035	auto sliceLoopNest = mlir::insertBackwardComputationSlice(
				2036	sibLoadOpInst, dstLoadOpInsts[0], bestDstLoopDepth, &sliceState);
				2037	if (sliceLoopNest != nullptr) {
				2038	auto dstForInst = dstNode->inst->cast<AffineForOp>();
				2039	// Update instruction position of fused loop nest (if needed).
				2040	if (insertPointInst != dstForInst->getInstruction()) {
				2041	dstForInst->getInstruction()->moveBefore(insertPointInst);
				2042	}
				2043	// Update data dependence graph state post fusion.
				2044	updateStateAfterSiblingFusion(sliceLoopNest, sibNode, dstNode);
				2045	}
				2046	}
				2047	}
				2048
				2049	// Searches the graph from 'dstNode' looking for a fusion candidate sibling
				2050	// node which shares no dependences with 'dstNode' but which loads from the
				2051	// same memref. Returns true and sets 'idAndMemrefToFuse' on success. Returns
				2052	// false otherwise.
				2053	bool findSiblingNodeToFuse(Node *dstNode,
				2054	DenseSet<unsigned> *visitedSibNodeIds,
				2055	std::pair<unsigned, Value > idAndMemrefToFuse) {
				2056	// TODO(andydavis) Currently we discover siblings by following edges
				2057	// through an intermediate src node. We should also consider siblings
				2058	// which load from the same memref, but which do not necessarily share
				2059	// a src node parent (e.g. loading from a memref which is a function arg).
				2060	// Collect candidate 'dstNode' input edges in 'inEdges'.
				2061	SmallVector<MemRefDependenceGraph::Edge, 2> inEdges;
				2062	mdg->forEachMemRefInputEdge(
				2063	dstNode->id, [&](MemRefDependenceGraph::Edge inEdge) {
				2064	// Add 'inEdge' if it is a read-after-write dependence.
				2065	if (dstNode->getLoadOpCount(inEdge.value) > 0 &&
				2066	mdg->getNode(inEdge.id)->getStoreOpCount(inEdge.value) > 0)
				2067	inEdges.push_back(inEdge);
				2068	});
				2069
				2070	// Search for sibling nodes to fuse by visiting output edges from each input
				2071	// edge in 'inEdges'.
				2072	for (auto &inEdge : inEdges) {
				2073	// Collect candidate output edges from each node 'inEdge.id' in 'inEdges'.
				2074	SmallVector<MemRefDependenceGraph::Edge, 2> outEdges;
				2075	mdg->forEachMemRefOutputEdge(
				2076	inEdge.id, [&](MemRefDependenceGraph::Edge outEdge) {
				2077	unsigned sibNodeId = outEdge.id;
				2078	if (visitedSibNodeIds->count(sibNodeId) > 0)
				2079	return;
				2080	// Skip output edge if not a sibling using the same memref.
				2081	if (outEdge.id == dstNode->id \|\| outEdge.value != inEdge.value)
				2082	return;
				2083	auto *sibNode = mdg->getNode(sibNodeId);
				2084	if (!sibNode->inst->isa<AffineForOp>())
				2085	return;
				2086	// Skip if 'outEdge' is not a read-after-write dependence.
				2087	// TODO(andydavis) Remove restrict to single load op restriction.
				2088	if (sibNode->getLoadOpCount(inEdge.value) != 1)
				2089	return;
				2090	// Skip if there exists a path of dependent edges between
				2091	// 'sibNode' and 'dstNode'.
				2092	if (mdg->hasDependencePath(sibNodeId, dstNode->id) \|\|
				2093	mdg->hasDependencePath(dstNode->id, sibNodeId))
				2094	return;
				2095	// Skip sib node if it loads to (and stores from) the same memref on
				2096	// which it also has an input dependence edge.
				2097	DenseSet<Value *> loadAndStoreMemrefSet;
				2098	sibNode->getLoadAndStoreMemrefSet(&loadAndStoreMemrefSet);
				2099	if (llvm::any_of(loadAndStoreMemrefSet, [=](Value *memref) {
				2100	return mdg->getIncomingMemRefAccesses(sibNode->id, memref) >
				2101	0;
				2102	}))
				2103	return;
				2104	// Check that all stores are to the same memref.
				2105	DenseSet<Value *> storeMemrefs;
				2106	for (auto *storeOpInst : sibNode->stores) {
				2107	storeMemrefs.insert(storeOpInst->cast<StoreOp>()->getMemRef());
				2108	}
				2109	if (storeMemrefs.size() != 1)
				2110	return;
				2111	// Add candidate 'outEdge' to sibling node.
				2112	outEdges.push_back(outEdge);
				2113	});
				2114
				2115	// Add first candidate if any were returned.
				2116	if (!outEdges.empty()) {
				2117	visitedSibNodeIds->insert(outEdges[0].id);
				2118	idAndMemrefToFuse->first = outEdges[0].id;
				2119	idAndMemrefToFuse->second = outEdges[0].value;
				2120	return true;
				2121	}
				2122	}
				2123	return false;
				2124	}
				2125
				2126	void updateStateAfterSiblingFusion(OpPointer<AffineForOp> sliceLoopNest,
				2127	Node sibNode, Node dstNode) {
				2128	// Update 'sibNode' and 'dstNode' input/output edges to reflect fusion.
				2129	mdg->updateEdges(sibNode->id, dstNode->id);
				2130
				2131	// Collect slice loop stats.
				2132	LoopNestStateCollector sliceCollector;
				2133	sliceCollector.collect(sliceLoopNest->getInstruction());
				2134	// Promote single iteration slice loops to single IV value.
				2135	for (auto forOp : sliceCollector.forOps) {
				2136	promoteIfSingleIteration(forOp);
				2137	}
				2138
				2139	// Collect dst loop stats after memref privatizaton transformation.
				2140	auto dstForInst = dstNode->inst->cast<AffineForOp>();
				2141	LoopNestStateCollector dstLoopCollector;
				2142	dstLoopCollector.collect(dstForInst->getInstruction());
				2143	// Clear and add back loads and stores
				2144	mdg->clearNodeLoadAndStores(dstNode->id);
				2145	mdg->addToNode(dstNode->id, dstLoopCollector.loadOpInsts,
				2146	dstLoopCollector.storeOpInsts);
				2147	// Remove old sibling loop nest if it no longer has outgoing dependence
				2148	// edges, and it does not write to a memref which escapes the
				2149	// function.
				2150	if (mdg->getOutEdgeCount(sibNode->id) == 0) {
				2151	mdg->removeNode(sibNode->id);
				2152	sibNode->inst->cast<AffineForOp>()->erase();
				2153	}
				2154	}
				2155
				2156	// Clean up any allocs with no users.
				2157	void eraseUnusedMemRefAllocations() {
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	2158	for (auto &pair : mdg->memrefEdgeCount) {
				2159	if (pair.second > 0)
				2160	continue;
				2161	auto *memref = pair.first;
MLIR Team	71495d5	2019-01-22 21:23:37	[diff] [blame]	2162	// Skip if there exist other uses (return instruction or function calls).
				2163	if (!memref->use_empty())
				2164	continue;
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	2165	// Use list expected to match the dep graph info.
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	2166	auto *inst = memref->getDefiningInst();
River Riddle	b499277	2019-02-04 18:38:47	[diff] [blame]	2167	if (inst && inst->isa<AllocOp>())
				2168	inst->erase();
MLIR Team	c4237ae	2019-01-18 16:56:27	[diff] [blame]	2169	}
MLIR Team	f28e4df	2018-11-01 14:26:00	[diff] [blame]	2170	}
MLIR Team	3b69230	2018-12-17 17:57:14	[diff] [blame]	2171	};
				2172
				2173	} // end anonymous namespace
MLIR Team	f28e4df	2018-11-01 14:26:00	[diff] [blame]	2174
River Riddle	ed5fe20	2019-02-28 22:50:42	[diff] [blame]	2175	void LoopFusion::runOnFunction() {
Uday Bondhugula	d4b3ff1	2019-02-27 00:10:19	[diff] [blame]	2176	// Override if a command line argument was provided.
Uday Bondhugula	8be2627	2019-02-02 01:06:22	[diff] [blame]	2177	if (clFusionFastMemorySpace.getNumOccurrences() > 0) {
				2178	fastMemorySpace = clFusionFastMemorySpace.getValue();
				2179	}
				2180
Uday Bondhugula	d4b3ff1	2019-02-27 00:10:19	[diff] [blame]	2181	// Override if a command line argument was provided.
				2182	if (clFusionLocalBufThreshold.getNumOccurrences() > 0) {
				2183	localBufSizeThreshold = clFusionLocalBufThreshold * 1024;
				2184	}
				2185
MLIR Team	6892ffb	2018-12-20 04:42:55	[diff] [blame]	2186	MemRefDependenceGraph g;
River Riddle	c6c5344	2019-02-27 18:59:29	[diff] [blame]	2187	if (g.init(&getFunction()))
MLIR Team	d038e34	2019-03-01 19:50:25	[diff] [blame]	2188	GreedyFusion(&g, localBufSizeThreshold, fastMemorySpace).run();
MLIR Team	f28e4df	2018-11-01 14:26:00	[diff] [blame]	2189	}
Jacques Pienaar	6f0fb22	2018-11-07 02:34:18	[diff] [blame]	2190
				// Static registration object for the LoopFusion pass; "loop-fusion" is
				// presumably the command-line name of the pass and "Fuse loop nests" its
				// description (confirm against the PassRegistration declaration).
				2191	static PassRegistration<LoopFusion> pass("loop-fusion", "Fuse loop nests");