[BOLT] Increase coverage of shrink wrapping [1/5]
Change how the function score is calculated and provide more detailed
statistics when reporting frame optimizer and shrink wrapping results.
These new statistics include dynamic coverage numbers. The main metric
for shrink wrapping is the number of executed stores that were saved
because of shrink wrapping (push instructions that were either moved
entirely out of the hot block or converted to a stack adjustment
instruction). There are still a number of reduced load instructions
(pops) that we are not counting at the moment. Also update the alloc
combiner and the frame optimizer to report dynamic numbers. A sketch of
the new score computation follows.
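
For illustration, here is a minimal standalone sketch of the new score
computation, using simplified stand-in types (BlockProfile and
computeFunctionScore are hypothetical names; the in-tree change is in
BinaryFunction::getFunctionScore() in the diff below):

  #include <cstdint>
  #include <limits>
  #include <vector>

  // Stand-in for a profiled basic block (the real pass iterates over
  // BOLT's BinaryBasicBlock).
  struct BlockProfile {
    uint64_t ExecCount;     // profiled execution count
    uint64_t NumNonPseudos; // instructions, excluding pseudos (CFIs etc.)
  };

  static constexpr uint64_t CountNoProfile =
      std::numeric_limits<uint64_t>::max();

  // Old score: sum of block execution counts.
  // New score: dynamic instruction count -- each block is weighted by
  // the number of non-pseudo instructions it contains.
  uint64_t computeFunctionScore(const std::vector<BlockProfile> &Blocks) {
    uint64_t TotalScore = 0;
    for (const BlockProfile &BB : Blocks) {
      if (BB.ExecCount == CountNoProfile)
        continue;
      TotalScore += BB.ExecCount * BB.NumNonPseudos;
    }
    return TotalScore;
  }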
For debugging purposes, we also print a list of the top 10 functions
optimized by shrink wrapping; the bookkeeping boils down to a sorted
insert, sketched after this paragraph. These changes are aimed at better
understanding the impact of shrink wrapping on a given binary.
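
A minimal sketch of that top-10 selection, keeping the list sorted in
descending order via std::lower_bound with an inverted comparator
(Entry and logTopTen are hypothetical stand-ins; the in-tree version is
the LogFunc lambda in FrameOptimizer.cpp below, which stores
BinaryFunction pointers instead of strings):

  #include <algorithm>
  #include <cstdint>
  #include <string>
  #include <utility>
  #include <vector>

  using Entry = std::pair<uint64_t, std::string>;

  // Keep at most the 10 highest-count entries, sorted descending.
  void logTopTen(std::vector<Entry> &Top, uint64_t Count,
                 const std::string &Name) {
    // First position whose count is <= Count, preserving descending order.
    auto Lower = std::lower_bound(
        Top.begin(), Top.end(), Count,
        [](const Entry &Elmt, uint64_t Value) { return Elmt.first > Value; });
    // The candidate is smaller than everything we already keep.
    if (Lower == Top.end() && Top.size() >= 10)
      return;
    Top.insert(Lower, {Count, Name});
    if (Top.size() > 10)
      Top.resize(10);
  }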
We also remove an assertion in dataflow analysis and instead bail out
early on empty functions, for which running the analysis makes no sense.
Reviewed By: Amir
Differential Revision: https://reviews.llvm.org/D126111
diff --git a/bolt/include/bolt/Passes/AllocCombiner.h b/bolt/include/bolt/Passes/AllocCombiner.h
index 21f1aa7..44c6fdd 100644
--- a/bolt/include/bolt/Passes/AllocCombiner.h
+++ b/bolt/include/bolt/Passes/AllocCombiner.h
@@ -17,6 +17,7 @@
class AllocCombinerPass : public BinaryFunctionPass {
/// Stats aggregating variables
uint64_t NumCombined{0};
+ uint64_t DynamicCountCombined{0};
DenseSet<const BinaryFunction *> FuncsChanged;
void combineAdjustments(BinaryFunction &BF);
diff --git a/bolt/include/bolt/Passes/DataflowAnalysis.h b/bolt/include/bolt/Passes/DataflowAnalysis.h
index 0b68cc2..1605b3f 100644
--- a/bolt/include/bolt/Passes/DataflowAnalysis.h
+++ b/bolt/include/bolt/Passes/DataflowAnalysis.h
@@ -315,6 +315,8 @@
void run() {
derived().preflight();
+ if (Func.begin() == Func.end())
+ return;
// Initialize state for all points of the function
for (BinaryBasicBlock &BB : Func) {
StateTy &St = getOrCreateStateAt(BB);
@@ -324,7 +326,6 @@
St = derived().getStartingStateAtPoint(Inst);
}
}
- assert(Func.begin() != Func.end() && "Unexpected empty function");
std::queue<BinaryBasicBlock *> Worklist;
// TODO: Pushing this in a DFS ordering will greatly speed up the dataflow
diff --git a/bolt/include/bolt/Passes/FrameAnalysis.h b/bolt/include/bolt/Passes/FrameAnalysis.h
index ec44694..4cc758d 100644
--- a/bolt/include/bolt/Passes/FrameAnalysis.h
+++ b/bolt/include/bolt/Passes/FrameAnalysis.h
@@ -130,7 +130,6 @@
/// Analysis stats counters
uint64_t NumFunctionsNotOptimized{0};
uint64_t NumFunctionsFailedRestoreFI{0};
- uint64_t CountFunctionsNotOptimized{0};
uint64_t CountFunctionsFailedRestoreFI{0};
uint64_t CountDenominator{0};
diff --git a/bolt/include/bolt/Passes/FrameOptimizer.h b/bolt/include/bolt/Passes/FrameOptimizer.h
index 8cce91d..310bebf 100644
--- a/bolt/include/bolt/Passes/FrameOptimizer.h
+++ b/bolt/include/bolt/Passes/FrameOptimizer.h
@@ -77,9 +77,12 @@
/// Stats aggregating variables
uint64_t NumRedundantLoads{0};
uint64_t NumRedundantStores{0};
- uint64_t NumLoadsChangedToReg{0};
- uint64_t NumLoadsChangedToImm{0};
+ uint64_t FreqRedundantLoads{0};
+ uint64_t FreqRedundantStores{0};
+ uint64_t FreqLoadsChangedToReg{0};
+ uint64_t FreqLoadsChangedToImm{0};
uint64_t NumLoadsDeleted{0};
+ uint64_t FreqLoadsDeleted{0};
DenseSet<const BinaryFunction *> FuncsChanged;
diff --git a/bolt/include/bolt/Passes/ShrinkWrapping.h b/bolt/include/bolt/Passes/ShrinkWrapping.h
index e8bd093..457c9e5 100644
--- a/bolt/include/bolt/Passes/ShrinkWrapping.h
+++ b/bolt/include/bolt/Passes/ShrinkWrapping.h
@@ -310,6 +310,10 @@
/// Pass stats
static std::atomic_uint64_t SpillsMovedRegularMode;
static std::atomic_uint64_t SpillsMovedPushPopMode;
+ static std::atomic_uint64_t SpillsMovedDynamicCount;
+ static std::atomic_uint64_t SpillsFailedDynamicCount;
+ static std::atomic_uint64_t InstrDynamicCount;
+ static std::atomic_uint64_t StoreDynamicCount;
Optional<unsigned> AnnotationIndex;
@@ -515,7 +519,7 @@
BC.MIB->removeAnnotation(Inst, getAnnotationIndex());
}
- bool perform();
+ bool perform(bool HotOnly = false);
static void printStats();
};
diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp
index 46a86d6..9b95182 100644
--- a/bolt/lib/Core/BinaryFunction.cpp
+++ b/bolt/lib/Core/BinaryFunction.cpp
@@ -2299,7 +2299,7 @@
uint64_t BBExecCount = BB->getExecutionCount();
if (BBExecCount == BinaryBasicBlock::COUNT_NO_PROFILE)
continue;
- TotalScore += BBExecCount;
+ TotalScore += BBExecCount * BB->getNumNonPseudos();
}
FunctionScore = TotalScore;
return FunctionScore;
diff --git a/bolt/lib/Passes/AllocCombiner.cpp b/bolt/lib/Passes/AllocCombiner.cpp
index e92b74e..0dcf65f 100644
--- a/bolt/lib/Passes/AllocCombiner.cpp
+++ b/bolt/lib/Passes/AllocCombiner.cpp
@@ -101,6 +101,7 @@
BB.eraseInstruction(BB.findInstruction(Prev));
++NumCombined;
+ DynamicCountCombined += BB.getKnownExecutionCount();
FuncsChanged.insert(&BF);
Prev = &Inst;
}
@@ -116,7 +117,8 @@
});
outs() << "BOLT-INFO: Allocation combiner: " << NumCombined
- << " empty spaces coalesced.\n";
+ << " empty spaces coalesced (dyn count: " << DynamicCountCombined
+ << ").\n";
}
} // end namespace bolt
diff --git a/bolt/lib/Passes/FrameAnalysis.cpp b/bolt/lib/Passes/FrameAnalysis.cpp
index afd8344..80fd47e 100644
--- a/bolt/lib/Passes/FrameAnalysis.cpp
+++ b/bolt/lib/Passes/FrameAnalysis.cpp
@@ -527,16 +527,12 @@
}
for (auto &I : BC.getBinaryFunctions()) {
- uint64_t Count = I.second.getExecutionCount();
- if (Count != BinaryFunction::COUNT_NO_PROFILE)
- CountDenominator += Count;
+ CountDenominator += I.second.getFunctionScore();
// "shouldOptimize" for passes that run after finalize
if (!(I.second.isSimple() && I.second.hasCFG() && !I.second.isIgnored()) ||
!opts::shouldFrameOptimize(I.second)) {
++NumFunctionsNotOptimized;
- if (Count != BinaryFunction::COUNT_NO_PROFILE)
- CountFunctionsNotOptimized += Count;
continue;
}
@@ -545,9 +541,7 @@
"FA breakdown", opts::TimeFA);
if (!restoreFrameIndex(I.second)) {
++NumFunctionsFailedRestoreFI;
- uint64_t Count = I.second.getExecutionCount();
- if (Count != BinaryFunction::COUNT_NO_PROFILE)
- CountFunctionsFailedRestoreFI += Count;
+ CountFunctionsFailedRestoreFI += I.second.getFunctionScore();
continue;
}
}
@@ -568,10 +562,7 @@
void FrameAnalysis::printStats() {
outs() << "BOLT-INFO: FRAME ANALYSIS: " << NumFunctionsNotOptimized
- << " function(s) "
- << format("(%.1lf%% dyn cov)",
- (100.0 * CountFunctionsNotOptimized / CountDenominator))
- << " were not optimized.\n"
+ << " function(s) were not optimized.\n"
<< "BOLT-INFO: FRAME ANALYSIS: " << NumFunctionsFailedRestoreFI
<< " function(s) "
<< format("(%.1lf%% dyn cov)",
diff --git a/bolt/lib/Passes/FrameOptimizer.cpp b/bolt/lib/Passes/FrameOptimizer.cpp
index 8422d09..8e07834 100644
--- a/bolt/lib/Passes/FrameOptimizer.cpp
+++ b/bolt/lib/Passes/FrameOptimizer.cpp
@@ -108,6 +108,7 @@
continue;
++NumRedundantLoads;
+ FreqRedundantLoads += BB.getKnownExecutionCount();
Changed = true;
LLVM_DEBUG(dbgs() << "Redundant load instruction: ");
LLVM_DEBUG(Inst.dump());
@@ -120,11 +121,12 @@
LLVM_DEBUG(dbgs() << "FAILED to change operand to a reg\n");
break;
}
- ++NumLoadsChangedToReg;
+ FreqLoadsChangedToReg += BB.getKnownExecutionCount();
MIB->removeAnnotation(Inst, "FrameAccessEntry");
LLVM_DEBUG(dbgs() << "Changed operand to a reg\n");
if (MIB->isRedundantMove(Inst)) {
++NumLoadsDeleted;
+ FreqLoadsDeleted += BB.getKnownExecutionCount();
LLVM_DEBUG(dbgs() << "Created a redundant move\n");
// Delete it!
ToErase.push_front(std::make_pair(&BB, &Inst));
@@ -136,7 +138,7 @@
if (!MIB->replaceMemOperandWithImm(Inst, StringRef(Buf, 8), 0)) {
LLVM_DEBUG(dbgs() << "FAILED\n");
} else {
- ++NumLoadsChangedToImm;
+ FreqLoadsChangedToImm += BB.getKnownExecutionCount();
MIB->removeAnnotation(Inst, "FrameAccessEntry");
LLVM_DEBUG(dbgs() << "Ok\n");
}
@@ -199,6 +201,7 @@
continue;
++NumRedundantStores;
+ FreqRedundantStores += BB.getKnownExecutionCount();
Changed = true;
LLVM_DEBUG(dbgs() << "Unused store instruction: ");
LLVM_DEBUG(Inst.dump());
@@ -286,11 +289,16 @@
outs() << "BOLT-INFO: FOP optimized " << NumRedundantLoads
<< " redundant load(s) and " << NumRedundantStores
<< " unused store(s)\n";
- outs() << "BOLT-INFO: FOP changed " << NumLoadsChangedToReg
- << " load(s) to use a register instead of a stack access, and "
- << NumLoadsChangedToImm << " to use an immediate.\n"
- << "BOLT-INFO: FOP deleted " << NumLoadsDeleted << " load(s) and "
- << NumRedundantStores << " store(s).\n";
+ outs() << "BOLT-INFO: Frequency of redundant loads is " << FreqRedundantLoads
+ << " and frequency of unused stores is " << FreqRedundantStores
+ << "\n";
+ outs() << "BOLT-INFO: Frequency of loads changed to use a register is "
+ << FreqLoadsChangedToReg
+ << " and frequency of loads changed to use an immediate is "
+ << FreqLoadsChangedToImm << "\n";
+ outs() << "BOLT-INFO: FOP deleted " << NumLoadsDeleted
+ << " load(s) (dyn count: " << FreqLoadsDeleted << ") and "
+ << NumRedundantStores << " store(s)\n";
FA->printStats();
ShrinkWrapping::printStats();
}
@@ -323,34 +331,51 @@
BC.MIB->getOrCreateAnnotationIndex("AccessesDeletedPos");
BC.MIB->getOrCreateAnnotationIndex("DeleteMe");
+ std::vector<std::pair<uint64_t, const BinaryFunction *>> Top10Funcs;
+ auto LogFunc = [&](BinaryFunction &BF) {
+ auto Lower = std::lower_bound(
+ Top10Funcs.begin(), Top10Funcs.end(), BF.getKnownExecutionCount(),
+ [](const std::pair<uint64_t, const BinaryFunction *> &Elmt,
+ uint64_t Value) { return Elmt.first > Value; });
+ if (Lower == Top10Funcs.end() && Top10Funcs.size() >= 10)
+ return;
+ Top10Funcs.insert(Lower,
+ std::make_pair<>(BF.getKnownExecutionCount(), &BF));
+ if (Top10Funcs.size() > 10)
+ Top10Funcs.resize(10);
+ };
+ (void)LogFunc;
+
ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
- if (!FA.hasFrameInfo(BF))
- return true;
-
- if (opts::FrameOptimization == FOP_HOT &&
- (BF.getKnownExecutionCount() < BC.getHotThreshold()))
- return true;
-
- if (BF.getKnownExecutionCount() == 0)
+ if (BF.getFunctionScore() == 0)
return true;
return false;
};
+ const bool HotOnly = opts::FrameOptimization == FOP_HOT;
+
ParallelUtilities::WorkFuncWithAllocTy WorkFunction =
[&](BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocatorId) {
DataflowInfoManager Info(BF, &RA, &FA, AllocatorId);
ShrinkWrapping SW(FA, BF, Info, AllocatorId);
- if (SW.perform()) {
+ if (SW.perform(HotOnly)) {
std::lock_guard<std::mutex> Lock(FuncsChangedMutex);
FuncsChanged.insert(&BF);
+ LLVM_DEBUG(LogFunc(BF));
}
};
ParallelUtilities::runOnEachFunctionWithUniqueAllocId(
BC, ParallelUtilities::SchedulingPolicy::SP_INST_QUADRATIC, WorkFunction,
SkipPredicate, "shrink-wrapping");
+
+ if (!Top10Funcs.empty()) {
+ outs() << "BOLT-INFO: top 10 functions changed by shrink wrapping:\n";
+ for (const auto &Elmt : Top10Funcs)
+ outs() << Elmt.first << " : " << Elmt.second->getPrintName() << "\n";
+ }
}
} // namespace bolt
diff --git a/bolt/lib/Passes/ShrinkWrapping.cpp b/bolt/lib/Passes/ShrinkWrapping.cpp
index a0c868c..d58c6b6 100644
--- a/bolt/lib/Passes/ShrinkWrapping.cpp
+++ b/bolt/lib/Passes/ShrinkWrapping.cpp
@@ -712,6 +712,10 @@
std::atomic_uint64_t ShrinkWrapping::SpillsMovedRegularMode{0};
std::atomic_uint64_t ShrinkWrapping::SpillsMovedPushPopMode{0};
+std::atomic_uint64_t ShrinkWrapping::SpillsMovedDynamicCount{0};
+std::atomic_uint64_t ShrinkWrapping::SpillsFailedDynamicCount{0};
+std::atomic_uint64_t ShrinkWrapping::InstrDynamicCount{0};
+std::atomic_uint64_t ShrinkWrapping::StoreDynamicCount{0};
using BBIterTy = BinaryBasicBlock::iterator;
@@ -1273,16 +1277,19 @@
// Keeps info about successfully moved regs: reg index, save position and
// save size
std::vector<std::tuple<unsigned, MCInst *, size_t>> MovedRegs;
+ uint64_t TotalEstimatedWin = 0;
for (unsigned I = 0, E = BC.MRI->getNumRegs(); I != E; ++I) {
MCInst *BestPosSave = nullptr;
- uint64_t TotalEstimatedWin = 0;
- if (!isBestSavePosCold(I, BestPosSave, TotalEstimatedWin))
+ uint64_t EstimatedWin = 0;
+ if (!isBestSavePosCold(I, BestPosSave, EstimatedWin))
continue;
SmallVector<ProgramPoint, 4> RestorePoints =
- doRestorePlacement(BestPosSave, I, TotalEstimatedWin);
- if (RestorePoints.empty())
+ doRestorePlacement(BestPosSave, I, EstimatedWin);
+ if (RestorePoints.empty()) {
+ SpillsFailedDynamicCount += EstimatedWin;
continue;
+ }
const FrameIndexEntry *FIESave = CSA.SaveFIEByReg[I];
const FrameIndexEntry *FIELoad = CSA.LoadFIEByReg[I];
@@ -1295,8 +1302,10 @@
// If we don't know stack state at this point, bail
if ((SPFP.first == SPT.SUPERPOSITION || SPFP.first == SPT.EMPTY) &&
- (SPFP.second == SPT.SUPERPOSITION || SPFP.second == SPT.EMPTY))
+ (SPFP.second == SPT.SUPERPOSITION || SPFP.second == SPT.EMPTY)) {
+ SpillsFailedDynamicCount += EstimatedWin;
continue;
+ }
// Operation mode: if true, will insert push/pops instead of loads/restores
bool UsePushPops = validatePushPopsMode(I, BestPosSave, SaveOffset);
@@ -1319,6 +1328,7 @@
scheduleOldSaveRestoresRemoval(I, UsePushPops);
scheduleSaveRestoreInsertions(I, BestPosSave, RestorePoints, UsePushPops);
MovedRegs.emplace_back(std::make_tuple(I, BestPosSave, SaveSize));
+ TotalEstimatedWin += EstimatedWin;
}
// Revert push-pop mode if it failed for a single CSR
@@ -1348,6 +1358,7 @@
}
}
}
+ SpillsMovedDynamicCount += TotalEstimatedWin;
// Update statistics
if (!UsedPushPopMode) {
@@ -1941,12 +1952,36 @@
}
}
-bool ShrinkWrapping::perform() {
+bool ShrinkWrapping::perform(bool HotOnly) {
HasDeletedOffsetCFIs = BitVector(BC.MRI->getNumRegs(), false);
PushOffsetByReg = std::vector<int64_t>(BC.MRI->getNumRegs(), 0LL);
PopOffsetByReg = std::vector<int64_t>(BC.MRI->getNumRegs(), 0LL);
DomOrder = std::vector<MCPhysReg>(BC.MRI->getNumRegs(), 0);
+ // Update pass statistics
+ uint64_t TotalInstrs = 0ULL;
+ uint64_t TotalStoreInstrs = 0ULL;
+ for (BinaryBasicBlock *BB : BF.layout()) {
+ uint64_t BBExecCount = BB->getExecutionCount();
+ if (!BBExecCount || BBExecCount == BinaryBasicBlock::COUNT_NO_PROFILE)
+ continue;
+ for (const auto &Instr : *BB) {
+ if (BC.MIB->isPseudo(Instr))
+ continue;
+ if (BC.MIB->isStore(Instr))
+ TotalStoreInstrs += BBExecCount;
+ TotalInstrs += BBExecCount;
+ }
+ }
+ InstrDynamicCount += TotalInstrs;
+ StoreDynamicCount += TotalStoreInstrs;
+
+ if (!FA.hasFrameInfo(BF))
+ return false;
+
+ if (HotOnly && (BF.getKnownExecutionCount() < BC.getHotThreshold()))
+ return false;
+
if (BF.checkForAmbiguousJumpTables()) {
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: ambiguous JTs in " << BF.getPrintName()
<< ".\n");
@@ -1993,6 +2028,24 @@
outs() << "BOLT-INFO: Shrink wrapping moved " << SpillsMovedRegularMode
<< " spills inserting load/stores and " << SpillsMovedPushPopMode
<< " spills inserting push/pops\n";
+ if (!InstrDynamicCount || !StoreDynamicCount)
+ return;
+ outs() << "BOLT-INFO: Shrink wrapping reduced " << SpillsMovedDynamicCount
+ << " store executions ("
+ << format("%.1lf%%",
+ (100.0 * SpillsMovedDynamicCount / InstrDynamicCount))
+ << " total instructions executed, "
+ << format("%.1lf%%",
+ (100.0 * SpillsMovedDynamicCount / StoreDynamicCount))
+ << " store instructions)\n";
+ outs() << "BOLT-INFO: Shrink wrapping failed at reducing "
+ << SpillsFailedDynamicCount << " store executions ("
+ << format("%.1lf%%",
+ (100.0 * SpillsFailedDynamicCount / InstrDynamicCount))
+ << " total instructions executed, "
+ << format("%.1lf%%",
+ (100.0 * SpillsFailedDynamicCount / StoreDynamicCount))
+ << " store instructions)\n";
}
// Operators necessary as a result of using MCAnnotation
diff --git a/bolt/test/X86/shrinkwrapping-pop-order.s b/bolt/test/X86/shrinkwrapping-pop-order.s
index 5045ba2..2a5db36 100644
--- a/bolt/test/X86/shrinkwrapping-pop-order.s
+++ b/bolt/test/X86/shrinkwrapping-pop-order.s
@@ -34,6 +34,8 @@
# Check shrink wrapping results:
# CHECK: BOLT-INFO: Shrink wrapping moved 0 spills inserting load/stores and 2 spills inserting push/pops
+# CHECK: BOLT-INFO: Shrink wrapping reduced 6 store executions (28.6% total instructions executed, 100.0% store instructions)
+# CHECK: BOLT-INFO: Shrink wrapping failed at reducing 0 store executions (0.0% total instructions executed, 0.0% store instructions)
# Check that order is correct
# CHECK: Binary Function "_start" after frame-optimizer