| //===--- BinaryPasses.cpp - Binary-level analysis/optimization passes -----===// |
| // |
| // The LLVM Compiler Infrastructure |
| // |
| // This file is distributed under the University of Illinois Open Source |
| // License. See LICENSE.TXT for details. |
| // |
| //===----------------------------------------------------------------------===// |
| // |
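| // This file implements a collection of binary-level analysis and |
| // optimization passes over BOLT's binary function representation. |
| // |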
| //===----------------------------------------------------------------------===// |
| |
| #include "BinaryPasses.h" |
| #include "Passes/ReorderAlgorithm.h" |
| #include "llvm/Support/Options.h" |
| #include <numeric> |
| |
| #define DEBUG_TYPE "bolt" |
| |
| using namespace llvm; |
| |
| namespace { |
| |
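| // Return the command-line option name for a dyno stats category: the |
| // category description with spaces replaced by dashes. The FIRST/LAST |
| // sentinel categories map to the special "none"/"all" values. |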
| const char* dynoStatsOptName(const bolt::DynoStats::Category C) { |
| if (C == bolt::DynoStats::FIRST_DYNO_STAT) |
| return "none"; |
| else if (C == bolt::DynoStats::LAST_DYNO_STAT) |
| return "all"; |
| |
| static std::string OptNames[bolt::DynoStats::LAST_DYNO_STAT+1]; |
| |
| OptNames[C] = bolt::DynoStats::Description(C); |
| |
| std::replace(OptNames[C].begin(), OptNames[C].end(), ' ', '-'); |
| |
| return OptNames[C].c_str(); |
| } |
| |
| const char* dynoStatsOptDesc(const bolt::DynoStats::Category C) { |
| if (C == bolt::DynoStats::FIRST_DYNO_STAT) |
| return "unsorted"; |
| else if (C == bolt::DynoStats::LAST_DYNO_STAT) |
| return "sorted by all stats"; |
| |
| return bolt::DynoStats::Description(C); |
| } |
| |
| } |
| |
| namespace opts { |
| |
| extern cl::OptionCategory BoltCategory; |
| extern cl::OptionCategory BoltOptCategory; |
| |
| extern cl::opt<unsigned> Verbosity; |
| extern cl::opt<bolt::BinaryFunction::SplittingType> SplitFunctions; |
| extern bool shouldProcess(const bolt::BinaryFunction &Function); |
| |
| enum DynoStatsSortOrder : char { |
| Ascending, |
| Descending |
| }; |
| |
| static cl::opt<bool> |
| AggressiveSplitting("split-all-cold", |
| cl::desc("outline as many cold basic blocks as possible"), |
| cl::ZeroOrMore, |
| cl::cat(BoltOptCategory)); |
| |
| static cl::opt<DynoStatsSortOrder> |
| DynoStatsSortOrderOpt("print-sorted-by-order", |
| cl::desc("use ascending or descending order when printing functions " |
| "ordered by dyno stats"), |
| cl::ZeroOrMore, |
| cl::init(DynoStatsSortOrder::Descending), |
| cl::cat(BoltOptCategory)); |
| |
| static cl::opt<bool> |
| ICFUseDFS("icf-dfs", |
| cl::desc("use DFS ordering when using -icf option"), |
| cl::ReallyHidden, |
| cl::ZeroOrMore, |
| cl::cat(BoltOptCategory)); |
| |
| static cl::opt<bool> |
| MinBranchClusters("min-branch-clusters", |
| cl::desc("use a modified clustering algorithm geared towards minimizing " |
| "branches"), |
| cl::ZeroOrMore, |
| cl::Hidden, |
| cl::cat(BoltOptCategory)); |
| |
| enum PeepholeOpts : char { |
| PEEP_NONE = 0x0, |
| PEEP_SHORTEN = 0x1, |
| PEEP_DOUBLE_JUMPS = 0x2, |
| PEEP_TAILCALL_TRAPS = 0x4, |
| PEEP_USELESS_BRANCHES = 0x8, |
| PEEP_ALL = 0xf |
| }; |
| |
| static cl::list<PeepholeOpts> |
| Peepholes("peepholes", |
| cl::CommaSeparated, |
| cl::desc("enable peephole optimizations"), |
| cl::value_desc("opt1,opt2,opt3,..."), |
| cl::values( |
| clEnumValN(PEEP_NONE, "none", "disable peepholes"), |
| clEnumValN(PEEP_SHORTEN, "shorten", "perform instruction shortening"), |
| clEnumValN(PEEP_DOUBLE_JUMPS, "double-jumps", |
| "remove double jumps when able"), |
| clEnumValN(PEEP_TAILCALL_TRAPS, "tailcall-traps", "insert tail call traps"), |
| clEnumValN(PEEP_USELESS_BRANCHES, "useless-branches", |
| "remove useless conditional branches"), |
| clEnumValN(PEEP_ALL, "all", "enable all peephole optimizations")), |
| cl::ZeroOrMore, |
| cl::cat(BoltOptCategory)); |
| |
| static cl::opt<unsigned> |
| PrintFuncStat("print-function-statistics", |
| cl::desc("print statistics about basic block ordering"), |
| cl::init(0), |
| cl::ZeroOrMore, |
| cl::cat(BoltOptCategory)); |
| |
| static cl::list<bolt::DynoStats::Category> |
| PrintSortedBy("print-sorted-by", |
| cl::CommaSeparated, |
| cl::desc("print functions sorted by order of dyno stats"), |
| cl::value_desc("key1,key2,key3,..."), |
| cl::values( |
| #define D(name, ...) \ |
| clEnumValN(bolt::DynoStats::name, \ |
| dynoStatsOptName(bolt::DynoStats::name), \ |
| dynoStatsOptDesc(bolt::DynoStats::name)), |
| DYNO_STATS |
| #undef D |
| clEnumValN(0xffff, ".", ".") |
| ), |
| cl::ZeroOrMore, |
| cl::cat(BoltOptCategory)); |
| |
| static cl::opt<bolt::ReorderBasicBlocks::LayoutType> |
| ReorderBlocks("reorder-blocks", |
| cl::desc("change layout of basic blocks in a function"), |
| cl::init(bolt::ReorderBasicBlocks::LT_NONE), |
| cl::values( |
| clEnumValN(bolt::ReorderBasicBlocks::LT_NONE, |
| "none", |
| "do not reorder basic blocks"), |
| clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE, |
| "reverse", |
| "layout blocks in reverse order"), |
| clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE, |
| "normal", |
| "perform optimal layout based on profile"), |
| clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH, |
| "branch-predictor", |
| "perform optimal layout prioritizing branch " |
| "predictions"), |
| clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE, |
| "cache", |
| "perform optimal layout prioritizing I-cache " |
| "behavior"), |
| clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS, |
| "cache+", |
| "perform layout optimizing I-cache behavior"), |
| clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE, |
| "cluster-shuffle", |
| "perform random layout of clusters")), |
| cl::ZeroOrMore, |
| cl::cat(BoltOptCategory)); |
| |
| static cl::opt<bool> |
| ReportStaleFuncs("report-stale", |
| cl::desc("print the list of functions with stale profile"), |
| cl::init(false), |
| cl::ZeroOrMore, |
| cl::Hidden, |
| cl::cat(BoltOptCategory)); |
| |
| enum SctcModes : char { |
| SctcAlways, |
| SctcPreserveDirection, |
| SctcHeuristic |
| }; |
| |
| static cl::opt<SctcModes> |
| SctcMode("sctc-mode", |
| cl::desc("mode for simplify conditional tail calls"), |
| cl::init(SctcAlways), |
| cl::values(clEnumValN(SctcAlways, "always", "always perform sctc"), |
| clEnumValN(SctcPreserveDirection, |
| "preserve", |
| "only perform sctc when branch direction is " |
| "preserved"), |
| clEnumValN(SctcHeuristic, |
| "heuristic", |
| "use branch prediction data to control sctc")), |
| cl::ZeroOrMore, |
| cl::cat(BoltOptCategory)); |
| |
| static cl::opt<bool> |
| SplitEH("split-eh", |
| cl::desc("split C++ exception handling code (experimental)"), |
| cl::ZeroOrMore, |
| cl::Hidden, |
| cl::cat(BoltOptCategory)); |
| |
| static cl::opt<unsigned> |
| TSPThreshold("tsp-threshold", |
| cl::desc("maximum number of hot basic blocks in a function for which to use " |
| "a precise TSP solution while re-ordering basic blocks"), |
| cl::init(10), |
| cl::ZeroOrMore, |
| cl::Hidden, |
| cl::cat(BoltOptCategory)); |
| |
| static cl::opt<unsigned> |
| TopCalledLimit("top-called-limit", |
| cl::desc("maximum number of functions to print in top called " |
| "functions section"), |
| cl::init(100), |
| cl::ZeroOrMore, |
| cl::Hidden, |
| cl::cat(BoltCategory)); |
| |
| } // namespace opts |
| |
| namespace llvm { |
| namespace bolt { |
| |
| bool BinaryFunctionPass::shouldOptimize(const BinaryFunction &BF) const { |
| return BF.isSimple() && |
| BF.getState() == BinaryFunction::State::CFG && |
| opts::shouldProcess(BF) && |
| (BF.getSize() > 0); |
| } |
| |
| bool BinaryFunctionPass::shouldPrint(const BinaryFunction &BF) const { |
| return BF.isSimple() && opts::shouldProcess(BF); |
| } |
| |
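| // Detect a bodyless function, i.e. a function whose body consists of a |
| // single tail call, and record its call target so that calls to this |
| // function can later be redirected to that target. |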
| void OptimizeBodylessFunctions::analyze( |
| BinaryFunction &BF, |
| BinaryContext &BC, |
| std::map<uint64_t, BinaryFunction> &BFs) { |
| if (BF.size() != 1 || BF.front().getNumNonPseudos() != 1) |
| return; |
| |
| const auto *FirstInstr = BF.front().getFirstNonPseudoInstr(); |
| if (!FirstInstr) |
| return; |
| if (!BC.MIA->isTailCall(*FirstInstr)) |
| return; |
| const auto *TargetSymbol = BC.MIA->getTargetSymbol(*FirstInstr); |
| if (!TargetSymbol) |
| return; |
| const auto *Function = BC.getFunctionForSymbol(TargetSymbol); |
| if (!Function) |
| return; |
| |
| EquivalentCallTarget[BF.getSymbol()] = Function; |
| } |
| |
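| // Redirect calls to bodyless functions to their final targets, following |
| // chains of bodyless functions discovered by analyze(). |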
| void OptimizeBodylessFunctions::optimizeCalls(BinaryFunction &BF, |
| BinaryContext &BC) { |
| for (auto *BB : BF.layout()) { |
| for (auto &Inst : *BB) { |
| if (!BC.MIA->isCall(Inst)) |
| continue; |
| const auto *OriginalTarget = BC.MIA->getTargetSymbol(Inst); |
| if (!OriginalTarget) |
| continue; |
| const auto *Target = OriginalTarget; |
| // Iteratively update target since we could have f1() calling f2() |
| // calling f3() calling f4() and we want to output f1() directly |
| // calling f4(). |
| unsigned CallSites = 0; |
| while (EquivalentCallTarget.count(Target)) { |
| Target = EquivalentCallTarget.find(Target)->second->getSymbol(); |
| ++CallSites; |
| } |
| if (Target == OriginalTarget) |
| continue; |
| DEBUG(dbgs() << "BOLT-DEBUG: Optimizing " << BB->getName() |
| << " (executed " << BB->getKnownExecutionCount() |
| << " times) in " << BF |
| << ": replacing call to " << OriginalTarget->getName() |
| << " by call to " << Target->getName() |
| << " while folding " << CallSites << " call sites\n"); |
| BC.MIA->replaceBranchTarget(Inst, Target, BC.Ctx.get()); |
| |
| NumOptimizedCallSites += CallSites; |
| if (BB->hasProfile()) { |
| NumEliminatedCalls += CallSites * BB->getExecutionCount(); |
| } |
| } |
| } |
| } |
| |
| void OptimizeBodylessFunctions::runOnFunctions( |
| BinaryContext &BC, |
| std::map<uint64_t, BinaryFunction> &BFs, |
| std::set<uint64_t> &) { |
| for (auto &It : BFs) { |
| auto &Function = It.second; |
| if (shouldOptimize(Function)) { |
| analyze(Function, BC, BFs); |
| } |
| } |
| for (auto &It : BFs) { |
| auto &Function = It.second; |
| if (shouldOptimize(Function)) { |
| optimizeCalls(Function, BC); |
| } |
| } |
| |
| if (NumEliminatedCalls || NumOptimizedCallSites) { |
| outs() << "BOLT-INFO: optimized " << NumOptimizedCallSites |
| << " redirected call sites to eliminate " << NumEliminatedCalls |
| << " dynamic calls.\n"; |
| } |
| } |
| |
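| // Detect and remove unreachable basic blocks from the function, updating |
| // the deleted block and byte counters. |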
| void EliminateUnreachableBlocks::runOnFunction(BinaryFunction& Function) { |
| if (Function.layout_size() > 0) { |
| unsigned Count; |
| uint64_t Bytes; |
| Function.markUnreachable(); |
| DEBUG({ |
| for (auto *BB : Function.layout()) { |
| if (!BB->isValid()) { |
| dbgs() << "BOLT-INFO: UCE found unreachable block " << BB->getName() |
| << " in function " << Function << "\n"; |
| BB->dump(); |
| } |
| } |
| }); |
| std::tie(Count, Bytes) = Function.eraseInvalidBBs(); |
| DeletedBlocks += Count; |
| DeletedBytes += Bytes; |
| if (Count) { |
| Modified.insert(&Function); |
| if (opts::Verbosity > 0) { |
| outs() << "BOLT-INFO: Removed " << Count |
| << " dead basic block(s) accounting for " << Bytes |
| << " bytes in function " << Function << '\n'; |
| } |
| } |
| } |
| } |
| |
| void EliminateUnreachableBlocks::runOnFunctions( |
| BinaryContext&, |
| std::map<uint64_t, BinaryFunction> &BFs, |
| std::set<uint64_t> & |
| ) { |
| for (auto &It : BFs) { |
| auto &Function = It.second; |
| if (shouldOptimize(Function)) { |
| runOnFunction(Function); |
| } |
| } |
| outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and " |
| << DeletedBytes << " bytes of code.\n"; |
| } |
| |
| bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const { |
| return (BinaryFunctionPass::shouldPrint(BF) && |
| opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE); |
| } |
| |
| void ReorderBasicBlocks::runOnFunctions( |
| BinaryContext &BC, |
| std::map<uint64_t, BinaryFunction> &BFs, |
| std::set<uint64_t> &LargeFunctions) { |
| if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE) |
| return; |
| |
| uint64_t ModifiedFuncCount = 0; |
| for (auto &It : BFs) { |
| auto &Function = It.second; |
| |
| if (!shouldOptimize(Function)) |
| continue; |
| |
| const bool ShouldSplit = |
| (opts::SplitFunctions == BinaryFunction::ST_ALL) || |
| (opts::SplitFunctions == BinaryFunction::ST_EH && |
| Function.hasEHRanges()) || |
| (LargeFunctions.find(It.first) != LargeFunctions.end()); |
| modifyFunctionLayout(Function, opts::ReorderBlocks, opts::MinBranchClusters, |
| ShouldSplit); |
| |
| if (opts::PrintFuncStat > 0 && Function.hasLayoutChanged()) { |
| ++ModifiedFuncCount; |
| } |
| } |
| |
| if (opts::PrintFuncStat > 0) { |
| raw_ostream &OS = outs(); |
| // Collect functions into a map keyed by function score so they can be |
| // reported in sorted order. |
| std::map<uint64_t, BinaryFunction &> ScoreMap; |
| for (auto It = BFs.begin(); It != BFs.end(); ++It) { |
| ScoreMap.insert(std::pair<uint64_t, BinaryFunction &>( |
| It->second.getFunctionScore(), It->second)); |
| } |
| |
| OS << "\nBOLT-INFO: Printing Function Statistics:\n\n"; |
| OS << " There are " << BFs.size() << " functions in total.\n"; |
| OS << " Number of functions modified: " << ModifiedFuncCount |
| << "\n"; |
| OS << " Detailed information for the top " << opts::PrintFuncStat |
| << " functions, ranked by function score:\n\n"; |
| uint64_t I = 0; |
| for (std::map<uint64_t, BinaryFunction &>::reverse_iterator |
| Rit = ScoreMap.rbegin(); |
| Rit != ScoreMap.rend() && I < opts::PrintFuncStat; ++Rit, ++I) { |
| auto &Function = Rit->second; |
| |
| OS << "  Function #" << (I + 1) << ":\n"; |
| OS << "    Function score: " << Function.getFunctionScore() << "\n"; |
| OS << "    Number of basic blocks: " << Function.size() << "\n"; |
| OS << "    Number of instructions: " << Function.getInstructionCount() |
| << "\n"; |
| OS << "    Edit distance: " << Function.getEditDistance() << "\n\n"; |
| } |
| } |
| } |
| |
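| // Apply the requested layout algorithm to the function's basic blocks and, |
| // if requested, mark cold blocks for splitting. |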
| void ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF, |
| LayoutType Type, bool MinBranchClusters, bool Split) const { |
| if (BF.size() == 0 || Type == LT_NONE) |
| return; |
| |
| BinaryFunction::BasicBlockOrderType NewLayout; |
| std::unique_ptr<ReorderAlgorithm> Algo; |
| |
| // Cannot do optimal layout without profile. |
| if (Type != LT_REVERSE && !BF.hasValidProfile()) |
| return; |
| |
| if (Type == LT_REVERSE) { |
| Algo.reset(new ReverseReorderAlgorithm()); |
| } else if (BF.size() <= opts::TSPThreshold && Type != LT_OPTIMIZE_SHUFFLE) { |
| // Compute an optimal solution if the problem is small enough. |
| DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n"); |
| Algo.reset(new OptimalReorderAlgorithm()); |
| } else { |
| DEBUG(dbgs() << "running block layout heuristics on " << BF << "\n"); |
| |
| std::unique_ptr<ClusterAlgorithm> CAlgo; |
| if (MinBranchClusters) |
| CAlgo.reset(new MinBranchGreedyClusterAlgorithm()); |
| else |
| CAlgo.reset(new PHGreedyClusterAlgorithm()); |
| |
| switch(Type) { |
| case LT_OPTIMIZE: |
| Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo))); |
| break; |
| |
| case LT_OPTIMIZE_BRANCH: |
| Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo))); |
| break; |
| |
| case LT_OPTIMIZE_CACHE: |
| Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo))); |
| break; |
| |
| case LT_OPTIMIZE_CACHE_PLUS: |
| Algo.reset(new CachePlusReorderAlgorithm(std::move(CAlgo))); |
| break; |
| |
| case LT_OPTIMIZE_SHUFFLE: |
| Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo))); |
| break; |
| |
| default: |
| llvm_unreachable("unexpected layout type"); |
| } |
| } |
| |
| Algo->reorderBasicBlocks(BF, NewLayout); |
| |
| BF.updateBasicBlockLayout(NewLayout, /*SavePrevLayout=*/opts::PrintFuncStat); |
| |
| if (Split) |
| splitFunction(BF); |
| } |
| |
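| // Mark cold basic blocks for outlining. Only blocks with a zero execution |
| // count are candidates; the first block of the layout is never outlined, |
| // and, unless -split-eh is given, neither are landing pads or blocks that |
| // may throw. |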
| void ReorderBasicBlocks::splitFunction(BinaryFunction &BF) const { |
| if (!BF.size()) |
| return; |
| |
| bool AllCold = true; |
| for (auto *BB : BF.layout()) { |
| auto ExecCount = BB->getExecutionCount(); |
| if (ExecCount == BinaryBasicBlock::COUNT_NO_PROFILE) |
| return; |
| if (ExecCount != 0) |
| AllCold = false; |
| } |
| |
| if (AllCold) |
| return; |
| |
| // Never outline the first basic block. |
| BF.layout_front()->setCanOutline(false); |
| for (auto *BB : BF.layout()) { |
| if (!BB->canOutline()) |
| continue; |
| if (BB->getExecutionCount() != 0) { |
| BB->setCanOutline(false); |
| continue; |
| } |
| if (BF.hasEHRanges() && !opts::SplitEH) { |
| // We cannot move landing pads (or rather entry points for landing |
| // pads). |
| if (BB->isLandingPad()) { |
| BB->setCanOutline(false); |
| continue; |
| } |
| // We cannot move a block that can throw since the exception-handling |
| // runtime cannot deal with split functions. However, if we can |
| // guarantee that the block never throws, it is safe to move it in |
| // order to decrease the size of the function. |
| for (auto &Instr : *BB) { |
| if (BF.getBinaryContext().MIA->isInvoke(Instr)) { |
| BB->setCanOutline(false); |
| break; |
| } |
| } |
| } |
| } |
| |
| if (opts::AggressiveSplitting) { |
| // All blocks with 0 count that we can move go to the end of the function. |
| // Even if they were natural to cluster formation and were seen in-between |
| // hot basic blocks. |
| std::stable_sort(BF.layout_begin(), BF.layout_end(), |
| [&] (BinaryBasicBlock *A, BinaryBasicBlock *B) { |
| return A->canOutline() < B->canOutline(); |
| }); |
| } else if (BF.hasEHRanges() && !opts::SplitEH) { |
| // Typically, functions with exception handling have landing pads at the |
| // end. We cannot move the beginning of a landing pad, but we can move |
| // 0-count blocks comprising landing pads to the end and thus facilitate |
| // splitting. |
| auto FirstLP = BF.layout_begin(); |
| while ((*FirstLP)->isLandingPad()) |
| ++FirstLP; |
| |
| std::stable_sort(FirstLP, BF.layout_end(), |
| [&] (BinaryBasicBlock *A, BinaryBasicBlock *B) { |
| return A->canOutline() < B->canOutline(); |
| }); |
| } |
| |
| // Separate hot from cold starting from the bottom. |
| for (auto I = BF.layout_rbegin(), E = BF.layout_rend(); |
| I != E; ++I) { |
| BinaryBasicBlock *BB = *I; |
| if (!BB->canOutline()) |
| break; |
| BB->setIsCold(true); |
| } |
| } |
| |
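| // Fix branch instructions in all functions to be consistent with the final |
| // basic block layout. |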
| void FixupBranches::runOnFunctions( |
| BinaryContext &BC, |
| std::map<uint64_t, BinaryFunction> &BFs, |
| std::set<uint64_t> &) { |
| for (auto &It : BFs) { |
| auto &Function = It.second; |
| if (BC.HasRelocations || shouldOptimize(Function)) { |
| if (BC.TheTriple->getArch() == llvm::Triple::aarch64 && |
| !Function.isSimple()) |
| continue; |
| Function.fixBranches(); |
| } |
| } |
| } |
| |
| void FinalizeFunctions::runOnFunctions( |
| BinaryContext &BC, |
| std::map<uint64_t, BinaryFunction> &BFs, |
| std::set<uint64_t> & |
| ) { |
| for (auto &It : BFs) { |
| auto &Function = It.second; |
| const auto ShouldOptimize = shouldOptimize(Function); |
| |
| // Always fix functions in relocation mode. |
| if (!BC.HasRelocations && !ShouldOptimize) |
| continue; |
| |
| // Fix the CFI state. |
| if (ShouldOptimize && !Function.fixCFIState()) { |
| if (BC.HasRelocations) { |
| errs() << "BOLT-ERROR: unable to fix CFI state for function " |
| << Function << ". Exiting.\n"; |
| exit(1); |
| } |
| Function.setSimple(false); |
| continue; |
| } |
| |
| Function.setFinalized(); |
| |
| // Update exception handling information. |
| Function.updateEHRanges(); |
| } |
| } |
| |
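| // Remove all annotations attached to instructions. |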
| void StripAnnotations::runOnFunctions( |
| BinaryContext &BC, |
| std::map<uint64_t, BinaryFunction> &BFs, |
| std::set<uint64_t> & |
| ) { |
| for (auto &It : BFs) { |
| auto &Function = It.second; |
| |
| for (auto &BB : Function) { |
| for (auto &Inst : BB) { |
| BC.MIA->removeAllAnnotations(Inst); |
| } |
| } |
| } |
| } |
| |
| namespace { |
| |
| // This peephole fixes jump instructions that jump to another basic |
| // block with a single jump instruction, e.g. |
| // |
| // B0: ... |
| // jmp B1 (or jcc B1) |
| // |
| // B1: jmp B2 |
| // |
| // -> |
| // |
| // B0: ... |
| // jmp B2 (or jcc B2) |
| // |
| uint64_t fixDoubleJumps(BinaryContext &BC, |
| BinaryFunction &Function, |
| bool MarkInvalid) { |
| uint64_t NumDoubleJumps = 0; |
| |
| for (auto &BB : Function) { |
| auto checkAndPatch = [&](BinaryBasicBlock *Pred, |
| BinaryBasicBlock *Succ, |
| const MCSymbol *SuccSym) { |
| // Ignore infinite loop jumps or fallthrough tail jumps. |
| if (Pred == Succ || Succ == &BB) |
| return false; |
| |
| if (Succ) { |
| const MCSymbol *TBB = nullptr; |
| const MCSymbol *FBB = nullptr; |
| MCInst *CondBranch = nullptr; |
| MCInst *UncondBranch = nullptr; |
| auto Res = Pred->analyzeBranch(TBB, FBB, CondBranch, UncondBranch); |
| if (!Res) { |
| DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n"; |
| Pred->dump()); |
| return false; |
| } |
| Pred->replaceSuccessor(&BB, Succ); |
| |
| // We must patch up any existing branch instructions to match up |
| // with the new successor. |
| auto *Ctx = BC.Ctx.get(); |
| assert((CondBranch || Pred->succ_size() == 1) && |
| "Predecessor block has inconsistent number of successors"); |
| if (CondBranch && |
| BC.MIA->getTargetSymbol(*CondBranch) == BB.getLabel()) { |
| BC.MIA->replaceBranchTarget(*CondBranch, Succ->getLabel(), Ctx); |
| } else if (UncondBranch && |
| BC.MIA->getTargetSymbol(*UncondBranch) == BB.getLabel()) { |
| BC.MIA->replaceBranchTarget(*UncondBranch, Succ->getLabel(), Ctx); |
| } else if (!UncondBranch) { |
| assert(Function.getBasicBlockAfter(Pred, false) != Succ && |
| "Don't add an explicit jump to a fallthrough block."); |
| Pred->addBranchInstruction(Succ); |
| } |
| } else { |
| // Succ will be null in the tail call case. In this case we |
| // need to explicitly add a tail call instruction. |
| auto *Branch = Pred->getLastNonPseudoInstr(); |
| if (Branch && BC.MIA->isUnconditionalBranch(*Branch)) { |
| assert(BC.MIA->getTargetSymbol(*Branch) == BB.getLabel()); |
| Pred->removeSuccessor(&BB); |
| Pred->eraseInstruction(Branch); |
| Pred->addTailCallInstruction(SuccSym); |
| } else { |
| return false; |
| } |
| } |
| |
| ++NumDoubleJumps; |
| DEBUG(dbgs() << "Removed double jump in " << Function << " from " |
| << Pred->getName() << " -> " << BB.getName() << " to " |
| << Pred->getName() << " -> " << SuccSym->getName() |
| << (!Succ ? " (tail)\n" : "\n")); |
| |
| return true; |
| }; |
| |
| if (BB.getNumNonPseudos() != 1 || BB.isLandingPad()) |
| continue; |
| |
| auto *Inst = BB.getFirstNonPseudoInstr(); |
| const bool IsTailCall = BC.MIA->isTailCall(*Inst); |
| |
| if (!BC.MIA->isUnconditionalBranch(*Inst) && !IsTailCall) |
| continue; |
| |
| // If we operate after SCTC make sure it's not a conditional tail call. |
| if (IsTailCall && BC.MIA->isConditionalBranch(*Inst)) |
| continue; |
| |
| const auto *SuccSym = BC.MIA->getTargetSymbol(*Inst); |
| auto *Succ = BB.getSuccessor(); |
| |
| if (((!Succ || &BB == Succ) && !IsTailCall) || (IsTailCall && !SuccSym)) |
| continue; |
| |
| std::vector<BinaryBasicBlock *> Preds{BB.pred_begin(), BB.pred_end()}; |
| |
| for (auto *Pred : Preds) { |
| if (Pred->isLandingPad()) |
| continue; |
| |
| if (Pred->getSuccessor() == &BB || |
| (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) || |
| Pred->getConditionalSuccessor(false) == &BB) { |
| if (checkAndPatch(Pred, Succ, SuccSym) && MarkInvalid) { |
| BB.markValid(BB.pred_size() != 0 || |
| BB.isLandingPad() || |
| BB.isEntryPoint()); |
| } |
| assert(Function.validateCFG()); |
| } |
| } |
| } |
| |
| return NumDoubleJumps; |
| } |
| |
| } |
| |
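| // Decide whether the conditional branch in PredBB targeting BB should be |
| // rewritten into a conditional tail call, based on the selected -sctc-mode |
| // and, when available, branch profile data. |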
| bool SimplifyConditionalTailCalls::shouldRewriteBranch( |
| const BinaryBasicBlock *PredBB, |
| const MCInst &CondBranch, |
| const BinaryBasicBlock *BB, |
| const bool DirectionFlag |
| ) { |
| if (BeenOptimized.count(PredBB)) |
| return false; |
| |
| const bool IsForward = BinaryFunction::isForwardBranch(PredBB, BB); |
| |
| if (IsForward) |
| ++NumOrigForwardBranches; |
| else |
| ++NumOrigBackwardBranches; |
| |
| if (opts::SctcMode == opts::SctcAlways) |
| return true; |
| |
| if (opts::SctcMode == opts::SctcPreserveDirection) |
| return IsForward == DirectionFlag; |
| |
| const auto Frequency = PredBB->getBranchStats(BB); |
| |
| // It's ok to rewrite the conditional branch if the new target will be |
| // a backward branch. |
| |
| // If no data available for these branches, then it should be ok to |
| // do the optimization since it will reduce code size. |
| if (Frequency.getError()) |
| return true; |
| |
| // TODO: should this use misprediction frequency instead? |
| const bool Result = |
| (IsForward && Frequency.get().first >= 0.5) || |
| (!IsForward && Frequency.get().first <= 0.5); |
| |
| return Result == DirectionFlag; |
| } |
| |
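| // Fix conditional branches that lead to a basic block containing only a |
| // direct tail call by turning them into conditional tail calls, e.g. |
| // |
| // B0: ... |
| // jcc B1 |
| // ... |
| // |
| // B1: jmp foo # tail call |
| // |
| // -> |
| // |
| // B0: ... |
| // jcc foo # conditional tail call |
| // ... |
| // |
| // If the condition has to be reversed to target the callee, an explicit |
| // unconditional branch to the old taken successor may have to be added |
| // afterwards (see NeedsUncondBranch below). |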
| uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, |
| BinaryFunction &BF) { |
| // Need updated indices to correctly detect branch directions. |
| BF.updateLayoutIndices(); |
| BF.markUnreachable(); |
| |
| auto &MIA = BC.MIA; |
| uint64_t NumLocalCTCCandidates = 0; |
| uint64_t NumLocalCTCs = 0; |
| uint64_t LocalCTCTakenCount = 0; |
| uint64_t LocalCTCExecCount = 0; |
| std::vector<std::pair<BinaryBasicBlock *, |
| const BinaryBasicBlock *>> NeedsUncondBranch; |
| |
| // Will block be deleted by UCE? |
| auto isValid = [](const BinaryBasicBlock *BB) { |
| return (BB->pred_size() != 0 || |
| BB->isLandingPad() || |
| BB->isEntryPoint()); |
| }; |
| |
| for (auto *BB : BF.layout()) { |
| // Locate BB with a single direct tail-call instruction. |
| if (BB->getNumNonPseudos() != 1) |
| continue; |
| |
| auto *Instr = BB->getFirstNonPseudoInstr(); |
| if (!MIA->isTailCall(*Instr) || BC.MIA->isConditionalBranch(*Instr)) |
| continue; |
| |
| auto *CalleeSymbol = MIA->getTargetSymbol(*Instr); |
| if (!CalleeSymbol) |
| continue; |
| |
| // Detect direction of the possible conditional tail call. |
| const bool IsForwardCTC = BF.isForwardCall(CalleeSymbol); |
| |
| // Iterate through all predecessors. |
| for (auto *PredBB : BB->predecessors()) { |
| auto *CondSucc = PredBB->getConditionalSuccessor(true); |
| if (!CondSucc) |
| continue; |
| |
| ++NumLocalCTCCandidates; |
| |
| const MCSymbol *TBB = nullptr; |
| const MCSymbol *FBB = nullptr; |
| MCInst *CondBranch = nullptr; |
| MCInst *UncondBranch = nullptr; |
| auto Result = PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch); |
| |
| // analyzeBranch() can fail due to unusual branch instructions, e.g. jrcxz |
| if (!Result) { |
| DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n"; |
| PredBB->dump()); |
| continue; |
| } |
| |
| assert(CondBranch && "conditional branch expected"); |
| |
| // It's possible that PredBB is also a successor to BB that may have |
| // been processed by a previous iteration of the SCTC loop, in which |
| // case it may have been marked invalid. We should skip rewriting in |
| // this case. |
| if (!PredBB->isValid()) { |
| assert(PredBB->isSuccessor(BB) && |
| "PredBB should be valid if it is not a successor to BB"); |
| continue; |
| } |
| |
| // We don't want to reverse direction of the branch in new order |
| // without further profile analysis. |
| const bool DirectionFlag = CondSucc == BB ? IsForwardCTC : !IsForwardCTC; |
| if (!shouldRewriteBranch(PredBB, *CondBranch, BB, DirectionFlag)) |
| continue; |
| |
| // Record this block so that we don't try to optimize it twice. |
| BeenOptimized.insert(PredBB); |
| |
| bool BranchForStats; |
| if (CondSucc != BB) { |
| // Patch the new target address into the conditional branch. |
| MIA->reverseBranchCondition(*CondBranch, CalleeSymbol, BC.Ctx.get()); |
| // Since we reversed the condition on the branch we need to change |
| // the target of the existing unconditional branch, or add an |
| // unconditional branch to the old target. This has to be done |
| // manually since fixBranches() is not called after SCTC. |
| NeedsUncondBranch.emplace_back(std::make_pair(PredBB, CondSucc)); |
| BranchForStats = false; |
| } else { |
| // Change destination of the conditional branch. |
| MIA->replaceBranchTarget(*CondBranch, CalleeSymbol, BC.Ctx.get()); |
| BranchForStats = true; |
| } |
| const auto Count = PredBB->getBranchInfo(BranchForStats).Count; |
| const uint64_t CTCTakenFreq = |
| Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : Count; |
| |
| // Annotate it, so "isCall" returns true for this jcc. |
| MIA->setConditionalTailCall(*CondBranch); |
| // Add info about the conditional tail call frequency, otherwise this |
| // info will be lost when we delete the associated BranchInfo entry. |
| BC.MIA->removeAnnotation(*CondBranch, "CTCTakenCount"); |
| BC.MIA->addAnnotation(BC.Ctx.get(), *CondBranch, "CTCTakenCount", |
| CTCTakenFreq); |
| |
| // Remove the unused successor which may be eliminated later |
| // if there are no other users. |
| PredBB->removeSuccessor(BB); |
| // Update BB execution count |
| if (CTCTakenFreq && CTCTakenFreq <= BB->getKnownExecutionCount()) { |
| BB->setExecutionCount(BB->getExecutionCount() - CTCTakenFreq); |
| } else if (CTCTakenFreq > BB->getKnownExecutionCount()) { |
| BB->setExecutionCount(0); |
| } |
| |
| ++NumLocalCTCs; |
| LocalCTCTakenCount += CTCTakenFreq; |
| LocalCTCExecCount += PredBB->getKnownExecutionCount(); |
| } |
| |
| // Remove the block from CFG if all predecessors were removed. |
| BB->markValid(isValid(BB)); |
| } |
| |
| // Add unconditional branches at the end of BBs to new successors |
| // as long as the successor is not a fallthrough. |
| for (auto &Entry : NeedsUncondBranch) { |
| auto *PredBB = Entry.first; |
| auto *CondSucc = Entry.second; |
| |
| const MCSymbol *TBB = nullptr; |
| const MCSymbol *FBB = nullptr; |
| MCInst *CondBranch = nullptr; |
| MCInst *UncondBranch = nullptr; |
| PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch); |
| |
| // Find the next valid block. Invalid blocks will be deleted |
| // so they shouldn't be considered fallthrough targets. |
| const auto *NextBlock = BF.getBasicBlockAfter(PredBB, false); |
| while (NextBlock && !isValid(NextBlock)) { |
| NextBlock = BF.getBasicBlockAfter(NextBlock, false); |
| } |
| |
| // Get the unconditional successor to this block. |
| const auto *PredSucc = PredBB->getSuccessor(); |
| assert(PredSucc && "The other branch should be a tail call"); |
| |
| const bool HasFallthrough = (NextBlock && PredSucc == NextBlock); |
| |
| if (UncondBranch) { |
| if (HasFallthrough) |
| PredBB->eraseInstruction(UncondBranch); |
| else |
| MIA->replaceBranchTarget(*UncondBranch, |
| CondSucc->getLabel(), |
| BC.Ctx.get()); |
| } else if (!HasFallthrough) { |
| MCInst Branch; |
| MIA->createUncondBranch(Branch, CondSucc->getLabel(), BC.Ctx.get()); |
| PredBB->addInstruction(Branch); |
| } |
| } |
| |
| if (NumLocalCTCs > 0) { |
| NumDoubleJumps += fixDoubleJumps(BC, BF, true); |
| // Clean-up unreachable tail-call blocks. |
| const auto Stats = BF.eraseInvalidBBs(); |
| DeletedBlocks += Stats.first; |
| DeletedBytes += Stats.second; |
| } |
| |
| DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs |
| << " conditional tail calls from a total of " |
| << NumLocalCTCCandidates << " candidates in function " << BF |
| << ". CTCs execution count for this function is " |
| << LocalCTCExecCount << " and CTC taken count is " |
| << LocalCTCTakenCount << "\n";); |
| |
| NumTailCallsPatched += NumLocalCTCs; |
| NumCandidateTailCalls += NumLocalCTCCandidates; |
| CTCExecCount += LocalCTCExecCount; |
| CTCTakenCount += LocalCTCTakenCount; |
| |
| return NumLocalCTCs > 0; |
| } |
| |
| void SimplifyConditionalTailCalls::runOnFunctions( |
| BinaryContext &BC, |
| std::map<uint64_t, BinaryFunction> &BFs, |
| std::set<uint64_t> & |
| ) { |
| for (auto &It : BFs) { |
| auto &Function = It.second; |
| |
| if (!shouldOptimize(Function)) |
| continue; |
| |
| if (fixTailCalls(BC, Function)) { |
| Modified.insert(&Function); |
| } |
| } |
| |
| outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched |
| << " tail calls (" << NumOrigForwardBranches << " forward, " |
| << NumOrigBackwardBranches << " backward)" |
| << " out of " << NumCandidateTailCalls << " candidates while removing " |
| << NumDoubleJumps << " double jumps" |
| << " and " << DeletedBlocks << " basic blocks" |
| << " totalling " << DeletedBytes |
| << " bytes of code. CTCs total execution count is " << CTCExecCount |
| << " and the number of times CTCs are taken is " << CTCTakenCount |
| << ".\n"; |
| } |
| |
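| // Shorten instructions to equivalent, smaller encodings where possible and |
| // return the number of instructions shortened. |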
| uint64_t Peepholes::shortenInstructions(BinaryContext &BC, |
| BinaryFunction &Function) { |
| MCInst DebugInst; |
| uint64_t Count = 0; |
| for (auto &BB : Function) { |
| for (auto &Inst : BB) { |
| if (opts::Verbosity > 1) { |
| DebugInst = Inst; |
| } |
| if (BC.MIA->shortenInstruction(Inst)) { |
| if (opts::Verbosity > 1) { |
| outs() << "BOLT-INFO: peephole, shortening:\n" |
| << "BOLT-INFO: "; |
| BC.printInstruction(outs(), DebugInst, 0, &Function); |
| outs() << "BOLT-INFO: to:"; |
| BC.printInstruction(outs(), Inst, 0, &Function); |
| } |
| ++Count; |
| } |
| } |
| } |
| return Count; |
| } |
| |
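| // Insert a trap instruction after every indirect tail call that terminates |
| // a basic block. |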
| void Peepholes::addTailcallTraps(BinaryContext &BC, |
| BinaryFunction &Function) { |
| for (auto &BB : Function) { |
| auto *Inst = BB.getLastNonPseudoInstr(); |
| if (Inst && BC.MIA->isTailCall(*Inst) && BC.MIA->isIndirectBranch(*Inst)) { |
| MCInst Trap; |
| if (BC.MIA->createTrap(Trap)) { |
| BB.addInstruction(Trap); |
| ++TailCallTraps; |
| } |
| } |
| } |
| } |
| |
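| // Remove conditional branches in blocks whose taken and fall-through |
| // successors are the same block. |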
| void Peepholes::removeUselessCondBranches(BinaryContext &BC, |
| BinaryFunction &Function) { |
| for (auto &BB : Function) { |
| if (BB.succ_size() != 2) |
| continue; |
| |
| auto *CondBB = BB.getConditionalSuccessor(true); |
| auto *UncondBB = BB.getConditionalSuccessor(false); |
| if (CondBB != UncondBB) |
| continue; |
| |
| const MCSymbol *TBB = nullptr; |
| const MCSymbol *FBB = nullptr; |
| MCInst *CondBranch = nullptr; |
| MCInst *UncondBranch = nullptr; |
| auto Result = BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch); |
| |
| // analyzeBranch() can fail due to unusual branch instructions, |
| // e.g. jrcxz, or jump tables (indirect jump). |
| if (!Result || !CondBranch) |
| continue; |
| |
| BB.removeDuplicateConditionalSuccessor(CondBranch); |
| ++NumUselessCondBranches; |
| } |
| } |
| |
| void Peepholes::runOnFunctions(BinaryContext &BC, |
| std::map<uint64_t, BinaryFunction> &BFs, |
| std::set<uint64_t> &LargeFunctions) { |
| const char Opts = |
| std::accumulate(opts::Peepholes.begin(), |
| opts::Peepholes.end(), |
| 0, |
| [](const char A, const opts::PeepholeOpts B) { |
| return A | B; |
| }); |
| if (Opts == opts::PEEP_NONE) |
| return; |
| |
| for (auto &It : BFs) { |
| auto &Function = It.second; |
| if (shouldOptimize(Function)) { |
| if (Opts & opts::PEEP_SHORTEN) |
| NumShortened += shortenInstructions(BC, Function); |
| if (Opts & opts::PEEP_DOUBLE_JUMPS) |
| NumDoubleJumps += fixDoubleJumps(BC, Function, false); |
| if (Opts & opts::PEEP_TAILCALL_TRAPS) |
| addTailcallTraps(BC, Function); |
| if (Opts & opts::PEEP_USELESS_BRANCHES) |
| removeUselessCondBranches(BC, Function); |
| } |
| } |
| outs() << "BOLT-INFO: Peephole: " << NumShortened |
| << " instructions shortened.\n" |
| << "BOLT-INFO: Peephole: " << NumDoubleJumps |
| << " double jumps patched.\n" |
| << "BOLT-INFO: Peephole: " << TailCallTraps |
| << " tail call traps inserted.\n" |
| << "BOLT-INFO: Peephole: " << NumUselessCondBranches |
| << " useless conditional branches removed.\n"; |
| } |
| |
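| // Replace loads from read-only data sections with immediate operands when |
| // the target address can be computed statically and is not subject to |
| // relocation. |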
| bool SimplifyRODataLoads::simplifyRODataLoads( |
| BinaryContext &BC, BinaryFunction &BF) { |
| auto &MIA = BC.MIA; |
| |
| uint64_t NumLocalLoadsSimplified = 0; |
| uint64_t NumDynamicLocalLoadsSimplified = 0; |
| uint64_t NumLocalLoadsFound = 0; |
| uint64_t NumDynamicLocalLoadsFound = 0; |
| |
| for (auto *BB : BF.layout()) { |
| for (auto &Inst : *BB) { |
| unsigned Opcode = Inst.getOpcode(); |
| const MCInstrDesc &Desc = BC.MII->get(Opcode); |
| |
| // Skip instructions that do not load from memory. |
| if (!Desc.mayLoad()) |
| continue; |
| |
| // Try to statically evaluate the target memory address. |
| uint64_t TargetAddress; |
| |
| if (MIA->hasPCRelOperand(Inst)) { |
| // Try to find the symbol that corresponds to the PC-relative operand. |
| auto DispOpI = MIA->getMemOperandDisp(Inst); |
| assert(DispOpI != Inst.end() && "expected PC-relative displacement"); |
| assert(DispOpI->isExpr() && |
| "found PC-relative with non-symbolic displacement"); |
| |
| // Get displacement symbol. |
| const MCSymbolRefExpr *DisplExpr; |
| if (!(DisplExpr = dyn_cast<MCSymbolRefExpr>(DispOpI->getExpr()))) |
| continue; |
| const MCSymbol &DisplSymbol = DisplExpr->getSymbol(); |
| |
| // Look up the symbol address in the global symbols map of the binary |
| // context object. |
| auto GI = BC.GlobalSymbols.find(DisplSymbol.getName()); |
| if (GI == BC.GlobalSymbols.end()) |
| continue; |
| TargetAddress = GI->second; |
| } else if (!MIA->evaluateMemOperandTarget(Inst, TargetAddress)) { |
| continue; |
| } |
| |
| // Get the contents of the section containing the target address of the |
| // memory operand. We are only interested in read-only sections. |
| auto DataSection = BC.getSectionForAddress(TargetAddress); |
| if (!DataSection || !DataSection->isReadOnly()) |
| continue; |
| |
| if (BC.getRelocationAt(TargetAddress)) |
| continue; |
| |
| uint32_t Offset = TargetAddress - DataSection->getAddress(); |
| StringRef ConstantData = DataSection->getContents(); |
| |
| ++NumLocalLoadsFound; |
| if (BB->hasProfile()) |
| NumDynamicLocalLoadsFound += BB->getExecutionCount(); |
| |
| if (MIA->replaceMemOperandWithImm(Inst, ConstantData, Offset)) { |
| ++NumLocalLoadsSimplified; |
| if (BB->hasProfile()) |
| NumDynamicLocalLoadsSimplified += BB->getExecutionCount(); |
| } |
| } |
| } |
| |
| NumLoadsFound += NumLocalLoadsFound; |
| NumDynamicLoadsFound += NumDynamicLocalLoadsFound; |
| NumLoadsSimplified += NumLocalLoadsSimplified; |
| NumDynamicLoadsSimplified += NumDynamicLocalLoadsSimplified; |
| |
| return NumLocalLoadsSimplified > 0; |
| } |
| |
| void SimplifyRODataLoads::runOnFunctions( |
| BinaryContext &BC, |
| std::map<uint64_t, BinaryFunction> &BFs, |
| std::set<uint64_t> & |
| ) { |
| for (auto &It : BFs) { |
| auto &Function = It.second; |
| if (shouldOptimize(Function) && simplifyRODataLoads(BC, Function)) { |
| Modified.insert(&Function); |
| } |
| } |
| |
| outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of " |
| << NumLoadsFound << " loads from a statically computed address.\n" |
| << "BOLT-INFO: dynamic loads simplified: " << NumDynamicLoadsSimplified |
| << "\n" |
| << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound << "\n"; |
| } |
| |
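| // Fold identical functions. Functions are first grouped into buckets of |
| // congruent functions (identical ignoring symbolic operands), then folded |
| // when they compare identical including symbols. The process is repeated |
| // until no more functions can be folded. |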
| void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC, |
| std::map<uint64_t, BinaryFunction> &BFs, |
| std::set<uint64_t> &) { |
| const auto OriginalFunctionCount = BFs.size(); |
| uint64_t NumFunctionsFolded = 0; |
| uint64_t NumJTFunctionsFolded = 0; |
| uint64_t BytesSavedEstimate = 0; |
| uint64_t CallsSavedEstimate = 0; |
| const bool UseDFS = opts::ICFUseDFS; |
| |
| // This hash table is used to identify identical functions. It maps |
| // a function to a bucket of functions identical to it. |
| struct KeyHash { |
| std::size_t operator()(const BinaryFunction *F) const { |
| return F->hash(/*Recompute=*/false); |
| } |
| }; |
| struct KeyCongruent { |
| bool operator()(const BinaryFunction *A, const BinaryFunction *B) const { |
| return A->isIdenticalWith(*B, /*IgnoreSymbols=*/true, /*UseDFS=*/UseDFS); |
| } |
| }; |
| struct KeyEqual { |
| bool operator()(const BinaryFunction *A, const BinaryFunction *B) const { |
| return A->isIdenticalWith(*B, /*IgnoreSymbols=*/false, /*UseDFS=*/UseDFS); |
| } |
| }; |
| |
| // Create buckets with congruent functions - functions that potentially could |
| // be folded. |
| std::unordered_map<BinaryFunction *, std::set<BinaryFunction *>, |
| KeyHash, KeyCongruent> CongruentBuckets; |
| for (auto &BFI : BFs) { |
| auto &BF = BFI.second; |
| if (!shouldOptimize(BF) || BF.isFolded()) |
| continue; |
| |
| // Make sure indices are in-order. |
| BF.updateLayoutIndices(); |
| |
| // Pre-compute hash before pushing into hashtable. |
| BF.hash(/*Recompute=*/true, /*UseDFS=*/UseDFS); |
| |
| CongruentBuckets[&BF].emplace(&BF); |
| } |
| |
| // We repeat the pass until no new modifications happen. |
| unsigned Iteration = 1; |
| uint64_t NumFoldedLastIteration; |
| do { |
| NumFoldedLastIteration = 0; |
| |
| DEBUG(dbgs() << "BOLT-DEBUG: ICF iteration " << Iteration << "...\n"); |
| |
| for (auto &CBI : CongruentBuckets) { |
| auto &Candidates = CBI.second; |
| if (Candidates.size() < 2) |
| continue; |
| |
| // Identical functions go into the same bucket. |
| std::unordered_map<BinaryFunction *, std::vector<BinaryFunction *>, |
| KeyHash, KeyEqual> IdenticalBuckets; |
| for (auto *BF : Candidates) { |
| IdenticalBuckets[BF].emplace_back(BF); |
| } |
| |
| for (auto &IBI : IdenticalBuckets) { |
| // Functions identified as identical. |
| auto &Twins = IBI.second; |
| if (Twins.size() < 2) |
| continue; |
| |
| // Fold functions. Keep the order consistent across invocations with |
| // different options. |
| std::stable_sort(Twins.begin(), Twins.end(), |
| [](const BinaryFunction *A, const BinaryFunction *B) { |
| return A->getFunctionNumber() < B->getFunctionNumber(); |
| }); |
| |
| BinaryFunction *ParentBF = Twins[0]; |
| for (unsigned i = 1; i < Twins.size(); ++i) { |
| auto *ChildBF = Twins[i]; |
| DEBUG(dbgs() << "BOLT-DEBUG: folding " << *ChildBF << " into " |
| << *ParentBF << '\n'); |
| |
| // Remove child function from the list of candidates. |
| auto FI = Candidates.find(ChildBF); |
| assert(FI != Candidates.end() && |
| "function expected to be in the set"); |
| Candidates.erase(FI); |
| |
| // Fold the function and remove from the list of processed functions. |
| BytesSavedEstimate += ChildBF->getSize(); |
| CallsSavedEstimate += std::min(ChildBF->getKnownExecutionCount(), |
| ParentBF->getKnownExecutionCount()); |
| BC.foldFunction(*ChildBF, *ParentBF, BFs); |
| |
| ++NumFoldedLastIteration; |
| |
| if (ParentBF->hasJumpTables()) |
| ++NumJTFunctionsFolded; |
| } |
| } |
| |
| } |
| NumFunctionsFolded += NumFoldedLastIteration; |
| ++Iteration; |
| |
| } while (NumFoldedLastIteration > 0); |
| |
| DEBUG( |
| // Print functions that are congruent but not identical. |
| for (auto &CBI : CongruentBuckets) { |
| auto &Candidates = CBI.second; |
| if (Candidates.size() < 2) |
| continue; |
| dbgs() << "BOLT-DEBUG: the following " << Candidates.size() |
| << " functions (each of size " << (*Candidates.begin())->getSize() |
| << " bytes) are congruent but not identical:\n"; |
| for (auto *BF : Candidates) { |
| dbgs() << " " << *BF; |
| if (BF->getKnownExecutionCount()) { |
| dbgs() << " (executed " << BF->getKnownExecutionCount() << " times)"; |
| } |
| dbgs() << '\n'; |
| } |
| } |
| ); |
| |
| if (NumFunctionsFolded) { |
| outs() << "BOLT-INFO: ICF folded " << NumFunctionsFolded |
| << " out of " << OriginalFunctionCount << " functions in " |
| << Iteration << " passes. " |
| << NumJTFunctionsFolded << " functions had jump tables.\n" |
| << "BOLT-INFO: Removing all identical functions will save " |
| << format("%.2lf", (double) BytesSavedEstimate / 1024) |
| << " KB of code space. Folded functions were called " |
| << CallsSavedEstimate << " times based on profile.\n"; |
| } |
| } |
| |
| void |
| PrintProgramStats::runOnFunctions(BinaryContext &BC, |
| std::map<uint64_t, BinaryFunction> &BFs, |
| std::set<uint64_t> &) { |
| uint64_t NumSimpleFunctions{0}; |
| uint64_t NumStaleProfileFunctions{0}; |
| std::vector<BinaryFunction *> ProfiledFunctions; |
| const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n"; |
| for (auto &BFI : BFs) { |
| auto &Function = BFI.second; |
| if (!Function.isSimple()) |
| continue; |
| ++NumSimpleFunctions; |
| if (Function.getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE) |
| continue; |
| if (Function.hasValidProfile()) |
| ProfiledFunctions.push_back(&Function); |
| else { |
| if (opts::ReportStaleFuncs) { |
| outs() << StaleFuncsHeader; |
| StaleFuncsHeader = ""; |
| outs() << " " << Function << '\n'; |
| } |
| ++NumStaleProfileFunctions; |
| } |
| } |
| BC.NumProfiledFuncs = ProfiledFunctions.size(); |
| |
| const auto NumAllProfiledFunctions = |
| ProfiledFunctions.size() + NumStaleProfileFunctions; |
| outs() << "BOLT-INFO: " |
| << NumAllProfiledFunctions |
| << " functions out of " << NumSimpleFunctions << " simple functions (" |
| << format("%.1f", NumAllProfiledFunctions / |
| (float) NumSimpleFunctions * 100.0f) |
| << "%) have non-empty execution profile.\n"; |
| if (NumStaleProfileFunctions) { |
| outs() << "BOLT-INFO: " << NumStaleProfileFunctions |
| << format(" (%.1f%% of all profiled)", |
| NumStaleProfileFunctions / |
| (float) NumAllProfiledFunctions * 100.0f) |
| << " function" << (NumStaleProfileFunctions == 1 ? "" : "s") |
| << " have invalid (possibly stale) profile.\n"; |
| } |
| |
| // Profile is marked as 'Used' if it either matches a function name |
| // exactly, or if it 100% matches any of the functions with a matching |
| // common LTO name. |
| auto getUnusedObjects = [&]() -> Optional<std::vector<StringRef>> { |
| std::vector<StringRef> UnusedObjects; |
| for (const auto &Func : BC.DR.getAllFuncsData()) { |
| if (!Func.getValue().Used) { |
| UnusedObjects.emplace_back(Func.getKey()); |
| } |
| } |
| if (UnusedObjects.empty()) |
| return NoneType(); |
| return UnusedObjects; |
| }; |
| |
| if (const auto UnusedObjects = getUnusedObjects()) { |
| outs() << "BOLT-INFO: profile for " << UnusedObjects->size() |
| << " objects was ignored\n"; |
| if (opts::Verbosity >= 1) { |
| for (auto Name : *UnusedObjects) { |
| outs() << " " << Name << '\n'; |
| } |
| } |
| } |
| |
| if (ProfiledFunctions.size() > 10) { |
| if (opts::Verbosity >= 1) { |
| outs() << "BOLT-INFO: top called functions are:\n"; |
| std::sort(ProfiledFunctions.begin(), ProfiledFunctions.end(), |
| [](BinaryFunction *A, BinaryFunction *B) { |
| return B->getExecutionCount() < A->getExecutionCount(); |
| } |
| ); |
| auto SFI = ProfiledFunctions.begin(); |
| auto SFIend = ProfiledFunctions.end(); |
| for (auto i = 0u; i < opts::TopCalledLimit && SFI != SFIend; ++SFI, ++i) { |
| outs() << " " << **SFI << " : " |
| << (*SFI)->getExecutionCount() << '\n'; |
| } |
| } |
| } |
| |
| if (!opts::PrintSortedBy.empty() && |
| std::find(opts::PrintSortedBy.begin(), |
| opts::PrintSortedBy.end(), |
| DynoStats::FIRST_DYNO_STAT) == opts::PrintSortedBy.end()) { |
| |
| std::vector<const BinaryFunction *> Functions; |
| std::map<const BinaryFunction *, DynoStats> Stats; |
| |
| for (const auto &BFI : BFs) { |
| const auto &BF = BFI.second; |
| if (shouldOptimize(BF) && BF.hasValidProfile()) { |
| Functions.push_back(&BF); |
| Stats.emplace(&BF, BF.getDynoStats()); |
| } |
| } |
| |
| const bool SortAll = |
| std::find(opts::PrintSortedBy.begin(), |
| opts::PrintSortedBy.end(), |
| DynoStats::LAST_DYNO_STAT) != opts::PrintSortedBy.end(); |
| |
| const bool Ascending = |
| opts::DynoStatsSortOrderOpt == opts::DynoStatsSortOrder::Ascending; |
| |
| if (SortAll) { |
| std::stable_sort( |
| Functions.begin(), |
| Functions.end(), |
| [Ascending,&Stats](const BinaryFunction *A, const BinaryFunction *B) { |
| return Ascending ? |
| Stats.at(A) < Stats.at(B) : Stats.at(B) < Stats.at(A); |
| } |
| ); |
| } else { |
| std::stable_sort( |
| Functions.begin(), |
| Functions.end(), |
| [Ascending,&Stats](const BinaryFunction *A, const BinaryFunction *B) { |
| const auto &StatsA = Stats.at(A); |
| const auto &StatsB = Stats.at(B); |
| return Ascending |
| ? StatsA.lessThan(StatsB, opts::PrintSortedBy) |
| : StatsB.lessThan(StatsA, opts::PrintSortedBy); |
| } |
| ); |
| } |
| |
| outs() << "BOLT-INFO: top functions sorted by "; |
| if (SortAll) { |
| outs() << "dyno stats"; |
| } else { |
| outs() << "("; |
| bool PrintComma = false; |
| for (const auto Category : opts::PrintSortedBy) { |
| if (PrintComma) outs() << ", "; |
| outs() << DynoStats::Description(Category); |
| PrintComma = true; |
| } |
| outs() << ")"; |
| } |
| |
| outs() << " are:\n"; |
| auto SFI = Functions.begin(); |
| for (unsigned i = 0; i < 100 && SFI != Functions.end(); ++SFI, ++i) { |
| const auto Stats = (*SFI)->getDynoStats(); |
| outs() << " " << **SFI; |
| if (!SortAll) { |
| outs() << " ("; |
| bool PrintComma = false; |
| for (const auto Category : opts::PrintSortedBy) { |
| if (PrintComma) outs() << ", "; |
| outs() << dynoStatsOptName(Category) << "=" << Stats[Category]; |
| PrintComma = true; |
| } |
| outs() << ")"; |
| } |
| outs() << "\n"; |
| } |
| } |
| } |
| |
| void InstructionLowering::runOnFunctions( |
| BinaryContext &BC, |
| std::map<uint64_t, BinaryFunction> &BFs, |
| std::set<uint64_t> &LargeFunctions) { |
| for (auto &BFI : BFs) { |
| for (auto &BB : BFI.second) { |
| for (auto &Instruction : BB) { |
| BC.MIA->lowerTailCall(Instruction); |
| } |
| } |
| } |
| } |
| |
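| // Remove 'repz' prefixes from return instructions and report the estimated |
| // dynamic number of prefixes removed. |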
| void StripRepRet::runOnFunctions( |
| BinaryContext &BC, |
| std::map<uint64_t, BinaryFunction> &BFs, |
| std::set<uint64_t> &LargeFunctions) { |
| uint64_t NumPrefixesRemoved = 0; |
| uint64_t NumBytesSaved = 0; |
| for (auto &BFI : BFs) { |
| for (auto &BB : BFI.second) { |
| auto LastInstRIter = BB.getLastNonPseudo(); |
| if (LastInstRIter == BB.rend() || |
| !BC.MIA->isReturn(*LastInstRIter) || |
| !BC.MIA->deleteREPPrefix(*LastInstRIter)) |
| continue; |
| |
| NumPrefixesRemoved += BB.getKnownExecutionCount(); |
| ++NumBytesSaved; |
| } |
| } |
| |
| if (NumBytesSaved) { |
| outs() << "BOLT-INFO: removed " << NumBytesSaved << " 'repz' prefixes" |
| " with an estimated execution count of " << NumPrefixesRemoved |
| << ".\n"; |
| } |
| } |
| |
| } // namespace bolt |
| } // namespace llvm |