| //===- bolt/Passes/SplitFunctions.cpp - Pass for splitting function code --===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file implements the SplitFunctions pass. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "bolt/Passes/SplitFunctions.h" |
| #include "bolt/Core/BinaryBasicBlock.h" |
| #include "bolt/Core/BinaryFunction.h" |
| #include "bolt/Core/FunctionLayout.h" |
| #include "bolt/Core/ParallelUtilities.h" |
| #include "bolt/Utils/CommandLineOpts.h" |
| #include "llvm/ADT/STLExtras.h" |
| #include "llvm/ADT/SmallVector.h" |
| #include "llvm/ADT/iterator_range.h" |
| #include "llvm/Support/CommandLine.h" |
| #include "llvm/Support/FormatVariadic.h" |
| #include <algorithm> |
| #include <iterator> |
| #include <memory> |
| #include <numeric> |
| #include <random> |
| #include <vector> |
| |
| #define DEBUG_TYPE "bolt-opts" |
| |
| using namespace llvm; |
| using namespace bolt; |
| |
| namespace { |
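| // Parser that accepts the deprecated numeric values of -split-functions. |
| // For example, `--split-functions=2` (a legacy numeric mode) is treated as |
| // plain `--split-functions` and triggers a deprecation warning. |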
| class DeprecatedSplitFunctionOptionParser : public cl::parser<bool> { |
| public: |
| explicit DeprecatedSplitFunctionOptionParser(cl::Option &O) |
| : cl::parser<bool>(O) {} |
| |
| bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, bool &Value) { |
| if (Arg == "2" || Arg == "3") { |
| Value = true; |
| errs() << formatv("BOLT-WARNING: specifying non-boolean value \"{0}\" " |
| "for option -{1} is deprecated\n", |
| Arg, ArgName); |
| return false; |
| } |
| return cl::parser<bool>::parse(O, ArgName, Arg, Value); |
| } |
| }; |
| } // namespace |
| |
| namespace opts { |
| |
| extern cl::OptionCategory BoltOptCategory; |
| |
| extern cl::opt<bool> SplitEH; |
| extern cl::opt<unsigned> ExecutionCountThreshold; |
| extern cl::opt<uint32_t> RandomSeed; |
| |
| static cl::opt<bool> AggressiveSplitting( |
| "split-all-cold", cl::desc("outline as many cold basic blocks as possible"), |
| cl::cat(BoltOptCategory)); |
| |
| static cl::opt<unsigned> SplitAlignThreshold( |
| "split-align-threshold", |
| cl::desc("when deciding to split a function, apply this alignment " |
| "while doing the size comparison (see -split-threshold). " |
| "Default value: 2."), |
| cl::init(2), cl::Hidden, cl::cat(BoltOptCategory)); |
| |
| static cl::opt<bool, false, DeprecatedSplitFunctionOptionParser> |
| SplitFunctions("split-functions", |
| cl::desc("split functions into fragments"), |
| cl::cat(BoltOptCategory)); |
| |
| static cl::opt<unsigned> SplitThreshold( |
| "split-threshold", |
| cl::desc("split function only if its main size is reduced by more than " |
| "given amount of bytes. Default value: 0, i.e. split iff the " |
| "size is reduced. Note that on some architectures the size can " |
| "increase after splitting."), |
| cl::init(0), cl::Hidden, cl::cat(BoltOptCategory)); |
| |
| static cl::opt<SplitFunctionsStrategy> SplitStrategy( |
| "split-strategy", cl::init(SplitFunctionsStrategy::Profile2), |
| cl::values(clEnumValN(SplitFunctionsStrategy::Profile2, "profile2", |
| "split each function into a hot and cold fragment " |
| "using profiling information")), |
| cl::values(clEnumValN(SplitFunctionsStrategy::CDSplit, "cdsplit", |
| "split each function into a hot, warm, and cold " |
| "fragment using profiling information")), |
| cl::values(clEnumValN( |
| SplitFunctionsStrategy::Random2, "random2", |
| "split each function into a hot and cold fragment at a randomly chosen " |
| "split point (ignoring any available profiling information)")), |
| cl::values(clEnumValN( |
| SplitFunctionsStrategy::RandomN, "randomN", |
| "split each function into N fragments at a randomly chosen split " |
| "points (ignoring any available profiling information)")), |
| cl::values(clEnumValN( |
| SplitFunctionsStrategy::All, "all", |
| "split all basic blocks of each function into fragments such that each " |
| "fragment contains exactly a single basic block")), |
| cl::desc("strategy used to partition blocks into fragments"), |
| cl::cat(BoltOptCategory)); |
| |
| static cl::opt<double> CallScale( |
| "call-scale", |
| cl::desc("Call score scale coefficient (when --split-strategy=cdsplit)"), |
| cl::init(0.95), cl::ReallyHidden, cl::cat(BoltOptCategory)); |
| |
| static cl::opt<double> |
| CallPower("call-power", |
| cl::desc("Call score power (when --split-strategy=cdsplit)"), |
| cl::init(0.05), cl::ReallyHidden, cl::cat(BoltOptCategory)); |
| |
| static cl::opt<double> |
| JumpPower("jump-power", |
| cl::desc("Jump score power (when --split-strategy=cdsplit)"), |
| cl::init(0.15), cl::ReallyHidden, cl::cat(BoltOptCategory)); |
| } // namespace opts |
| |
| namespace { |
| bool hasFullProfile(const BinaryFunction &BF) { |
| return llvm::all_of(BF.blocks(), [](const BinaryBasicBlock &BB) { |
| return BB.getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE; |
| }); |
| } |
| |
| bool allBlocksCold(const BinaryFunction &BF) { |
| return llvm::all_of(BF.blocks(), [](const BinaryBasicBlock &BB) { |
| return BB.getExecutionCount() == 0; |
| }); |
| } |
| |
| struct SplitProfile2 final : public SplitStrategy { |
| bool canSplit(const BinaryFunction &BF) override { |
| return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF); |
| } |
| |
| bool compactFragments() override { return true; } |
| |
| void fragment(const BlockIt Start, const BlockIt End) override { |
| for (BinaryBasicBlock *const BB : llvm::make_range(Start, End)) { |
| if (BB->getExecutionCount() == 0) |
| BB->setFragmentNum(FragmentNum::cold()); |
| } |
| } |
| }; |
| |
| struct SplitCacheDirected final : public SplitStrategy { |
| BinaryContext &BC; |
| using BasicBlockOrder = BinaryFunction::BasicBlockOrderType; |
| |
| bool canSplit(const BinaryFunction &BF) override { |
| return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF); |
| } |
| |
| explicit SplitCacheDirected(BinaryContext &BC) : BC(BC) { |
| initializeAuxiliaryVariables(); |
| buildCallGraph(); |
| } |
| |
| // When some functions are hot-warm split and others are hot-warm-cold split, |
| // we do not want to change the fragment numbers of the blocks in the hot-warm |
| // split functions. |
| bool compactFragments() override { return false; } |
| |
| void fragment(const BlockIt Start, const BlockIt End) override { |
| BasicBlockOrder BlockOrder(Start, End); |
| BinaryFunction &BF = *BlockOrder.front()->getFunction(); |
| // No need to re-split small functions. |
| if (BlockOrder.size() <= 2) |
| return; |
| |
| size_t BestSplitIndex = findSplitIndex(BF, BlockOrder); |
| assert(BestSplitIndex < BlockOrder.size()); |
| |
| // Assign fragments based on the computed best split index. |
| // All basic blocks with index up to the best split index become hot. |
| // All remaining blocks become warm or cold depending on whether their |
| // execution count is greater than zero. |
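| // For example, with BestSplitIndex = 2 and counts [5, 3, 0, 2, 0], the |
| // blocks get fragments [main, main, main, warm, cold]. |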
| for (size_t Index = 0; Index < BlockOrder.size(); Index++) { |
| BinaryBasicBlock *BB = BlockOrder[Index]; |
| if (Index <= BestSplitIndex) |
| BB->setFragmentNum(FragmentNum::main()); |
| else |
| BB->setFragmentNum(BB->getKnownExecutionCount() > 0 |
| ? FragmentNum::warm() |
| : FragmentNum::cold()); |
| } |
| } |
| |
| private: |
| struct CallInfo { |
| size_t Length; |
| size_t Count; |
| }; |
| |
| struct SplitScore { |
| size_t SplitIndex = size_t(-1); |
| size_t HotSizeReduction = 0; |
| double LocalScore = 0; |
| double CoverCallScore = 0; |
| |
| double sum() const { return LocalScore + CoverCallScore; } |
| }; |
| |
| // Auxiliary variables used by the algorithm. |
| size_t TotalNumBlocks{0}; |
| size_t OrigHotSectionSize{0}; |
| DenseMap<const BinaryBasicBlock *, size_t> GlobalIndices; |
| DenseMap<const BinaryBasicBlock *, size_t> BBSizes; |
| DenseMap<const BinaryBasicBlock *, size_t> BBOffsets; |
| |
| // Call graph. |
| std::vector<SmallVector<const BinaryBasicBlock *, 0>> Callers; |
| std::vector<SmallVector<const BinaryBasicBlock *, 0>> Callees; |
| |
| bool shouldConsiderForCallGraph(const BinaryFunction &BF) { |
| // Only a subset of the functions in the binary will be considered |
| // for initializing auxiliary variables and building the call graph. |
| return BF.hasValidIndex() && BF.hasValidProfile() && !BF.empty(); |
| } |
| |
| void initializeAuxiliaryVariables() { |
| for (BinaryFunction *BF : BC.getSortedFunctions()) { |
| if (!shouldConsiderForCallGraph(*BF)) |
| continue; |
| |
| // Calculate the size of each BB after hot-cold splitting. |
| // This populates BinaryBasicBlock::OutputAddressRange which |
| // can be used to compute the size of each BB. |
| BC.calculateEmittedSize(*BF, /*FixBranches=*/true); |
| |
| for (BinaryBasicBlock *BB : BF->getLayout().blocks()) { |
| // Unique global index. |
| GlobalIndices[BB] = TotalNumBlocks; |
| TotalNumBlocks++; |
| |
| // Block size after hot-cold splitting. |
| BBSizes[BB] = BB->getOutputSize(); |
| |
| // Hot block offset after hot-cold splitting. |
| BBOffsets[BB] = OrigHotSectionSize; |
| if (!BB->isSplit()) |
| OrigHotSectionSize += BBSizes[BB]; |
| } |
| } |
| } |
| |
| void buildCallGraph() { |
| Callers.resize(TotalNumBlocks); |
| Callees.resize(TotalNumBlocks); |
| for (const BinaryFunction *SrcFunction : BC.getSortedFunctions()) { |
| if (!shouldConsiderForCallGraph(*SrcFunction)) |
| continue; |
| |
| for (BinaryBasicBlock &SrcBB : SrcFunction->blocks()) { |
| // Skip blocks that are not executed |
| if (SrcBB.getKnownExecutionCount() == 0) |
| continue; |
| |
| // Find call instructions and extract target symbols from each one. |
| for (const MCInst &Inst : SrcBB) { |
| if (!BC.MIB->isCall(Inst)) |
| continue; |
| |
| // Call info |
| const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst); |
| // Ignore calls without a known target symbol. |
| if (!DstSym) |
| continue; |
| |
| const BinaryFunction *DstFunction = BC.getFunctionForSymbol(DstSym); |
| // Ignore calls that do not have a valid target, but do not ignore |
| // recursive calls, because the caller block could be moved to warm. |
| if (!DstFunction || DstFunction->getLayout().block_empty()) |
| continue; |
| |
| const BinaryBasicBlock *DstBB = &(DstFunction->front()); |
| |
| // Record the call only if DstBB is also among the functions considered |
| // for the call graph. |
| if (GlobalIndices.contains(DstBB)) { |
| Callers[GlobalIndices[DstBB]].push_back(&SrcBB); |
| Callees[GlobalIndices[&SrcBB]].push_back(DstBB); |
| } |
| } |
| } |
| } |
| } |
| |
| /// Populate BinaryBasicBlock::OutputAddressRange with estimated basic block |
| /// start and end addresses for hot and warm basic blocks, assuming hot-warm |
| /// splitting happens at \p SplitIndex. Also return estimated end addresses |
| /// of the hot fragment before and after splitting. |
| /// The estimates take into account the potential addition of branch |
| /// instructions due to split fall-through branches as well as the need to |
| /// use longer branch instructions for split (un)conditional branches. |
| std::pair<size_t, size_t> |
| estimatePostSplitBBAddress(const BasicBlockOrder &BlockOrder, |
| const size_t SplitIndex) { |
| assert(SplitIndex < BlockOrder.size() && "Invalid split index"); |
| |
| // Update function layout assuming hot-warm splitting at SplitIndex. |
| for (size_t Index = 0; Index < BlockOrder.size(); Index++) { |
| BinaryBasicBlock *BB = BlockOrder[Index]; |
| if (BB->getFragmentNum() == FragmentNum::cold()) |
| break; |
| BB->setFragmentNum(Index <= SplitIndex ? FragmentNum::main() |
| : FragmentNum::warm()); |
| } |
| BinaryFunction *BF = BlockOrder[0]->getFunction(); |
| BF->getLayout().update(BlockOrder); |
| // Populate BB.OutputAddressRange under the updated layout. |
| BC.calculateEmittedSize(*BF); |
| |
| // Populate BB.OutputAddressRange with estimated new start and end |
| // addresses, and compute the old and new end addresses of the hot section. |
| size_t OldHotEndAddr{0}; |
| size_t NewHotEndAddr{0}; |
| size_t CurrentAddr = BBOffsets[BlockOrder[0]]; |
| for (BinaryBasicBlock *BB : BlockOrder) { |
| // We only care about new addresses of blocks in hot/warm. |
| if (BB->getFragmentNum() == FragmentNum::cold()) |
| break; |
| const size_t NewSize = BB->getOutputSize(); |
| BB->setOutputStartAddress(CurrentAddr); |
| CurrentAddr += NewSize; |
| BB->setOutputEndAddress(CurrentAddr); |
| if (BB->getLayoutIndex() == SplitIndex) { |
| NewHotEndAddr = CurrentAddr; |
| // Approximate the start address of the warm fragment of the current |
| // function using the original hot section size. |
| CurrentAddr = OrigHotSectionSize; |
| } |
| OldHotEndAddr = BBOffsets[BB] + BBSizes[BB]; |
| } |
| return std::make_pair(OldHotEndAddr, NewHotEndAddr); |
| } |
| |
| /// Get a collection of "shortenable" calls, that is, calls of type X->Y |
| /// when the function order is [... X ... BF ... Y ...]. |
| /// If the hot fragment size of BF is reduced, then such calls are guaranteed |
| /// to get shorter by the reduced hot fragment size. |
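| /// Schematically, given the function order [... X ... BF ... Y ...]: |
| ///   a call X --> Y spans BF, so any reduction of BF's hot fragment brings |
| ///   X and Y closer together and shortens the call. |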
| std::vector<CallInfo> extractCoverCalls(const BinaryFunction &BF) { |
| // Record the length and the count of the calls that can be shortened. |
| std::vector<CallInfo> CoverCalls; |
| if (opts::CallScale == 0) |
| return CoverCalls; |
| |
| const BinaryFunction *ThisBF = &BF; |
| const BinaryBasicBlock *ThisBB = &(ThisBF->front()); |
| const size_t ThisGI = GlobalIndices[ThisBB]; |
| |
| for (const BinaryFunction *DstBF : BC.getSortedFunctions()) { |
| if (!shouldConsiderForCallGraph(*DstBF)) |
| continue; |
| |
| const BinaryBasicBlock *DstBB = &(DstBF->front()); |
| if (DstBB->getKnownExecutionCount() == 0) |
| continue; |
| |
| const size_t DstGI = GlobalIndices[DstBB]; |
| for (const BinaryBasicBlock *SrcBB : Callers[DstGI]) { |
| const BinaryFunction *SrcBF = SrcBB->getFunction(); |
| if (ThisBF == SrcBF) |
| continue; |
| |
| const size_t CallCount = SrcBB->getKnownExecutionCount(); |
| |
| const size_t SrcGI = GlobalIndices[SrcBB]; |
| |
| const bool IsCoverCall = (SrcGI < ThisGI && ThisGI < DstGI) || |
| (DstGI <= ThisGI && ThisGI < SrcGI); |
| if (!IsCoverCall) |
| continue; |
| |
| const size_t SrcBBEndAddr = BBOffsets[SrcBB] + BBSizes[SrcBB]; |
| const size_t DstBBStartAddr = BBOffsets[DstBB]; |
| const size_t CallLength = |
| AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr); |
| const CallInfo CI{CallLength, CallCount}; |
| CoverCalls.emplace_back(CI); |
| } |
| } |
| return CoverCalls; |
| } |
| |
| /// Compute the edge score of a call edge. |
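| /// The score is CallScale * Count / (Length + 1)^CallPower. For intuition, |
| /// with the default CallScale = 0.95 and CallPower = 0.05, a call executed |
| /// 100 times across 1023 bytes scores 0.95 * 100 / 1024^0.05 ~= 67.2. |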
| double computeCallScore(uint64_t CallCount, size_t CallLength) { |
| // Increase call lengths by 1 to avoid raising 0 to a negative power. |
| return opts::CallScale * static_cast<double>(CallCount) / |
| std::pow(static_cast<double>(CallLength + 1), opts::CallPower); |
| } |
| |
| /// Compute the edge score of a jump (branch) edge. |
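| /// The score is Count / (Length + 1)^JumpPower. For intuition, with the |
| /// default JumpPower = 0.15, a jump taken 100 times across 1023 bytes |
| /// scores 100 / 1024^0.15 ~= 35.4. |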
| double computeJumpScore(uint64_t JumpCount, size_t JumpLength) { |
| // Increase jump lengths by 1 to avoid raising 0 to a negative power. |
| return static_cast<double>(JumpCount) / |
| std::pow(static_cast<double>(JumpLength + 1), opts::JumpPower); |
| } |
| |
| /// Compute sum of scores over jumps within \p BlockOrder given \p SplitIndex. |
| /// Increment Score.LocalScore in place by the sum. |
| void computeJumpScore(const BasicBlockOrder &BlockOrder, |
| const size_t SplitIndex, SplitScore &Score) { |
| for (const BinaryBasicBlock *SrcBB : BlockOrder) { |
| if (SrcBB->getKnownExecutionCount() == 0) |
| continue; |
| |
| const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second; |
| |
| for (const auto Pair : zip(SrcBB->successors(), SrcBB->branch_info())) { |
| const BinaryBasicBlock *DstBB = std::get<0>(Pair); |
| const BinaryBasicBlock::BinaryBranchInfo &Branch = std::get<1>(Pair); |
| const size_t JumpCount = Branch.Count; |
| |
| if (JumpCount == 0) |
| continue; |
| |
| const size_t DstBBStartAddr = DstBB->getOutputAddressRange().first; |
| const size_t NewJumpLength = |
| AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr); |
| Score.LocalScore += computeJumpScore(JumpCount, NewJumpLength); |
| } |
| } |
| } |
| |
| /// Compute sum of scores over calls originating in the current function |
| /// given \p SplitIndex. Increment Score.LocalScore in place by the sum. |
| void computeLocalCallScore(const BasicBlockOrder &BlockOrder, |
| const size_t SplitIndex, SplitScore &Score) { |
| if (opts::CallScale == 0) |
| return; |
| |
| // Global index of the last block in the current function. |
| // This is later used to determine whether a call originating in the |
| // current function targets a function that comes after it. |
| const size_t LastGlobalIndex = GlobalIndices[BlockOrder.back()]; |
| |
| // The length of calls originating in the input function can increase or |
| // decrease depending on the splitting decision. |
| for (const BinaryBasicBlock *SrcBB : BlockOrder) { |
| const size_t CallCount = SrcBB->getKnownExecutionCount(); |
| // The block's execution count approximates the count of each call it |
| // makes; skip blocks that were never executed. |
| if (CallCount == 0) |
| continue; |
| |
| // Obtain an estimate of the end address of the src basic block |
| // after splitting at SplitIndex. |
| const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second; |
| |
| for (const BinaryBasicBlock *DstBB : Callees[GlobalIndices[SrcBB]]) { |
| // Obtain an estimate of the start address of the dst basic block |
| // after splitting at SplitIndex. If DstBB is in a function before |
| // the current function, then its start address remains unchanged. |
| size_t DstBBStartAddr = BBOffsets[DstBB]; |
| // If DstBB is in a function after the current function, then its |
| // start address should be adjusted based on the reduction in hot size. |
| if (GlobalIndices[DstBB] > LastGlobalIndex) { |
| assert(DstBBStartAddr >= Score.HotSizeReduction); |
| DstBBStartAddr -= Score.HotSizeReduction; |
| } |
| const size_t NewCallLength = |
| AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr); |
| Score.LocalScore += computeCallScore(CallCount, NewCallLength); |
| } |
| } |
| } |
| |
| /// Compute sum of splitting scores for cover calls of the input function. |
| /// Increment Score.CoverCallScore in place by the sum. |
| void computeCoverCallScore(const BasicBlockOrder &BlockOrder, |
| const size_t SplitIndex, |
| const std::vector<CallInfo> &CoverCalls, |
| SplitScore &Score) { |
| if (opts::CallScale == 0) |
| return; |
| |
| for (const CallInfo CI : CoverCalls) { |
| assert(CI.Length >= Score.HotSizeReduction && |
| "Length of cover calls must exceed reduced size of hot fragment."); |
| // Compute the new length of the call, which is shorter than the original |
| // one by the reduction in hot fragment size. |
| const size_t NewCallLength = CI.Length - Score.HotSizeReduction; |
| Score.CoverCallScore += computeCallScore(CI.Count, NewCallLength); |
| } |
| } |
| |
| /// Compute the split score of splitting a function at a given index. |
| /// The split score consists of local score and cover score. This function |
| /// returns \p Score of SplitScore type. It contains the local score and |
| /// cover score of the current splitting index. For easier bookkeeping and |
| /// comparison, it also stores the split index and the resulting reduction |
| /// in hot fragment size. |
| SplitScore computeSplitScore(const BinaryFunction &BF, |
| const BasicBlockOrder &BlockOrder, |
| const size_t SplitIndex, |
| const std::vector<CallInfo> &CoverCalls) { |
| // Populate BinaryBasicBlock::OutputAddressRange with estimated |
| // new start and end addresses after hot-warm splitting at SplitIndex. |
| size_t OldHotEnd; |
| size_t NewHotEnd; |
| std::tie(OldHotEnd, NewHotEnd) = |
| estimatePostSplitBBAddress(BlockOrder, SplitIndex); |
| |
| SplitScore Score; |
| Score.SplitIndex = SplitIndex; |
| |
| // It's not worth splitting if OldHotEnd < NewHotEnd. |
| if (OldHotEnd < NewHotEnd) |
| return Score; |
| |
| // Hot fragment size reduction due to splitting. |
| Score.HotSizeReduction = OldHotEnd - NewHotEnd; |
| |
| // First part of LocalScore is the sum over call edges originating in the |
| // input function. These edges can get shorter or longer depending on |
| // SplitIndex. Score.LocalScore is incremented in place. |
| computeLocalCallScore(BlockOrder, SplitIndex, Score); |
| |
| // Second part of LocalScore is the sum over jump edges with both src and |
| // dst basic blocks in the current function. Score.LocalScore is |
| // incremented in place. |
| computeJumpScore(BlockOrder, SplitIndex, Score); |
| |
| // Compute CoverCallScore and add it to Score in place. |
| computeCoverCallScore(BlockOrder, SplitIndex, CoverCalls, Score); |
| return Score; |
| } |
| |
| /// Find the most likely successor of a basic block when it has one or two |
| /// successors. Return nullptr otherwise. |
| const BinaryBasicBlock *getMostLikelySuccessor(const BinaryBasicBlock *BB) { |
| if (BB->succ_size() == 1) |
| return BB->getSuccessor(); |
| if (BB->succ_size() == 2) { |
| uint64_t TakenCount = BB->getTakenBranchInfo().Count; |
| assert(TakenCount != BinaryBasicBlock::COUNT_NO_PROFILE); |
| uint64_t NonTakenCount = BB->getFallthroughBranchInfo().Count; |
| assert(NonTakenCount != BinaryBasicBlock::COUNT_NO_PROFILE); |
| if (TakenCount > NonTakenCount) |
| return BB->getConditionalSuccessor(true); |
| else if (TakenCount < NonTakenCount) |
| return BB->getConditionalSuccessor(false); |
| } |
| return nullptr; |
| } |
| |
| /// Find the best index for splitting. The returned value is the index of the |
| /// last hot basic block. Hence, "no splitting" is equivalent to returning the |
| /// value which is one less than the size of the function. |
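| /// For example, for a function with 10 blocks, returning 9 (the index of |
| /// the last block) means no splitting. |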
| size_t findSplitIndex(const BinaryFunction &BF, |
| const BasicBlockOrder &BlockOrder) { |
| assert(BlockOrder.size() > 2); |
| // Find all function calls that can be shortened if we move blocks of the |
| // current function to warm/cold. |
| const std::vector<CallInfo> CoverCalls = extractCoverCalls(BF); |
| |
| // Find the existing hot-cold splitting index. |
| size_t HotColdIndex = 0; |
| while (HotColdIndex + 1 < BlockOrder.size()) { |
| if (BlockOrder[HotColdIndex + 1]->getFragmentNum() == FragmentNum::cold()) |
| break; |
| HotColdIndex++; |
| } |
| assert(HotColdIndex + 1 == BlockOrder.size() || |
| (BlockOrder[HotColdIndex]->getFragmentNum() == FragmentNum::main() && |
| BlockOrder[HotColdIndex + 1]->getFragmentNum() == |
| FragmentNum::cold())); |
| |
| // Try all possible split indices up to HotColdIndex (blocks that have |
| // Index <= SplitIndex are in hot) and find the one maximizing the |
| // splitting score. |
| SplitScore BestScore; |
| for (size_t Index = 0; Index <= HotColdIndex; Index++) { |
| const BinaryBasicBlock *LastHotBB = BlockOrder[Index]; |
| assert(LastHotBB->getFragmentNum() != FragmentNum::cold()); |
| |
| // Do not break a jump to the most likely successor. |
| if (Index + 1 < BlockOrder.size() && |
| BlockOrder[Index + 1] == getMostLikelySuccessor(LastHotBB)) |
| continue; |
| |
| const SplitScore Score = |
| computeSplitScore(BF, BlockOrder, Index, CoverCalls); |
| if (Score.sum() > BestScore.sum()) |
| BestScore = Score; |
| } |
| |
| // If we don't find a good splitting point, fall back to the original one. |
| if (BestScore.SplitIndex == size_t(-1)) |
| return HotColdIndex; |
| |
| return BestScore.SplitIndex; |
| } |
| }; |
| |
| struct SplitRandom2 final : public SplitStrategy { |
| std::minstd_rand0 Gen; |
| |
| SplitRandom2() : Gen(opts::RandomSeed.getValue()) {} |
| |
| bool canSplit(const BinaryFunction &BF) override { return true; } |
| |
| bool compactFragments() override { return true; } |
| |
| void fragment(const BlockIt Start, const BlockIt End) override { |
| using DiffT = typename std::iterator_traits<BlockIt>::difference_type; |
| const DiffT NumBlocks = End - Start; |
| assert(NumBlocks > 0 && "Cannot fragment empty function"); |
| |
| // We want to split at least one block. |
| const auto LastSplitPoint = std::max<DiffT>(NumBlocks - 1, 1); |
| std::uniform_int_distribution<DiffT> Dist(1, LastSplitPoint); |
| const DiffT SplitPoint = Dist(Gen); |
| for (BinaryBasicBlock *BB : llvm::make_range(Start + SplitPoint, End)) |
| BB->setFragmentNum(FragmentNum::cold()); |
| |
| LLVM_DEBUG(dbgs() << formatv("BOLT-DEBUG: randomly chose last {0} (out of " |
| "{1} possible) blocks to split\n", |
| NumBlocks - SplitPoint, End - Start)); |
| } |
| }; |
| |
| struct SplitRandomN final : public SplitStrategy { |
| std::minstd_rand0 Gen; |
| |
| SplitRandomN() : Gen(opts::RandomSeed.getValue()) {} |
| |
| bool canSplit(const BinaryFunction &BF) override { return true; } |
| |
| bool compactFragments() override { return true; } |
| |
| void fragment(const BlockIt Start, const BlockIt End) override { |
| using DiffT = typename std::iterator_traits<BlockIt>::difference_type; |
| const DiffT NumBlocks = End - Start; |
| assert(NumBlocks > 0 && "Cannot fragment empty function"); |
| |
| // With n blocks, there are n-1 places to split them. |
| const DiffT MaximumSplits = NumBlocks - 1; |
| // We want to generate at least two fragments if possible, but if there is |
| // only one block, no splits are possible. |
| const auto MinimumSplits = std::min<DiffT>(MaximumSplits, 1); |
| std::uniform_int_distribution<DiffT> Dist(MinimumSplits, MaximumSplits); |
| // Choose how many splits to perform. |
| const DiffT NumSplits = Dist(Gen); |
| |
| // Draw split points from a lottery. |
| SmallVector<unsigned, 0> Lottery(MaximumSplits); |
| // Start the lottery at 1, because there is no meaningful split point |
| // before the first block. |
| std::iota(Lottery.begin(), Lottery.end(), 1u); |
| std::shuffle(Lottery.begin(), Lottery.end(), Gen); |
| Lottery.resize(NumSplits); |
| llvm::sort(Lottery); |
| |
| // Add a one-past-the-end entry to the lottery. |
| Lottery.push_back(NumBlocks); |
| |
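| // For example (hypothetical draw): with NumBlocks = 5, NumSplits = 2, and |
| // sorted Lottery = {2, 4, 5}, blocks 0-1 land in fragment 0, blocks 2-3 |
| // in fragment 1, and block 4 in fragment 2. |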
| unsigned LotteryIndex = 0; |
| unsigned BBPos = 0; |
| for (BinaryBasicBlock *const BB : make_range(Start, End)) { |
| // Check whether to start a new fragment. |
| if (BBPos >= Lottery[LotteryIndex]) |
| ++LotteryIndex; |
| |
| // Because LotteryIndex is 0-based and cold fragments are 1-based, we can |
| // use the index to assign fragments. |
| BB->setFragmentNum(FragmentNum(LotteryIndex)); |
| |
| ++BBPos; |
| } |
| } |
| }; |
| |
| struct SplitAll final : public SplitStrategy { |
| bool canSplit(const BinaryFunction &BF) override { return true; } |
| |
| bool compactFragments() override { |
| // Keeping empty fragments allows us to test that empty fragments do not |
| // generate symbols. |
| return false; |
| } |
| |
| void fragment(const BlockIt Start, const BlockIt End) override { |
| unsigned Fragment = 0; |
| for (BinaryBasicBlock *const BB : llvm::make_range(Start, End)) |
| BB->setFragmentNum(FragmentNum(Fragment++)); |
| } |
| }; |
| } // namespace |
| |
| namespace llvm { |
| namespace bolt { |
| |
| bool SplitFunctions::shouldOptimize(const BinaryFunction &BF) const { |
| // Apply the execution count threshold. |
| if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold) |
| return false; |
| |
| return BinaryFunctionPass::shouldOptimize(BF); |
| } |
| |
| Error SplitFunctions::runOnFunctions(BinaryContext &BC) { |
| if (!opts::SplitFunctions) |
| return Error::success(); |
| |
| if (BC.IsLinuxKernel && BC.BOLTReserved.empty()) { |
| BC.errs() << "BOLT-ERROR: split functions require reserved space in the " |
| "Linux kernel binary\n"; |
| exit(1); |
| } |
| |
| // If split strategy is not CDSplit, then a second run of the pass is not |
| // needed after function reordering. |
| if (BC.HasFinalizedFunctionOrder && |
| opts::SplitStrategy != SplitFunctionsStrategy::CDSplit) |
| return Error::success(); |
| |
| std::unique_ptr<SplitStrategy> Strategy; |
| bool ForceSequential = false; |
| |
| switch (opts::SplitStrategy) { |
| case SplitFunctionsStrategy::CDSplit: |
| // CDSplit runs two splitting passes: hot-cold splitting (SplitProfile2) |
| // before function reordering and hot-warm-cold splitting |
| // (SplitCacheDirected) after function reordering. |
| if (BC.HasFinalizedFunctionOrder) |
| Strategy = std::make_unique<SplitCacheDirected>(BC); |
| else |
| Strategy = std::make_unique<SplitProfile2>(); |
| opts::AggressiveSplitting = true; |
| BC.HasWarmSection = true; |
| break; |
| case SplitFunctionsStrategy::Profile2: |
| Strategy = std::make_unique<SplitProfile2>(); |
| break; |
| case SplitFunctionsStrategy::Random2: |
| Strategy = std::make_unique<SplitRandom2>(); |
| // If we split functions randomly, we need to ensure that across runs with |
| // the same input, we generate random numbers for each function in the same |
| // order. |
| ForceSequential = true; |
| break; |
| case SplitFunctionsStrategy::RandomN: |
| Strategy = std::make_unique<SplitRandomN>(); |
| ForceSequential = true; |
| break; |
| case SplitFunctionsStrategy::All: |
| Strategy = std::make_unique<SplitAll>(); |
| break; |
| } |
| |
| ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) { |
| return !shouldOptimize(BF); |
| }; |
| |
| ParallelUtilities::runOnEachFunction( |
| BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, |
| [&](BinaryFunction &BF) { splitFunction(BF, *Strategy); }, SkipFunc, |
| "SplitFunctions", ForceSequential); |
| |
| if (SplitBytesHot + SplitBytesCold > 0) |
| BC.outs() << "BOLT-INFO: splitting separates " << SplitBytesHot |
| << " hot bytes from " << SplitBytesCold << " cold bytes " |
| << format("(%.2lf%% of split functions is hot).\n", |
| 100.0 * SplitBytesHot / |
| (SplitBytesHot + SplitBytesCold)); |
| return Error::success(); |
| } |
| |
| void SplitFunctions::splitFunction(BinaryFunction &BF, SplitStrategy &S) { |
| if (BF.empty()) |
| return; |
| |
| if (!S.canSplit(BF)) |
| return; |
| |
| FunctionLayout &Layout = BF.getLayout(); |
| BinaryFunction::BasicBlockOrderType PreSplitLayout(Layout.block_begin(), |
| Layout.block_end()); |
| |
| BinaryContext &BC = BF.getBinaryContext(); |
| size_t OriginalHotSize; |
| size_t HotSize; |
| size_t ColdSize; |
| if (BC.isX86()) { |
| std::tie(OriginalHotSize, ColdSize) = BC.calculateEmittedSize(BF); |
| LLVM_DEBUG(dbgs() << "Estimated size for function " << BF |
| << " pre-split is <0x" |
| << Twine::utohexstr(OriginalHotSize) << ", 0x" |
| << Twine::utohexstr(ColdSize) << ">\n"); |
| } |
| |
| BinaryFunction::BasicBlockOrderType NewLayout(Layout.block_begin(), |
| Layout.block_end()); |
| // Never outline the first basic block. |
| NewLayout.front()->setCanOutline(false); |
| for (BinaryBasicBlock *const BB : NewLayout) { |
| if (!BB->canOutline()) |
| continue; |
| |
| // Do not split extra entry points on AArch64. They can be referenced |
| // via ADR instructions, and when this happens, these blocks cannot be |
| // placed far away due to the ADR instruction's limited range. |
| if (BC.isAArch64() && BB->isEntryPoint()) { |
| BB->setCanOutline(false); |
| continue; |
| } |
| |
| if (BF.hasEHRanges() && !opts::SplitEH) { |
| // We cannot move landing pads (or rather entry points for landing pads). |
| if (BB->isLandingPad()) { |
| BB->setCanOutline(false); |
| continue; |
| } |
| // We cannot move a block that can throw since the exception-handling |
| // runtime cannot deal with split functions. However, if we can guarantee |
| // that the block never throws, it is safe to move the block to |
| // decrease the size of the function. |
| for (MCInst &Instr : *BB) { |
| if (BC.MIB->isInvoke(Instr)) { |
| BB->setCanOutline(false); |
| break; |
| } |
| } |
| } |
| |
| // Outlining blocks with dynamic branches is not supported yet. |
| if (BC.IsLinuxKernel) { |
| if (llvm::any_of( |
| *BB, [&](MCInst &Inst) { return BC.MIB->isDynamicBranch(Inst); })) |
| BB->setCanOutline(false); |
| } |
| } |
| |
| BF.getLayout().updateLayoutIndices(); |
| S.fragment(NewLayout.begin(), NewLayout.end()); |
| |
| // Make sure all non-outlineable blocks are in the main fragment. |
| for (BinaryBasicBlock *const BB : NewLayout) { |
| if (!BB->canOutline()) |
| BB->setFragmentNum(FragmentNum::main()); |
| } |
| |
| if (opts::AggressiveSplitting) { |
| // All blocks with 0 count that we can move go to the end of the function, |
| // even if they were natural to cluster formation and were seen in between |
| // hot basic blocks. |
| llvm::stable_sort(NewLayout, [&](const BinaryBasicBlock *const A, |
| const BinaryBasicBlock *const B) { |
| return A->getFragmentNum() < B->getFragmentNum(); |
| }); |
| } else if (BF.hasEHRanges() && !opts::SplitEH) { |
| // Typically, functions with exception handling have landing pads at the |
| // end. We cannot move the beginning of a landing pad, but we can move |
| // 0-count blocks comprising landing pads to the end and thus facilitate |
| // splitting. |
| auto FirstLP = NewLayout.begin(); |
| while ((*FirstLP)->isLandingPad()) |
| ++FirstLP; |
| |
| std::stable_sort(FirstLP, NewLayout.end(), |
| [&](BinaryBasicBlock *A, BinaryBasicBlock *B) { |
| return A->getFragmentNum() < B->getFragmentNum(); |
| }); |
| } |
| |
| // Make sure that fragments are increasing. |
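| // For example, fragment numbers [0, 2, 1, 1] become [0, 1, 1, 1]. |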
| FragmentNum CurrentFragment = NewLayout.back()->getFragmentNum(); |
| for (BinaryBasicBlock *const BB : reverse(NewLayout)) { |
| if (BB->getFragmentNum() > CurrentFragment) |
| BB->setFragmentNum(CurrentFragment); |
| CurrentFragment = BB->getFragmentNum(); |
| } |
| |
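| // Renumber the remaining fragments contiguously: for example, fragment |
| // numbers [0, 0, 3, 5] become [0, 0, 1, 2]. |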
| if (S.compactFragments()) { |
| FragmentNum CurrentFragment = FragmentNum::main(); |
| FragmentNum NewFragment = FragmentNum::main(); |
| for (BinaryBasicBlock *const BB : NewLayout) { |
| if (BB->getFragmentNum() > CurrentFragment) { |
| CurrentFragment = BB->getFragmentNum(); |
| NewFragment = FragmentNum(NewFragment.get() + 1); |
| } |
| BB->setFragmentNum(NewFragment); |
| } |
| } |
| |
| const bool LayoutUpdated = BF.getLayout().update(NewLayout); |
| |
| // For shared objects, invoke instructions and corresponding landing pads |
| // have to be placed in the same fragment. When we split them, create |
| // trampoline landing pads that will redirect the execution to real LPs. |
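| // Schematically: invoke (fragment 0) -> trampoline (fragment 0) |
| // -> jump to the real landing pad (fragment 1). |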
| TrampolineSetType Trampolines; |
| if (BF.hasEHRanges() && BF.isSplit()) { |
| // If all landing pads for this fragment are grouped in one (potentially |
| // different) fragment, we can set LPStart to the start of that fragment |
| // and avoid trampoline code. |
| bool NeedsTrampolines = false; |
| for (FunctionFragment &FF : BF.getLayout().fragments()) { |
| // Vector of fragments that contain landing pads for this fragment. |
| SmallVector<FragmentNum, 4> LandingPadFragments; |
| for (const BinaryBasicBlock *BB : FF) |
| for (const BinaryBasicBlock *LPB : BB->landing_pads()) |
| LandingPadFragments.push_back(LPB->getFragmentNum()); |
| |
| // Eliminate duplicate entries from the vector. |
| llvm::sort(LandingPadFragments); |
| auto Last = llvm::unique(LandingPadFragments); |
| LandingPadFragments.erase(Last, LandingPadFragments.end()); |
| |
| if (LandingPadFragments.size() == 0) { |
| // If the fragment has no landing pads, we can safely set the fragment |
| // itself as its landing pad fragment. |
| BF.setLPFragment(FF.getFragmentNum(), FF.getFragmentNum()); |
| } else if (LandingPadFragments.size() == 1) { |
| BF.setLPFragment(FF.getFragmentNum(), LandingPadFragments.front()); |
| } else { |
| if (!BC.HasFixedLoadAddress) { |
| NeedsTrampolines = true; |
| break; |
| } else { |
| BF.setLPFragment(FF.getFragmentNum(), std::nullopt); |
| } |
| } |
| } |
| |
| // Trampolines guarantee that all landing pads for any given fragment will |
| // be contained in the same fragment. |
| if (NeedsTrampolines) { |
| for (FunctionFragment &FF : BF.getLayout().fragments()) |
| BF.setLPFragment(FF.getFragmentNum(), FF.getFragmentNum()); |
| Trampolines = createEHTrampolines(BF); |
| } |
| } |
| |
| // Check the new size to see if it's worth splitting the function. |
| if (BC.isX86() && LayoutUpdated) { |
| std::tie(HotSize, ColdSize) = BC.calculateEmittedSize(BF); |
| LLVM_DEBUG(dbgs() << "Estimated size for function " << BF |
| << " post-split is <0x" << Twine::utohexstr(HotSize) |
| << ", 0x" << Twine::utohexstr(ColdSize) << ">\n"); |
| if (alignTo(OriginalHotSize, opts::SplitAlignThreshold) <= |
| alignTo(HotSize, opts::SplitAlignThreshold) + opts::SplitThreshold) { |
| if (opts::Verbosity >= 2) { |
| BC.outs() << "BOLT-INFO: Reversing splitting of function " |
| << formatv("{0}:\n {1:x}, {2:x} -> {3:x}\n", BF, HotSize, |
| ColdSize, OriginalHotSize); |
| } |
| |
| // Reverse the action of createEHTrampolines(). The trampolines will be |
| // placed immediately before the matching destination, resulting in no |
| // extra code. |
| if (PreSplitLayout.size() != BF.size()) |
| PreSplitLayout = mergeEHTrampolines(BF, PreSplitLayout, Trampolines); |
| |
| for (BinaryBasicBlock &BB : BF) |
| BB.setFragmentNum(FragmentNum::main()); |
| BF.getLayout().update(PreSplitLayout); |
| } else { |
| SplitBytesHot += HotSize; |
| SplitBytesCold += ColdSize; |
| } |
| } |
| |
| // Restore LP fragment for the main fragment if the splitting was undone. |
| if (BF.hasEHRanges() && !BF.isSplit()) |
| BF.setLPFragment(FragmentNum::main(), FragmentNum::main()); |
| |
| // Fix branches if the splitting decision of the pass after function |
| // reordering is different from that of the pass before function reordering. |
| if (LayoutUpdated && BC.HasFinalizedFunctionOrder) |
| BF.fixBranches(); |
| } |
| |
| SplitFunctions::TrampolineSetType |
| SplitFunctions::createEHTrampolines(BinaryFunction &BF) const { |
| const auto &MIB = BF.getBinaryContext().MIB; |
| |
| // Map real landing pads to the corresponding trampolines. |
| TrampolineSetType LPTrampolines; |
| |
| // Iterate over a copy of the basic blocks since we are adding new blocks |
| // to the function, which would invalidate its iterators. |
| std::vector<BinaryBasicBlock *> Blocks(BF.pbegin(), BF.pend()); |
| for (BinaryBasicBlock *BB : Blocks) { |
| for (MCInst &Instr : *BB) { |
| const std::optional<MCPlus::MCLandingPad> EHInfo = MIB->getEHInfo(Instr); |
| if (!EHInfo || !EHInfo->first) |
| continue; |
| |
| const MCSymbol *LPLabel = EHInfo->first; |
| BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(LPLabel); |
| if (BB->getFragmentNum() == LPBlock->getFragmentNum()) |
| continue; |
| |
| const MCSymbol *TrampolineLabel = nullptr; |
| const TrampolineKey Key(BB->getFragmentNum(), LPLabel); |
| auto Iter = LPTrampolines.find(Key); |
| if (Iter != LPTrampolines.end()) { |
| TrampolineLabel = Iter->second; |
| } else { |
| // Create a trampoline basic block in the same fragment as the thrower. |
| // Note: there's no need to insert the jump instruction, it will be |
| // added by fixBranches(). |
| BinaryBasicBlock *TrampolineBB = BF.addBasicBlock(); |
| TrampolineBB->setFragmentNum(BB->getFragmentNum()); |
| TrampolineBB->setExecutionCount(LPBlock->getExecutionCount()); |
| TrampolineBB->addSuccessor(LPBlock, TrampolineBB->getExecutionCount()); |
| TrampolineBB->setCFIState(LPBlock->getCFIState()); |
| TrampolineLabel = TrampolineBB->getLabel(); |
| LPTrampolines.insert(std::make_pair(Key, TrampolineLabel)); |
| } |
| |
| // Substitute the landing pad with the trampoline. |
| MIB->updateEHInfo(Instr, |
| MCPlus::MCLandingPad(TrampolineLabel, EHInfo->second)); |
| } |
| } |
| |
| if (LPTrampolines.empty()) |
| return LPTrampolines; |
| |
| // All trampoline blocks were added to the end of the function. Place them |
| // at the end of their corresponding fragments. |
| BinaryFunction::BasicBlockOrderType NewLayout(BF.getLayout().block_begin(), |
| BF.getLayout().block_end()); |
| stable_sort(NewLayout, [&](BinaryBasicBlock *A, BinaryBasicBlock *B) { |
| return A->getFragmentNum() < B->getFragmentNum(); |
| }); |
| BF.getLayout().update(NewLayout); |
| |
| // Conservatively introduce branch instructions. |
| BF.fixBranches(); |
| |
| // Update exception-handling CFG for the function. |
| BF.recomputeLandingPads(); |
| |
| return LPTrampolines; |
| } |
| |
| SplitFunctions::BasicBlockOrderType SplitFunctions::mergeEHTrampolines( |
| BinaryFunction &BF, SplitFunctions::BasicBlockOrderType &Layout, |
| const SplitFunctions::TrampolineSetType &Trampolines) const { |
| DenseMap<const MCSymbol *, SmallVector<const MCSymbol *, 0>> |
| IncomingTrampolines; |
| for (const auto &Entry : Trampolines) { |
| IncomingTrampolines[Entry.getFirst().Target].emplace_back( |
| Entry.getSecond()); |
| } |
| |
| BasicBlockOrderType MergedLayout; |
| for (BinaryBasicBlock *BB : Layout) { |
| auto Iter = IncomingTrampolines.find(BB->getLabel()); |
| if (Iter != IncomingTrampolines.end()) { |
| for (const MCSymbol *const Trampoline : Iter->getSecond()) { |
| BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(Trampoline); |
| assert(LPBlock && "Could not find matching landing pad block."); |
| MergedLayout.push_back(LPBlock); |
| } |
| } |
| MergedLayout.push_back(BB); |
| } |
| |
| return MergedLayout; |
| } |
| |
| } // namespace bolt |
| } // namespace llvm |