Skip to content

[CodeGen][NPM] Support CodeGenSCCOrder in pipeline #136818

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 78 additions & 21 deletions llvm/include/llvm/Passes/CodeGenPassBuilder.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScopedNoAliasAA.h"
#include "llvm/Analysis/TargetTransformInfo.h"
Expand Down Expand Up @@ -210,10 +211,7 @@ template <typename DerivedT, typename TargetMachineT> class CodeGenPassBuilder {
class AddIRPass {
public:
AddIRPass(ModulePassManager &MPM, const DerivedT &PB) : MPM(MPM), PB(PB) {}
~AddIRPass() {
if (!FPM.isEmpty())
MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
}
~AddIRPass() { flushFPMToMPM(); }

template <typename PassT>
void operator()(PassT &&Pass, StringRef Name = PassT::name()) {
Expand All @@ -231,16 +229,40 @@ template <typename DerivedT, typename TargetMachineT> class CodeGenPassBuilder {
FPM.addPass(std::forward<PassT>(Pass));
} else {
// Add Module Pass
if (!FPM.isEmpty()) {
MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
FPM = FunctionPassManager();
}

flushFPMToMPM();
MPM.addPass(std::forward<PassT>(Pass));
}
}

/// Switch this builder into CGSCC ordering.
///
/// Function passes that are still pending in FPM are flushed to MPM first,
/// while the flag is still clear, so they keep their plain function adaptor.
/// Afterwards PB.AddInCGSCCOrder is set, so later flushes wrap function
/// passes in a post-order CGSCC adaptor. Does nothing if the flag is already
/// set.
void requireCGSCCOrder() {
  if (!PB.AddInCGSCCOrder) {
    flushFPMToMPM();
    PB.AddInCGSCCOrder = true;
  }
}

/// Leave CGSCC ordering.
///
/// Pending function passes are flushed to MPM while the flag is still set,
/// so they are emitted under the CGSCC adaptor; passes already handed to MPM
/// are left untouched. PB.AddInCGSCCOrder is then cleared. Does nothing if
/// the flag is already clear.
void stopAddingInCGSCCOrder() {
  if (PB.AddInCGSCCOrder) {
    flushFPMToMPM();
    PB.AddInCGSCCOrder = false;
  }
}

private:
/// Move any pending function passes from FPM into MPM and reset FPM.
///
/// When PB.AddInCGSCCOrder is set, the function pass manager is wrapped in a
/// CGSCC-to-function adaptor nested in a module-to-post-order-CGSCC adaptor;
/// otherwise it is wrapped in a plain module-to-function adaptor. An empty
/// FPM is left alone and nothing is added to MPM.
void flushFPMToMPM() {
  if (!FPM.isEmpty()) {
    if (PB.AddInCGSCCOrder)
      MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
          createCGSCCToFunctionPassAdaptor(std::move(FPM))));
    else
      MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
    // FPM was moved from above; give it a fresh, empty manager.
    FPM = FunctionPassManager();
  }
}
ModulePassManager &MPM;
FunctionPassManager FPM;
const DerivedT &PB;
Expand All @@ -252,13 +274,17 @@ template <typename DerivedT, typename TargetMachineT> class CodeGenPassBuilder {
AddMachinePass(ModulePassManager &MPM, const DerivedT &PB)
: MPM(MPM), PB(PB) {}
~AddMachinePass() {
if (!MFPM.isEmpty()) {
FunctionPassManager FPM;
FPM.addPass(
createFunctionToMachineFunctionPassAdaptor(std::move(MFPM)));
FPM.addPass(InvalidateAnalysisPass<MachineFunctionAnalysis>());
if (MFPM.isEmpty())
return;

FunctionPassManager FPM;
FPM.addPass(createFunctionToMachineFunctionPassAdaptor(std::move(MFPM)));
FPM.addPass(InvalidateAnalysisPass<MachineFunctionAnalysis>());
if (this->PB.AddInCGSCCOrder) {
MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
createCGSCCToFunctionPassAdaptor(std::move(FPM))));
} else
MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
}
}

template <typename PassT>
Expand All @@ -276,20 +302,47 @@ template <typename DerivedT, typename TargetMachineT> class CodeGenPassBuilder {
MFPM.addPass(std::forward<PassT>(Pass));
} else {
// Add Module Pass
if (!MFPM.isEmpty()) {
MPM.addPass(createModuleToFunctionPassAdaptor(
createFunctionToMachineFunctionPassAdaptor(std::move(MFPM))));
MFPM = MachineFunctionPassManager();
}

flushMFPMToMPM();
MPM.addPass(std::forward<PassT>(Pass));
}

for (auto &C : PB.AfterCallbacks)
C(Name, MFPM);
}

/// Switch this builder into CGSCC ordering.
///
/// Machine function passes that are still pending in MFPM are flushed to MPM
/// first, while the flag is still clear, so they keep the plain function
/// adaptor. Afterwards PB.AddInCGSCCOrder is set, so later flushes nest the
/// machine function passes under a post-order CGSCC adaptor. Does nothing if
/// the flag is already set.
void requireCGSCCOrder() {
  if (!PB.AddInCGSCCOrder) {
    flushMFPMToMPM();
    PB.AddInCGSCCOrder = true;
  }
}

/// Leave CGSCC ordering.
///
/// Pending machine function passes are flushed to MPM while the flag is
/// still set, so they are emitted under the CGSCC adaptor; passes already
/// handed to MPM are left untouched. PB.AddInCGSCCOrder is then cleared.
/// Does nothing if the flag is already clear.
void stopAddingInCGSCCOrder() {
  if (PB.AddInCGSCCOrder) {
    flushMFPMToMPM();
    PB.AddInCGSCCOrder = false;
  }
}

private:
/// Move any pending machine function passes from MFPM into MPM and reset
/// MFPM.
///
/// MFPM is always wrapped in a function-to-machine-function adaptor. When
/// PB.AddInCGSCCOrder is set, that adaptor is nested inside a
/// CGSCC-to-function adaptor and a module-to-post-order-CGSCC adaptor;
/// otherwise it is nested inside a plain module-to-function adaptor. An
/// empty MFPM is left alone and nothing is added to MPM.
void flushMFPMToMPM() {
  if (!MFPM.isEmpty()) {
    if (PB.AddInCGSCCOrder)
      MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
          createCGSCCToFunctionPassAdaptor(
              createFunctionToMachineFunctionPassAdaptor(std::move(MFPM)))));
    else
      MPM.addPass(createModuleToFunctionPassAdaptor(
          createFunctionToMachineFunctionPassAdaptor(std::move(MFPM))));
    // MFPM was moved from above; give it a fresh, empty manager.
    MFPM = MachineFunctionPassManager();
  }
}

ModulePassManager &MPM;
MachineFunctionPassManager MFPM;
const DerivedT &PB;
Expand Down Expand Up @@ -555,6 +608,7 @@ template <typename DerivedT, typename TargetMachineT> class CodeGenPassBuilder {
/// Helper variable for `-start-before/-start-after/-stop-before/-stop-after`
mutable bool Started = true;
mutable bool Stopped = true;
mutable bool AddInCGSCCOrder = false;
};

template <typename Derived, typename TargetMachineT>
Expand Down Expand Up @@ -813,6 +867,9 @@ void CodeGenPassBuilder<Derived, TargetMachineT>::addISelPrepare(
AddIRPass &addPass) const {
derived().addPreISel(addPass);

if (Opt.RequiresCodeGenSCCOrder)
addPass.requireCGSCCOrder();

addPass(CallBrPreparePass());
// Add both the safe stack and the stack protection passes: each of them will
// only protect functions that have corresponding attributes.
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2079,6 +2079,8 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
// being run on them, which causes crashes in the resource usage analysis).
addPass(AMDGPULowerBufferFatPointersPass(TM));

addPass.requireCGSCCOrder();

Base::addCodeGenPrepare(addPass);

if (isPassEnabled(EnableLoadStoreVectorizer))
Expand Down
21 changes: 21 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
; RUN: llc -enable-new-pm -mtriple=amdgcn--amdhsa -O0 -print-pipeline-passes < %s 2>&1 \
; RUN: | FileCheck -check-prefix=GCN-O0 %s

; RUN: llc -enable-new-pm -mtriple=amdgcn--amdhsa -print-pipeline-passes < %s 2>&1 \
; RUN: | FileCheck -check-prefix=GCN-O2 %s

; RUN: llc -O3 -enable-new-pm -mtriple=amdgcn--amdhsa -print-pipeline-passes < %s 2>&1 \
; RUN: | FileCheck -check-prefix=GCN-O3 %s


; GCN-O0: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,UnreachableBlockElimPass,ee-instrument<post-inline>,scalarize-masked-mem-intrin,ExpandReductionsPass,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(lower-switch,lower-invoke,UnreachableBlockElimPass,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require<uniformity>,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,AMDGPUWaitSGPRHazardsPass,branch-relaxation,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),invalidate<machine-function-info>))


; GCN-O2: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(infer-address-spaces,amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,verify,loop-mssa(loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,UnreachableBlockElimPass,consthoist,ReplaceWithVeclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,ExpandReductionsPass,early-cse<>,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,UnreachableBlockElimPass,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions,detect-dead-lanes,InitUndefPass,ProcessImplicitDefsPass,unreachable-mbb-elimination,require<live-vars>,require<machine-loops>,phi-node-elimination,two-address-instruction,register-coalescer,rename-independent-subregs,machine-scheduler,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,PostRAMachineSinkingPass,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,AMDGPUWaitSGPRHazardsPass,amdgpu-insert-delay-alu,branch-relaxation,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),invalidate<machine-function-info>))

; GCN-O3: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(infer-address-spaces,amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,verify,loop-mssa(loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,UnreachableBlockElimPass,consthoist,ReplaceWithVeclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,ExpandReductionsPass,gvn<>,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,UnreachableBlockElimPass,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions,detect-dead-lanes,InitUndefPass,ProcessImplicitDefsPass,unreachable-mbb-elimination,require<live-vars>,require<machine-loops>,phi-node-elimination,two-address-instruction,register-coalescer,rename-independent-subregs,machine-scheduler,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,PostRAMachineSinkingPass,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,AMDGPUWaitSGPRHazardsPass,amdgpu-insert-delay-alu,branch-relaxation,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),invalidate<machine-function-info>))


define void @empty() {
ret void
}
Loading